Improve section detection with heading styles + fallback
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run

- Primary: Detect sections via Heading 1 styles (structured)
- Fallback: Detect chapters via "Chapter X" text patterns
- Add text_patterns_only flag to skip heading-style detection and use only "Chapter X" text patterns (for messy docs with inconsistent styles)

This handles both well-structured business documents (manuals, PRDs)
and narrative content (books with explicit chapter headings).
This commit is contained in:
Ryan Malloy 2026-01-11 09:40:38 -07:00
parent d569034fa3
commit 89ad0c849d
2 changed files with 73 additions and 33 deletions

View File

@ -1,9 +1,9 @@
{
"metadata": {
"start_time": "2026-01-11T07:15:14.417108",
"start_time": "2026-01-11T09:40:29.164041",
"pytest_version": "9.0.2",
"end_time": "2026-01-11T07:15:15.173732",
"duration": 0.7566196918487549,
"end_time": "2026-01-11T09:40:30.048909",
"duration": 0.8848621845245361,
"exit_status": 0
},
"summary": {

View File

@ -353,7 +353,8 @@ class UniversalMixin(MCPMixin):
include_images: bool = Field(default=True, description="Index embedded images"),
include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)")
include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)"),
text_patterns_only: bool = Field(default=False, description="Ignore heading styles, detect chapters by 'Chapter X' text patterns only")
) -> dict[str, Any]:
"""Scan document and populate resource store with available content.
@ -415,7 +416,7 @@ class UniversalMixin(MCPMixin):
# Index chapters (Word documents)
if include_chapters and category == "word":
try:
chapters = await self._index_word_chapters(local_path, doc_id)
chapters = await self._index_word_chapters(local_path, doc_id, text_patterns_only)
indexed["resources"]["chapter"] = chapters
except Exception as e:
indexed["resources"]["chapter"] = {"error": str(e)}
@ -444,23 +445,39 @@ class UniversalMixin(MCPMixin):
return indexed
async def _index_word_chapters(self, file_path: str, doc_id: str) -> list[dict]:
"""Extract and index chapters from a Word document."""
async def _index_word_chapters(self, file_path: str, doc_id: str, text_patterns_only: bool = False) -> list[dict]:
"""Extract and index chapters/sections from a Word document.
Detection strategy (in order):
1. Primary: Heading 1 styles (structured, reliable) section://doc/N
2. Fallback: "Chapter X" text pattern (books, manuscripts) chapter://doc/N
If text_patterns_only=True, skips heading styles and uses only text patterns.
"""
import re
from docx import Document
doc = Document(file_path)
chapters = []
current_chapter = None
current_section = None
current_paragraphs = []
chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
section_num = 0
def save_chapter():
nonlocal current_chapter, current_paragraphs
if current_chapter is not None:
# Detection patterns
chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
heading_styles = {'Heading 1', 'Heading1', 'Title', 'Titre', 'Überschrift 1'}
def is_heading(para) -> bool:
"""Check if paragraph is a heading style."""
style_name = para.style.name if para.style else ''
return style_name in heading_styles or style_name.startswith('Heading 1')
def save_section(resource_type: str = "chapter"):
nonlocal current_section, current_paragraphs, section_num
if current_section is not None and current_paragraphs:
# Convert to markdown
markdown_lines = []
markdown_lines.append(f"# {current_chapter['title']}\n")
markdown_lines.append(f"# {current_section['title']}\n")
for para in current_paragraphs:
text = para.strip()
if text:
@ -469,11 +486,11 @@ class UniversalMixin(MCPMixin):
content = "\n".join(markdown_lines)
resource = EmbeddedResource(
resource_id=str(current_chapter["number"]),
resource_type="chapter",
resource_id=str(current_section["number"]),
resource_type=resource_type,
mime_type="text/markdown",
data=content,
name=current_chapter["title"],
name=current_section["title"],
metadata={
"word_count": len(content.split()),
"paragraph_count": len(current_paragraphs)
@ -482,28 +499,51 @@ class UniversalMixin(MCPMixin):
resource_store.store(doc_id, resource, file_path)
chapters.append({
"id": str(current_chapter["number"]),
"title": current_chapter["title"],
"uri": f"chapter://{doc_id}/{current_chapter['number']}",
"id": str(current_section["number"]),
"title": current_section["title"],
"uri": f"{resource_type}://{doc_id}/{current_section['number']}",
"word_count": len(content.split())
})
for para in doc.paragraphs:
text = para.text.strip()
match = chapter_pattern.match(text)
# Primary: detect by Heading 1 styles (structured, reliable)
# Skip if text_patterns_only=True (for messy docs with inconsistent styles)
if not text_patterns_only:
for para in doc.paragraphs:
text = para.text.strip()
if match:
save_chapter()
current_chapter = {
"number": int(match.group(1)),
"title": text[:100]
}
current_paragraphs = []
elif current_chapter is not None:
current_paragraphs.append(text)
if is_heading(para) and text:
save_section("section")
section_num += 1
current_section = {
"number": section_num,
"title": text[:100]
}
current_paragraphs = []
elif current_section is not None:
current_paragraphs.append(text)
# Save last chapter
save_chapter()
save_section("section")
# Fallback: try "Chapter X" text pattern (for docs without heading styles)
if not chapters:
current_section = None
current_paragraphs = []
for para in doc.paragraphs:
text = para.text.strip()
match = chapter_pattern.match(text)
if match:
save_section("chapter")
current_section = {
"number": int(match.group(1)),
"title": text[:100]
}
current_paragraphs = []
elif current_section is not None:
current_paragraphs.append(text)
save_section("chapter")
return chapters