Improve section detection with heading styles + fallback

- Primary: Detect sections via Heading 1 styles (structured)
- Fallback: Detect chapters via "Chapter X" text patterns
- Add text_patterns_only flag to skip heading styles (for messy docs)

This handles both well-structured business documents (manuals, PRDs) and narrative content (books with explicit chapter headings).
2026-01-11 09:40:38 -07:00 · 2026-01-11 09:40:38 -07:00 · 89ad0c849d
commit 89ad0c849d
parent d569034fa3
2 changed files with 73 additions and 33 deletions
--- a/reports/test_results.json
+++ b/reports/test_results.json
@ -1,9 +1,9 @@
 {
  "metadata": {
-    "start_time": "2026-01-11T07:15:14.417108",
+    "start_time": "2026-01-11T09:40:29.164041",
    "pytest_version": "9.0.2",
-    "end_time": "2026-01-11T07:15:15.173732",
-    "duration": 0.7566196918487549,
+    "end_time": "2026-01-11T09:40:30.048909",
+    "duration": 0.8848621845245361,
    "exit_status": 0
  },
  "summary": {
--- a/src/mcp_office_tools/mixins/universal.py
+++ b/src/mcp_office_tools/mixins/universal.py
@ -353,7 +353,8 @@ class UniversalMixin(MCPMixin):
        include_images: bool = Field(default=True, description="Index embedded images"),
        include_chapters: bool = Field(default=True, description="Index chapters/sections (Word docs)"),
        include_sheets: bool = Field(default=True, description="Index sheets (Excel docs)"),
-        include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)")
+        include_slides: bool = Field(default=True, description="Index slides (PowerPoint docs)"),
+        text_patterns_only: bool = Field(default=False, description="Ignore heading styles, detect chapters by 'Chapter X' text patterns only")
    ) -> dict[str, Any]:
        """Scan document and populate resource store with available content.

@ -415,7 +416,7 @@ class UniversalMixin(MCPMixin):
        # Index chapters (Word documents)
        if include_chapters and category == "word":
            try:
-                chapters = await self._index_word_chapters(local_path, doc_id)
+                chapters = await self._index_word_chapters(local_path, doc_id, text_patterns_only)
                indexed["resources"]["chapter"] = chapters
            except Exception as e:
                indexed["resources"]["chapter"] = {"error": str(e)}
@ -444,23 +445,39 @@ class UniversalMixin(MCPMixin):

        return indexed

-    async def _index_word_chapters(self, file_path: str, doc_id: str) -> list[dict]:
-        """Extract and index chapters from a Word document."""
+    async def _index_word_chapters(self, file_path: str, doc_id: str, text_patterns_only: bool = False) -> list[dict]:
+        """Extract and index chapters/sections from a Word document.
+
+        Detection strategy (in order):
+        1. Primary: Heading 1 styles (structured, reliable) → section://doc/N
+        2. Fallback: "Chapter X" text pattern (books, manuscripts) → chapter://doc/N
+
+        If text_patterns_only=True, skips heading styles and uses only text patterns.
+        """
        import re
        from docx import Document

        doc = Document(file_path)
        chapters = []
-        current_chapter = None
+        current_section = None
        current_paragraphs = []
-        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
+        section_num = 0

-        def save_chapter():
-            nonlocal current_chapter, current_paragraphs
-            if current_chapter is not None:
+        # Detection patterns
+        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
+        heading_styles = {'Heading 1', 'Heading1', 'Title', 'Titre', 'Überschrift 1'}
+
+        def is_heading(para) -> bool:
+            """Check if paragraph is a heading style."""
+            style_name = para.style.name if para.style else ''
+            return style_name in heading_styles or style_name.startswith('Heading 1')
+
+        def save_section(resource_type: str = "chapter"):
+            nonlocal current_section, current_paragraphs, section_num
+            if current_section is not None and current_paragraphs:
                # Convert to markdown
                markdown_lines = []
-                markdown_lines.append(f"# {current_chapter['title']}\n")
+                markdown_lines.append(f"# {current_section['title']}\n")
                for para in current_paragraphs:
                    text = para.strip()
                    if text:
@ -469,11 +486,11 @@ class UniversalMixin(MCPMixin):
                content = "\n".join(markdown_lines)

                resource = EmbeddedResource(
-                    resource_id=str(current_chapter["number"]),
-                    resource_type="chapter",
+                    resource_id=str(current_section["number"]),
+                    resource_type=resource_type,
                    mime_type="text/markdown",
                    data=content,
-                    name=current_chapter["title"],
+                    name=current_section["title"],
                    metadata={
                        "word_count": len(content.split()),
                        "paragraph_count": len(current_paragraphs)
@ -482,28 +499,51 @@ class UniversalMixin(MCPMixin):
                resource_store.store(doc_id, resource, file_path)

                chapters.append({
-                    "id": str(current_chapter["number"]),
-                    "title": current_chapter["title"],
-                    "uri": f"chapter://{doc_id}/{current_chapter['number']}",
+                    "id": str(current_section["number"]),
+                    "title": current_section["title"],
+                    "uri": f"{resource_type}://{doc_id}/{current_section['number']}",
                    "word_count": len(content.split())
                })

-        for para in doc.paragraphs:
-            text = para.text.strip()
-            match = chapter_pattern.match(text)
+        # Primary: detect by Heading 1 styles (structured, reliable)
+        # Skip if text_patterns_only=True (for messy docs with inconsistent styles)
+        if not text_patterns_only:
+            for para in doc.paragraphs:
+                text = para.text.strip()

-            if match:
-                save_chapter()
-                current_chapter = {
-                    "number": int(match.group(1)),
-                    "title": text[:100]
-                }
-                current_paragraphs = []
-            elif current_chapter is not None:
-                current_paragraphs.append(text)
+                if is_heading(para) and text:
+                    save_section("section")
+                    section_num += 1
+                    current_section = {
+                        "number": section_num,
+                        "title": text[:100]
+                    }
+                    current_paragraphs = []
+                elif current_section is not None:
+                    current_paragraphs.append(text)

-        # Save last chapter
-        save_chapter()
+            save_section("section")
+
+        # Fallback: try "Chapter X" text pattern (for docs without heading styles)
+        if not chapters:
+            current_section = None
+            current_paragraphs = []
+
+            for para in doc.paragraphs:
+                text = para.text.strip()
+                match = chapter_pattern.match(text)
+
+                if match:
+                    save_section("chapter")
+                    current_section = {
+                        "number": int(match.group(1)),
+                        "title": text[:100]
+                    }
+                    current_paragraphs = []
+                elif current_section is not None:
+                    current_paragraphs.append(text)
+
+            save_section("chapter")

        return chapters