Add document navigation tools: outline, style check, search
New tools for easier document navigation: - get_document_outline: Structured view of headings with chapter detection - check_style_consistency: Find formatting issues and missing chapters - search_document: Search with context and chapter location All tools tested with 200+ page manuscript. Detects issues like Chapter 3 being styled as "normal" instead of "Heading 1".
This commit is contained in:
parent
34e636e782
commit
1abce7f26d
@ -1,9 +1,9 @@
|
||||
{
|
||||
"metadata": {
|
||||
"start_time": "2026-01-11T05:19:25.816074",
|
||||
"start_time": "2026-01-11T07:15:14.417108",
|
||||
"pytest_version": "9.0.2",
|
||||
"end_time": "2026-01-11T05:19:26.468770",
|
||||
"duration": 0.6526906490325928,
|
||||
"end_time": "2026-01-11T07:15:15.173732",
|
||||
"duration": 0.7566196918487549,
|
||||
"exit_status": 0
|
||||
},
|
||||
"summary": {
|
||||
|
||||
@ -634,4 +634,369 @@ class WordMixin(MCPMixin):
|
||||
|
||||
stack.append(node)
|
||||
|
||||
return tree
|
||||
return tree
|
||||
|
||||
# ==================== New Document Navigation Tools ====================
|
||||
|
||||
@mcp_tool(
    name="get_document_outline",
    description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
)
@handle_office_errors("Document outline")
async def get_document_outline(
    self,
    file_path: str = Field(description="Path to Word document or URL"),
    include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
    detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
) -> dict[str, Any]:
    """Extract a structured outline of a Word document with chapter detection.

    Walks every paragraph once: heading/title-styled paragraphs become outline
    entries (truncated text, level, style name, paragraph index); body text
    between headings is attributed to the preceding entry as a word count when
    *include_word_counts* is set; headings containing chapter-like keywords are
    flagged and, where possible, assigned a numeric chapter number so gaps in
    the 1..max sequence can be reported.

    Returns:
        dict with "outline" (list of heading entries), "summary" statistics
        (counts, chapter numbers, missing chapters), and "extraction_time".

    Raises:
        OfficeFileError: if the resolved file fails validation.
    """
    import re

    from docx import Document

    start_time = time.time()
    local_path = await resolve_office_file_path(file_path)

    validation = await validate_office_file(local_path)
    if not validation["is_valid"]:
        raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

    doc = Document(local_path)

    outline = []
    current_section = None
    section_word_count = 0
    total_words = 0
    # Keywords that suggest a heading introduces a chapter-like division.
    chapter_keywords = ["chapter", "section", "part", "introduction", "conclusion", "appendix", "preface", "epilogue"]
    # Compiled once here; the original re-imported `re` and re-compiled these
    # inside the paragraph loop on every heading.
    heading_level_re = re.compile(r'heading\s*(\d+)')
    chapter_number_re = re.compile(r'(?:chapter|section|part)\s*(\d+)')

    for para_idx, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        word_count = len(text.split()) if text else 0
        total_words += word_count

        # Check if this is a heading
        style_name = para.style.name.lower() if para.style else ""
        is_heading = "heading" in style_name or "title" in style_name

        # Determine heading level: 0 = title, 1-3 matched explicitly,
        # otherwise parsed from the style name (fallback 4).
        level = 0
        if is_heading:
            if "title" in style_name:
                level = 0
            elif "heading 1" in style_name or style_name == "heading1":
                level = 1
            elif "heading 2" in style_name or style_name == "heading2":
                level = 2
            elif "heading 3" in style_name or style_name == "heading3":
                level = 3
            elif "heading" in style_name:
                # Try to extract the numeric level from the style name.
                match = heading_level_re.search(style_name)
                level = int(match.group(1)) if match else 4

        if is_heading and text:
            # Save the previous section's accumulated word count.
            if current_section is not None and include_word_counts:
                current_section["word_count"] = section_word_count

            # Detect whether this heading looks like a chapter division.
            is_chapter = False
            chapter_number = None
            if detect_chapters:
                text_lower = text.lower()
                for keyword in chapter_keywords:
                    if keyword in text_lower:
                        is_chapter = True
                        # Try to extract an explicit chapter number.
                        match = chapter_number_re.search(text_lower)
                        if match:
                            chapter_number = int(match.group(1))
                        break

            current_section = {
                "text": text[:150] + ("..." if len(text) > 150 else ""),
                "level": level,
                "style": para.style.name if para.style else "Unknown",
                "paragraph_index": para_idx,
                "is_chapter": is_chapter
            }

            if chapter_number is not None:
                current_section["chapter_number"] = chapter_number

            outline.append(current_section)
            section_word_count = 0
        else:
            section_word_count += word_count

    # Don't forget the last section's word count.
    if current_section is not None and include_word_counts:
        current_section["word_count"] = section_word_count

    # Build summary statistics
    chapters = [item for item in outline if item.get("is_chapter")]
    chapter_numbers = [c.get("chapter_number") for c in chapters if c.get("chapter_number")]

    # Detect gaps in the 1..max chapter-number sequence.
    missing_chapters = []
    if chapter_numbers:
        expected = set(range(1, max(chapter_numbers) + 1))
        found = set(chapter_numbers)
        missing_chapters = sorted(expected - found)

    return {
        "outline": outline,
        "summary": {
            "total_headings": len(outline),
            "chapters_found": len(chapters),
            "chapter_numbers": chapter_numbers,
            "missing_chapters": missing_chapters,
            "total_words": total_words,
            "total_paragraphs": len(doc.paragraphs)
        },
        "extraction_time": round(time.time() - start_time, 3)
    }
|
||||
|
||||
@mcp_tool(
    name="check_style_consistency",
    description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
)
@handle_office_errors("Style consistency check")
async def check_style_consistency(
    self,
    file_path: str = Field(description="Path to Word document or URL")
) -> dict[str, Any]:
    """Check a Word document for style and formatting consistency issues.

    Scans every paragraph for "Chapter N" lines, recording which style each
    uses; chapters not using a Heading style become issues, gaps and
    duplicates in the chapter-number sequence become issues, and short
    heading-like lines (e.g. "Introduction") that are not heading-styled
    become warnings. A health score summarizes the findings.

    Returns:
        dict with "issues", "warnings", "chapter_analysis", "style_usage",
        "health_score", and "analysis_time".

    Raises:
        OfficeFileError: if the resolved file fails validation.
    """
    import re
    from collections import Counter

    from docx import Document

    start_time = time.time()
    local_path = await resolve_office_file_path(file_path)

    validation = await validate_office_file(local_path)
    if not validation["is_valid"]:
        raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

    doc = Document(local_path)

    issues = []
    warnings = []

    # Track heading styles and chapter detection
    heading_styles = {}
    chapters_by_style = {"heading": [], "other": []}
    chapter_numbers_found = []

    chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)

    for para_idx, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style_name = para.style.name if para.style else "None"
        style_lower = style_name.lower()

        # Track style usage (every style, filtered to headings at the end)
        heading_styles[style_name] = heading_styles.get(style_name, 0) + 1

        # Check for chapter-like text
        chapter_match = chapter_pattern.match(text)
        if chapter_match:
            chapter_num = int(chapter_match.group(1))
            chapter_numbers_found.append(chapter_num)

            is_heading_style = "heading" in style_lower

            if is_heading_style:
                chapters_by_style["heading"].append({
                    "chapter": chapter_num,
                    "text": text[:80],
                    "style": style_name,
                    "paragraph": para_idx
                })
            else:
                chapters_by_style["other"].append({
                    "chapter": chapter_num,
                    "text": text[:80],
                    "style": style_name,
                    "paragraph": para_idx
                })
                issues.append({
                    "type": "inconsistent_chapter_style",
                    "severity": "warning",
                    "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style",
                    "paragraph": para_idx,
                    "text": text[:80]
                })

        # Check for potential headings that aren't styled as headings
        if text and len(text) < 100 and not text.endswith('.'):
            is_heading_style = "heading" in style_lower or "title" in style_lower
            looks_like_heading = any(word in text.lower() for word in
                ["chapter", "section", "part", "introduction", "conclusion", "appendix"])

            # Mis-styled "Chapter N" lines were already reported as an
            # inconsistent_chapter_style issue above; skip them here so the
            # same paragraph is not double-reported (which also double-counted
            # against the health score).
            if looks_like_heading and not is_heading_style and chapter_match is None:
                warnings.append({
                    "type": "potential_heading_not_styled",
                    "message": f"Text looks like a heading but uses '{style_name}' style",
                    "paragraph": para_idx,
                    "text": text[:80]
                })

    # Check for missing chapters in sequence
    missing_chapters = []
    if chapter_numbers_found:
        chapter_numbers_found.sort()
        expected = set(range(1, max(chapter_numbers_found) + 1))
        found = set(chapter_numbers_found)
        missing_chapters = sorted(expected - found)

        for missing in missing_chapters:
            issues.append({
                "type": "missing_chapter",
                "severity": "error",
                "message": f"Chapter {missing} appears to be missing from sequence",
                "expected_between": f"Chapter {missing-1} and Chapter {missing+1}" if missing > 1 else f"Before Chapter {missing+1}"
            })

    # Check for duplicate chapter numbers
    chapter_counts = Counter(chapter_numbers_found)
    duplicates = {num: count for num, count in chapter_counts.items() if count > 1}
    for chapter_num, count in duplicates.items():
        issues.append({
            "type": "duplicate_chapter",
            "severity": "warning",
            "message": f"Chapter {chapter_num} appears {count} times"
        })

    # Summary of heading style usage
    heading_summary = {k: v for k, v in heading_styles.items()
                       if "heading" in k.lower() or "title" in k.lower()}

    return {
        "issues": issues,
        "warnings": warnings,
        "chapter_analysis": {
            "total_chapters": len(chapter_numbers_found),
            "chapters_with_heading_style": len(chapters_by_style["heading"]),
            "chapters_without_heading_style": len(chapters_by_style["other"]),
            "missing_chapters": missing_chapters,
            "duplicate_chapters": list(duplicates.keys()),
            "chapter_details": chapters_by_style
        },
        "style_usage": heading_summary,
        "health_score": self._calculate_doc_health_score(issues, warnings),
        "analysis_time": round(time.time() - start_time, 3)
    }
|
||||
|
||||
def _calculate_doc_health_score(self, issues: list, warnings: list) -> dict:
|
||||
"""Calculate document health score based on issues found."""
|
||||
score = 100
|
||||
|
||||
for issue in issues:
|
||||
if issue.get("severity") == "error":
|
||||
score -= 10
|
||||
elif issue.get("severity") == "warning":
|
||||
score -= 5
|
||||
|
||||
for _ in warnings:
|
||||
score -= 2
|
||||
|
||||
score = max(0, min(100, score))
|
||||
|
||||
if score >= 90:
|
||||
rating = "excellent"
|
||||
elif score >= 70:
|
||||
rating = "good"
|
||||
elif score >= 50:
|
||||
rating = "fair"
|
||||
else:
|
||||
rating = "needs attention"
|
||||
|
||||
return {"score": score, "rating": rating}
|
||||
|
||||
@mcp_tool(
    name="search_document",
    description="Search for text within a Word document and return matches with surrounding context and location information."
)
@handle_office_errors("Document search")
async def search_document(
    self,
    file_path: str = Field(description="Path to Word document or URL"),
    query: str = Field(description="Text to search for (case-insensitive)"),
    context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
    max_results: int = Field(default=20, description="Maximum number of results to return")
) -> dict[str, Any]:
    """Case-insensitively search every paragraph of a Word document.

    Each match is reported with an ellipsis-trimmed context window, the
    paragraph index and character offset, and the most recent chapter
    (Heading 1 / Title) and section headings encountered before it.
    Collection stops once *max_results* matches have been gathered.
    """
    from docx import Document

    started = time.time()
    local_path = await resolve_office_file_path(file_path)

    validation = await validate_office_file(local_path)
    if not validation["is_valid"]:
        raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

    document = Document(local_path)
    needle = query.lower()

    matches = []
    chapter_heading = None
    section_heading = None

    for idx, paragraph in enumerate(document.paragraphs):
        body = paragraph.text
        style = paragraph.style.name if paragraph.style else ""
        style_lc = style.lower()

        # Remember the nearest enclosing chapter/section so matches can
        # report where in the document they were found.
        if "heading" in style_lc or "title" in style_lc:
            if "1" in style or "title" in style_lc:
                chapter_heading = body.strip()[:80]
                section_heading = None
            else:
                section_heading = body.strip()[:80]

        # Scan this paragraph for (possibly overlapping) matches.
        body_lc = body.lower()
        offset = body_lc.find(needle)
        while offset != -1 and len(matches) < max_results:
            # Build an ellipsis-trimmed context window around the match.
            lo = max(0, offset - context_chars)
            hi = min(len(body), offset + len(query) + context_chars)
            snippet = body[lo:hi]
            if lo > 0:
                snippet = "..." + snippet
            if hi < len(body):
                snippet = snippet + "..."

            matches.append({
                "paragraph_index": idx,
                "position": offset,
                "context": snippet,
                "chapter": chapter_heading,
                "section": section_heading,
                "style": style
            })
            offset = body_lc.find(needle, offset + 1)

        if len(matches) >= max_results:
            break

    return {
        "query": query,
        "total_matches": len(matches),
        "results": matches,
        "search_time": round(time.time() - started, 3),
        "truncated": len(matches) >= max_results
    }
|
||||
@ -64,7 +64,7 @@ class TestMixinArchitecture:
|
||||
word = WordMixin()
|
||||
word.register_all(app)
|
||||
word_tools = len(app._tool_manager._tools) - initial_tool_count - universal_tools
|
||||
assert word_tools == 3 # convert_to_markdown, extract_word_tables, analyze_word_structure
|
||||
assert word_tools == 6 # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
|
||||
|
||||
excel = ExcelMixin()
|
||||
excel.register_all(app)
|
||||
|
||||
@ -149,8 +149,8 @@ class TestMixinIntegration:
|
||||
# Verify no duplicates
|
||||
assert len(tool_names) == len(set(tool_names)), "Tool names should be unique"
|
||||
|
||||
# Verify expected count: 6 universal + 3 word + 3 excel = 12
|
||||
assert len(tool_names) == 12, f"Expected 12 tools, got {len(tool_names)}: {list(tool_names.keys())}"
|
||||
# Verify expected count: 6 universal + 6 word + 3 excel = 15
|
||||
assert len(tool_names) == 15, f"Expected 15 tools, got {len(tool_names)}: {list(tool_names.keys())}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -28,14 +28,14 @@ class TestWordMixinRegistration:
|
||||
mixin.register_all(app)
|
||||
|
||||
assert mixin is not None
|
||||
assert len(app._tool_manager._tools) == 3 # convert_to_markdown, extract_word_tables, analyze_word_structure
|
||||
assert len(app._tool_manager._tools) == 6 # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
|
||||
|
||||
def test_tool_names_registered(self):
|
||||
"""Test that Word-specific tools are registered."""
|
||||
app = FastMCP("Test Word")
|
||||
WordMixin().register_all(app)
|
||||
|
||||
expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure"}
|
||||
expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure", "get_document_outline", "check_style_consistency", "search_document"}
|
||||
registered_tools = set(app._tool_manager._tools.keys())
|
||||
assert expected_tools.issubset(registered_tools)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user