Add document navigation tools: outline, style check, search

New tools for easier document navigation:
- get_document_outline: Structured view of headings with chapter detection
- check_style_consistency: Find formatting issues and missing chapters
- search_document: Search with context and chapter location

All tools tested against a 200+ page manuscript. The style check detects issues
such as Chapter 3 being styled as "Normal" instead of "Heading 1".
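
Example of registering the new tools (a minimal sketch mirroring the test setup
in this commit; both import paths are illustrative, not confirmed by the diff):

    from fastmcp import FastMCP            # assumed FastMCP import
    from office_tools.word import WordMixin  # hypothetical module path

    app = FastMCP("Office Tools")
    WordMixin().register_all(app)
    # Registers get_document_outline, check_style_consistency and search_document
    # alongside the existing convert_to_markdown, extract_word_tables and
    # analyze_word_structure tools.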
Ryan Malloy 2026-01-11 07:15:43 -07:00
parent 34e636e782
commit 1abce7f26d
5 changed files with 374 additions and 9 deletions

View File

@@ -1,9 +1,9 @@
 {
   "metadata": {
-    "start_time": "2026-01-11T05:19:25.816074",
+    "start_time": "2026-01-11T07:15:14.417108",
     "pytest_version": "9.0.2",
-    "end_time": "2026-01-11T05:19:26.468770",
+    "end_time": "2026-01-11T07:15:15.173732",
-    "duration": 0.6526906490325928,
+    "duration": 0.7566196918487549,
     "exit_status": 0
   },
   "summary": {

View File

@@ -634,4 +634,369 @@ class WordMixin(MCPMixin):
            stack.append(node)
        return tree

    # ==================== New Document Navigation Tools ====================

    @mcp_tool(
        name="get_document_outline",
        description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
    )
    @handle_office_errors("Document outline")
    async def get_document_outline(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
        detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
    ) -> dict[str, Any]:
        """Extract structured document outline with chapter detection."""
        from docx import Document
        from docx.oxml.ns import qn

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        outline = []
        current_section = None
        section_word_count = 0
        total_words = 0

        chapter_pattern = ["chapter", "section", "part", "introduction", "conclusion", "appendix", "preface", "epilogue"]

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            word_count = len(text.split()) if text else 0
            total_words += word_count

            # Check if this is a heading
            style_name = para.style.name.lower() if para.style else ""
            is_heading = "heading" in style_name or "title" in style_name

            # Determine heading level
            level = 0
            if is_heading:
                if "title" in style_name:
                    level = 0
                elif "heading 1" in style_name or style_name == "heading1":
                    level = 1
                elif "heading 2" in style_name or style_name == "heading2":
                    level = 2
                elif "heading 3" in style_name or style_name == "heading3":
                    level = 3
                elif "heading" in style_name:
                    # Try to extract number from style name
                    import re
                    match = re.search(r'heading\s*(\d+)', style_name)
                    level = int(match.group(1)) if match else 4

            if is_heading and text:
                # Save previous section's word count
                if current_section is not None and include_word_counts:
                    current_section["word_count"] = section_word_count

                # Detect if this is a chapter
                is_chapter = False
                chapter_number = None
                if detect_chapters:
                    text_lower = text.lower()
                    for pattern in chapter_pattern:
                        if pattern in text_lower:
                            is_chapter = True
                            # Try to extract chapter number
                            import re
                            match = re.search(r'(?:chapter|section|part)\s*(\d+)', text_lower)
                            if match:
                                chapter_number = int(match.group(1))
                            break

                current_section = {
                    "text": text[:150] + ("..." if len(text) > 150 else ""),
                    "level": level,
                    "style": para.style.name if para.style else "Unknown",
                    "paragraph_index": para_idx,
                    "is_chapter": is_chapter
                }
                if chapter_number is not None:
                    current_section["chapter_number"] = chapter_number

                outline.append(current_section)
                section_word_count = 0
            else:
                section_word_count += word_count

        # Don't forget last section
        if current_section is not None and include_word_counts:
            current_section["word_count"] = section_word_count

        # Build summary statistics
        chapters = [item for item in outline if item.get("is_chapter")]
        chapter_numbers = [c.get("chapter_number") for c in chapters if c.get("chapter_number")]

        # Detect missing chapters
        missing_chapters = []
        if chapter_numbers:
            expected = set(range(1, max(chapter_numbers) + 1))
            found = set(chapter_numbers)
            missing_chapters = sorted(expected - found)

        return {
            "outline": outline,
            "summary": {
                "total_headings": len(outline),
                "chapters_found": len(chapters),
                "chapter_numbers": chapter_numbers,
                "missing_chapters": missing_chapters,
                "total_words": total_words,
                "total_paragraphs": len(doc.paragraphs)
            },
            "extraction_time": round(time.time() - start_time, 3)
        }
    @mcp_tool(
        name="check_style_consistency",
        description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
    )
    @handle_office_errors("Style consistency check")
    async def check_style_consistency(
        self,
        file_path: str = Field(description="Path to Word document or URL")
    ) -> dict[str, Any]:
        """Check document for style and formatting consistency issues."""
        from docx import Document

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        issues = []
        warnings = []

        # Track heading styles and chapter detection
        heading_styles = {}
        chapters_by_style = {"heading": [], "other": []}
        chapter_numbers_found = []

        import re
        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            style_name = para.style.name if para.style else "None"
            style_lower = style_name.lower()

            # Track style usage
            heading_styles[style_name] = heading_styles.get(style_name, 0) + 1

            # Check for chapter-like text
            chapter_match = chapter_pattern.match(text)
            if chapter_match:
                chapter_num = int(chapter_match.group(1))
                chapter_numbers_found.append(chapter_num)

                is_heading_style = "heading" in style_lower
                if is_heading_style:
                    chapters_by_style["heading"].append({
                        "chapter": chapter_num,
                        "text": text[:80],
                        "style": style_name,
                        "paragraph": para_idx
                    })
                else:
                    chapters_by_style["other"].append({
                        "chapter": chapter_num,
                        "text": text[:80],
                        "style": style_name,
                        "paragraph": para_idx
                    })
                    issues.append({
                        "type": "inconsistent_chapter_style",
                        "severity": "warning",
                        "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style",
                        "paragraph": para_idx,
                        "text": text[:80]
                    })

            # Check for potential headings that aren't styled as headings
            if text and len(text) < 100 and not text.endswith('.'):
                is_heading_style = "heading" in style_lower or "title" in style_lower
                looks_like_heading = any(word in text.lower() for word in
                                         ["chapter", "section", "part", "introduction", "conclusion", "appendix"])
                if looks_like_heading and not is_heading_style:
                    warnings.append({
                        "type": "potential_heading_not_styled",
                        "message": f"Text looks like a heading but uses '{style_name}' style",
                        "paragraph": para_idx,
                        "text": text[:80]
                    })

        # Check for missing chapters in sequence
        missing_chapters = []
        if chapter_numbers_found:
            chapter_numbers_found.sort()
            expected = set(range(1, max(chapter_numbers_found) + 1))
            found = set(chapter_numbers_found)
            missing_chapters = sorted(expected - found)

        for missing in missing_chapters:
            issues.append({
                "type": "missing_chapter",
                "severity": "error",
                "message": f"Chapter {missing} appears to be missing from sequence",
                "expected_between": f"Chapter {missing-1} and Chapter {missing+1}" if missing > 1 else f"Before Chapter {missing+1}"
            })

        # Check for duplicate chapter numbers
        from collections import Counter
        chapter_counts = Counter(chapter_numbers_found)
        duplicates = {num: count for num, count in chapter_counts.items() if count > 1}
        for chapter_num, count in duplicates.items():
            issues.append({
                "type": "duplicate_chapter",
                "severity": "warning",
                "message": f"Chapter {chapter_num} appears {count} times"
            })

        # Summary of heading style usage
        heading_summary = {k: v for k, v in heading_styles.items()
                           if "heading" in k.lower() or "title" in k.lower()}

        return {
            "issues": issues,
            "warnings": warnings,
            "chapter_analysis": {
                "total_chapters": len(chapter_numbers_found),
                "chapters_with_heading_style": len(chapters_by_style["heading"]),
                "chapters_without_heading_style": len(chapters_by_style["other"]),
                "missing_chapters": missing_chapters,
                "duplicate_chapters": list(duplicates.keys()),
                "chapter_details": chapters_by_style
            },
            "style_usage": heading_summary,
            "health_score": self._calculate_doc_health_score(issues, warnings),
            "analysis_time": round(time.time() - start_time, 3)
        }
    def _calculate_doc_health_score(self, issues: list, warnings: list) -> dict:
        """Calculate document health score based on issues found."""
        score = 100
        for issue in issues:
            if issue.get("severity") == "error":
                score -= 10
            elif issue.get("severity") == "warning":
                score -= 5
        for _ in warnings:
            score -= 2

        score = max(0, min(100, score))

        if score >= 90:
            rating = "excellent"
        elif score >= 70:
            rating = "good"
        elif score >= 50:
            rating = "fair"
        else:
            rating = "needs attention"

        return {"score": score, "rating": rating}
    @mcp_tool(
        name="search_document",
        description="Search for text within a Word document and return matches with surrounding context and location information."
    )
    @handle_office_errors("Document search")
    async def search_document(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        query: str = Field(description="Text to search for (case-insensitive)"),
        context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
        max_results: int = Field(default=20, description="Maximum number of results to return")
    ) -> dict[str, Any]:
        """Search document for text with context."""
        from docx import Document

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        query_lower = query.lower()
        results = []
        current_chapter = None
        current_section = None

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text
            style_name = para.style.name if para.style else ""
            style_lower = style_name.lower()

            # Track current chapter/section for context
            if "heading" in style_lower or "title" in style_lower:
                if "1" in style_name or "title" in style_lower:
                    current_chapter = text.strip()[:80]
                    current_section = None
                else:
                    current_section = text.strip()[:80]

            # Search for matches
            text_lower = text.lower()
            search_start = 0
            while True:
                pos = text_lower.find(query_lower, search_start)
                if pos == -1:
                    break
                if len(results) >= max_results:
                    break

                # Extract context
                context_start = max(0, pos - context_chars)
                context_end = min(len(text), pos + len(query) + context_chars)
                context = text[context_start:context_end]
                if context_start > 0:
                    context = "..." + context
                if context_end < len(text):
                    context = context + "..."

                results.append({
                    "paragraph_index": para_idx,
                    "position": pos,
                    "context": context,
                    "chapter": current_chapter,
                    "section": current_section,
                    "style": style_name
                })
                search_start = pos + 1

            if len(results) >= max_results:
                break

        return {
            "query": query,
            "total_matches": len(results),
            "results": results,
            "search_time": round(time.time() - start_time, 3),
            "truncated": len(results) >= max_results
        }
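
For the Chapter 3 case called out in the commit message, check_style_consistency
reports an entry of this shape (keys taken from the issues.append call above;
the paragraph index and text are illustrative):

    {
        "type": "inconsistent_chapter_style",
        "severity": "warning",
        "message": "Chapter 3 uses 'Normal' instead of a Heading style",
        "paragraph": 42,
        "text": "Chapter 3"
    }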

View File

@@ -64,7 +64,7 @@ class TestMixinArchitecture:
         word = WordMixin()
         word.register_all(app)
         word_tools = len(app._tool_manager._tools) - initial_tool_count - universal_tools
-        assert word_tools == 3  # convert_to_markdown, extract_word_tables, analyze_word_structure
+        assert word_tools == 6  # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
         excel = ExcelMixin()
         excel.register_all(app)

View File

@@ -149,8 +149,8 @@ class TestMixinIntegration:
         # Verify no duplicates
         assert len(tool_names) == len(set(tool_names)), "Tool names should be unique"
-        # Verify expected count: 6 universal + 3 word + 3 excel = 12
+        # Verify expected count: 6 universal + 6 word + 3 excel = 15
-        assert len(tool_names) == 12, f"Expected 12 tools, got {len(tool_names)}: {list(tool_names.keys())}"
+        assert len(tool_names) == 15, f"Expected 15 tools, got {len(tool_names)}: {list(tool_names.keys())}"
 if __name__ == "__main__":

View File

@@ -28,14 +28,14 @@ class TestWordMixinRegistration:
         mixin.register_all(app)
         assert mixin is not None
-        assert len(app._tool_manager._tools) == 3  # convert_to_markdown, extract_word_tables, analyze_word_structure
+        assert len(app._tool_manager._tools) == 6  # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
     def test_tool_names_registered(self):
         """Test that Word-specific tools are registered."""
         app = FastMCP("Test Word")
         WordMixin().register_all(app)
-        expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure"}
+        expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure", "get_document_outline", "check_style_consistency", "search_document"}
         registered_tools = set(app._tool_manager._tools.keys())
         assert expected_tools.issubset(registered_tools)