diff --git a/reports/test_results.json b/reports/test_results.json
index 009899e..45bc471 100644
--- a/reports/test_results.json
+++ b/reports/test_results.json
@@ -1,9 +1,9 @@
 {
   "metadata": {
-    "start_time": "2026-01-11T05:19:25.816074",
+    "start_time": "2026-01-11T07:15:14.417108",
     "pytest_version": "9.0.2",
-    "end_time": "2026-01-11T05:19:26.468770",
-    "duration": 0.6526906490325928,
+    "end_time": "2026-01-11T07:15:15.173732",
+    "duration": 0.7566196918487549,
     "exit_status": 0
   },
   "summary": {
diff --git a/src/mcp_office_tools/mixins/word.py b/src/mcp_office_tools/mixins/word.py
index c8f4d62..8d1de0d 100644
--- a/src/mcp_office_tools/mixins/word.py
+++ b/src/mcp_office_tools/mixins/word.py
@@ -634,4 +634,369 @@ class WordMixin(MCPMixin):
             stack.append(node)
 
-        return tree
\ No newline at end of file
+        return tree
+
+    # ==================== New Document Navigation Tools ====================
+
+    @mcp_tool(
+        name="get_document_outline",
+        description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
+    )
+    @handle_office_errors("Document outline")
+    async def get_document_outline(
+        self,
+        file_path: str = Field(description="Path to Word document or URL"),
+        include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
+        detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
+    ) -> dict[str, Any]:
+        """Extract a structured document outline with chapter detection."""
+        import re
+
+        from docx import Document
+
+        start_time = time.time()
+        local_path = await resolve_office_file_path(file_path)
+
+        validation = await validate_office_file(local_path)
+        if not validation["is_valid"]:
+            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")
+
+        doc = Document(local_path)
+
+        outline = []
+        current_section = None
+        section_word_count = 0
+        total_words = 0
+        chapter_keywords = ["chapter", "section", "part", "introduction", "conclusion", "appendix", "preface", "epilogue"]
+
+        for para_idx, para in enumerate(doc.paragraphs):
+            text = para.text.strip()
+            word_count = len(text.split()) if text else 0
+            total_words += word_count
+
+            # Check if this is a heading
+            style_name = para.style.name.lower() if para.style else ""
+            is_heading = "heading" in style_name or "title" in style_name
+
+            # Determine heading level
+            level = 0
+            if is_heading:
+                if "title" in style_name:
+                    level = 0
+                elif "heading 1" in style_name or style_name == "heading1":
+                    level = 1
+                elif "heading 2" in style_name or style_name == "heading2":
+                    level = 2
+                elif "heading 3" in style_name or style_name == "heading3":
+                    level = 3
+                elif "heading" in style_name:
+                    # Try to extract the level number from the style name
+                    match = re.search(r'heading\s*(\d+)', style_name)
+                    level = int(match.group(1)) if match else 4
+
+            if is_heading and text:
+                # Save the previous section's word count
+                if current_section is not None and include_word_counts:
+                    current_section["word_count"] = section_word_count
+
+                # Detect whether this heading starts a chapter
+                is_chapter = False
+                chapter_number = None
+                if detect_chapters:
+                    text_lower = text.lower()
+                    for keyword in chapter_keywords:
+                        if keyword in text_lower:
+                            is_chapter = True
+                            # Try to extract the chapter number
+                            match = re.search(r'(?:chapter|section|part)\s*(\d+)', text_lower)
+                            if match:
+                                chapter_number = int(match.group(1))
+                            break
+
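+                # Record this heading as a new outline entry; long headings are
+                # truncated to 150 characters to keep the payload compact.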
{ + "text": text[:150] + ("..." if len(text) > 150 else ""), + "level": level, + "style": para.style.name if para.style else "Unknown", + "paragraph_index": para_idx, + "is_chapter": is_chapter + } + + if chapter_number is not None: + current_section["chapter_number"] = chapter_number + + outline.append(current_section) + section_word_count = 0 + else: + section_word_count += word_count + + # Don't forget last section + if current_section is not None and include_word_counts: + current_section["word_count"] = section_word_count + + # Build summary statistics + chapters = [item for item in outline if item.get("is_chapter")] + chapter_numbers = [c.get("chapter_number") for c in chapters if c.get("chapter_number")] + + # Detect missing chapters + missing_chapters = [] + if chapter_numbers: + expected = set(range(1, max(chapter_numbers) + 1)) + found = set(chapter_numbers) + missing_chapters = sorted(expected - found) + + return { + "outline": outline, + "summary": { + "total_headings": len(outline), + "chapters_found": len(chapters), + "chapter_numbers": chapter_numbers, + "missing_chapters": missing_chapters, + "total_words": total_words, + "total_paragraphs": len(doc.paragraphs) + }, + "extraction_time": round(time.time() - start_time, 3) + } + + @mcp_tool( + name="check_style_consistency", + description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters." + ) + @handle_office_errors("Style consistency check") + async def check_style_consistency( + self, + file_path: str = Field(description="Path to Word document or URL") + ) -> dict[str, Any]: + """Check document for style and formatting consistency issues.""" + from docx import Document + + start_time = time.time() + local_path = await resolve_office_file_path(file_path) + + validation = await validate_office_file(local_path) + if not validation["is_valid"]: + raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}") + + doc = Document(local_path) + + issues = [] + warnings = [] + + # Track heading styles and chapter detection + heading_styles = {} + chapters_by_style = {"heading": [], "other": []} + chapter_numbers_found = [] + + import re + chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE) + + for para_idx, para in enumerate(doc.paragraphs): + text = para.text.strip() + style_name = para.style.name if para.style else "None" + style_lower = style_name.lower() + + # Track style usage + heading_styles[style_name] = heading_styles.get(style_name, 0) + 1 + + # Check for chapter-like text + chapter_match = chapter_pattern.match(text) + if chapter_match: + chapter_num = int(chapter_match.group(1)) + chapter_numbers_found.append(chapter_num) + + is_heading_style = "heading" in style_lower + + if is_heading_style: + chapters_by_style["heading"].append({ + "chapter": chapter_num, + "text": text[:80], + "style": style_name, + "paragraph": para_idx + }) + else: + chapters_by_style["other"].append({ + "chapter": chapter_num, + "text": text[:80], + "style": style_name, + "paragraph": para_idx + }) + issues.append({ + "type": "inconsistent_chapter_style", + "severity": "warning", + "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style", + "paragraph": para_idx, + "text": text[:80] + }) + + # Check for potential headings that aren't styled as headings + if text and len(text) < 100 and not text.endswith('.'): + is_heading_style = "heading" in style_lower or "title" in style_lower + looks_like_heading = 
+    @mcp_tool(
+        name="check_style_consistency",
+        description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
+    )
+    @handle_office_errors("Style consistency check")
+    async def check_style_consistency(
+        self,
+        file_path: str = Field(description="Path to Word document or URL")
+    ) -> dict[str, Any]:
+        """Check a document for style and formatting consistency issues."""
+        import re
+        from collections import Counter
+
+        from docx import Document
+
+        start_time = time.time()
+        local_path = await resolve_office_file_path(file_path)
+
+        validation = await validate_office_file(local_path)
+        if not validation["is_valid"]:
+            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")
+
+        doc = Document(local_path)
+
+        issues = []
+        warnings = []
+
+        # Track heading styles and chapter detection
+        heading_styles = {}
+        chapters_by_style = {"heading": [], "other": []}
+        chapter_numbers_found = []
+
+        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)
+
+        for para_idx, para in enumerate(doc.paragraphs):
+            text = para.text.strip()
+            style_name = para.style.name if para.style else "None"
+            style_lower = style_name.lower()
+
+            # Track style usage
+            heading_styles[style_name] = heading_styles.get(style_name, 0) + 1
+
+            # Check for chapter-like text
+            chapter_match = chapter_pattern.match(text)
+            if chapter_match:
+                chapter_num = int(chapter_match.group(1))
+                chapter_numbers_found.append(chapter_num)
+
+                is_heading_style = "heading" in style_lower
+
+                if is_heading_style:
+                    chapters_by_style["heading"].append({
+                        "chapter": chapter_num,
+                        "text": text[:80],
+                        "style": style_name,
+                        "paragraph": para_idx
+                    })
+                else:
+                    chapters_by_style["other"].append({
+                        "chapter": chapter_num,
+                        "text": text[:80],
+                        "style": style_name,
+                        "paragraph": para_idx
+                    })
+                    issues.append({
+                        "type": "inconsistent_chapter_style",
+                        "severity": "warning",
+                        "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style",
+                        "paragraph": para_idx,
+                        "text": text[:80]
+                    })
+
+            # Check for potential headings that aren't styled as headings
+            if text and len(text) < 100 and not text.endswith('.'):
+                is_heading_style = "heading" in style_lower or "title" in style_lower
+                looks_like_heading = any(word in text.lower() for word in
+                                         ["chapter", "section", "part", "introduction", "conclusion", "appendix"])
+
+                if looks_like_heading and not is_heading_style:
+                    warnings.append({
+                        "type": "potential_heading_not_styled",
+                        "message": f"Text looks like a heading but uses '{style_name}' style",
+                        "paragraph": para_idx,
+                        "text": text[:80]
+                    })
+
+        # Check for missing chapters in the sequence
+        missing_chapters = []
+        if chapter_numbers_found:
+            chapter_numbers_found.sort()
+            expected = set(range(1, max(chapter_numbers_found) + 1))
+            found = set(chapter_numbers_found)
+            missing_chapters = sorted(expected - found)
+
+        for missing in missing_chapters:
+            issues.append({
+                "type": "missing_chapter",
+                "severity": "error",
+                "message": f"Chapter {missing} appears to be missing from the sequence",
+                "expected_between": f"Chapter {missing-1} and Chapter {missing+1}" if missing > 1 else f"Before Chapter {missing+1}"
+            })
+
+        # Check for duplicate chapter numbers
+        chapter_counts = Counter(chapter_numbers_found)
+        duplicates = {num: count for num, count in chapter_counts.items() if count > 1}
+        for chapter_num, count in duplicates.items():
+            issues.append({
+                "type": "duplicate_chapter",
+                "severity": "warning",
+                "message": f"Chapter {chapter_num} appears {count} times"
+            })
+
+        # Summary of heading style usage
+        heading_summary = {k: v for k, v in heading_styles.items()
+                           if "heading" in k.lower() or "title" in k.lower()}
+
+        return {
+            "issues": issues,
+            "warnings": warnings,
+            "chapter_analysis": {
+                "total_chapters": len(chapter_numbers_found),
+                "chapters_with_heading_style": len(chapters_by_style["heading"]),
+                "chapters_without_heading_style": len(chapters_by_style["other"]),
+                "missing_chapters": missing_chapters,
+                "duplicate_chapters": list(duplicates.keys()),
+                "chapter_details": chapters_by_style
+            },
+            "style_usage": heading_summary,
+            "health_score": self._calculate_doc_health_score(issues, warnings),
+            "analysis_time": round(time.time() - start_time, 3)
+        }
+
+    def _calculate_doc_health_score(self, issues: list, warnings: list) -> dict:
+        """Calculate a document health score based on the issues found."""
+        score = 100
+
+        for issue in issues:
+            if issue.get("severity") == "error":
+                score -= 10
+            elif issue.get("severity") == "warning":
+                score -= 5
+
+        for _ in warnings:
+            score -= 2
+
+        score = max(0, min(100, score))
+
+        if score >= 90:
+            rating = "excellent"
+        elif score >= 70:
+            rating = "good"
+        elif score >= 50:
+            rating = "fair"
+        else:
+            rating = "needs attention"
+
+        return {"score": score, "rating": rating}
+
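+    # Worked example: one missing chapter (an "error" issue, -10) plus two
+    # unstyled-heading warnings (-2 each) yields 100 - 10 - 4 = 86, i.e. "good".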
+    @mcp_tool(
+        name="search_document",
+        description="Search for text within a Word document and return matches with surrounding context and location information."
+    )
+    @handle_office_errors("Document search")
+    async def search_document(
+        self,
+        file_path: str = Field(description="Path to Word document or URL"),
+        query: str = Field(description="Text to search for (case-insensitive)"),
+        context_chars: int = Field(default=100, description="Number of characters of context before and after each match"),
+        max_results: int = Field(default=20, description="Maximum number of results to return")
+    ) -> dict[str, Any]:
+        """Search a document for text, returning each match with context."""
+        from docx import Document
+
+        start_time = time.time()
+        local_path = await resolve_office_file_path(file_path)
+
+        validation = await validate_office_file(local_path)
+        if not validation["is_valid"]:
+            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")
+
+        doc = Document(local_path)
+        query_lower = query.lower()
+
+        results = []
+        current_chapter = None
+        current_section = None
+
+        for para_idx, para in enumerate(doc.paragraphs):
+            text = para.text
+            style_name = para.style.name if para.style else ""
+            style_lower = style_name.lower()
+
+            # Track the current chapter/section so matches can report their location
+            if "heading" in style_lower or "title" in style_lower:
+                if "heading 1" in style_lower or "title" in style_lower:
+                    current_chapter = text.strip()[:80]
+                    current_section = None
+                else:
+                    current_section = text.strip()[:80]
+
+            # Search for matches
+            text_lower = text.lower()
+            search_start = 0
+
+            while True:
+                pos = text_lower.find(query_lower, search_start)
+                if pos == -1:
+                    break
+
+                if len(results) >= max_results:
+                    break
+
+                # Extract the surrounding context, with ellipses marking truncation
+                context_start = max(0, pos - context_chars)
+                context_end = min(len(text), pos + len(query) + context_chars)
+
+                context = text[context_start:context_end]
+                if context_start > 0:
+                    context = "..." + context
+                if context_end < len(text):
+                    context = context + "..."
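+
+                # Each hit records its offset plus the most recently seen
+                # chapter/section heading, so callers can jump straight to it.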
+                results.append({
+                    "paragraph_index": para_idx,
+                    "position": pos,
+                    "context": context,
+                    "chapter": current_chapter,
+                    "section": current_section,
+                    "style": style_name
+                })
+
+                search_start = pos + 1
+
+            if len(results) >= max_results:
+                break
+
+        return {
+            "query": query,
+            "total_matches": len(results),
+            "results": results,
+            "search_time": round(time.time() - start_time, 3),
+            "truncated": len(results) >= max_results
+        }
\ No newline at end of file
diff --git a/tests/test_mixins.py b/tests/test_mixins.py
index 6bcbf07..8a4a27b 100644
--- a/tests/test_mixins.py
+++ b/tests/test_mixins.py
@@ -64,7 +64,7 @@ class TestMixinArchitecture:
         word = WordMixin()
         word.register_all(app)
         word_tools = len(app._tool_manager._tools) - initial_tool_count - universal_tools
-        assert word_tools == 3  # convert_to_markdown, extract_word_tables, analyze_word_structure
+        assert word_tools == 6  # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
 
         excel = ExcelMixin()
         excel.register_all(app)
diff --git a/tests/test_server.py b/tests/test_server.py
index 896099a..8d52774 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -149,8 +149,8 @@ class TestMixinIntegration:
         # Verify no duplicates
         assert len(tool_names) == len(set(tool_names)), "Tool names should be unique"
 
-        # Verify expected count: 6 universal + 3 word + 3 excel = 12
-        assert len(tool_names) == 12, f"Expected 12 tools, got {len(tool_names)}: {list(tool_names.keys())}"
+        # Verify expected count: 6 universal + 6 word + 3 excel = 15
+        assert len(tool_names) == 15, f"Expected 15 tools, got {len(tool_names)}: {list(tool_names.keys())}"
 
 
 if __name__ == "__main__":
diff --git a/tests/test_word_mixin.py b/tests/test_word_mixin.py
index ac9c86a..4dc2376 100644
--- a/tests/test_word_mixin.py
+++ b/tests/test_word_mixin.py
@@ -28,14 +28,14 @@ class TestWordMixinRegistration:
         mixin.register_all(app)
 
         assert mixin is not None
-        assert len(app._tool_manager._tools) == 3  # convert_to_markdown, extract_word_tables, analyze_word_structure
+        assert len(app._tool_manager._tools) == 6  # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
 
     def test_tool_names_registered(self):
         """Test that Word-specific tools are registered."""
         app = FastMCP("Test Word")
         WordMixin().register_all(app)
 
-        expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure"}
+        expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure", "get_document_outline", "check_style_consistency", "search_document"}
         registered_tools = set(app._tool_manager._tools.keys())
 
         assert expected_tools.issubset(registered_tools)
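
Below is a minimal sketch of driving the three new tools directly on the mixin, bypassing the MCP transport. The import path comes from the diff; "manuscript.docx" and the asyncio driver are illustrative assumptions, and the Field-backed parameters are passed explicitly because their defaults are Field objects when the methods are called outside FastMCP:

    import asyncio

    from mcp_office_tools.mixins.word import WordMixin

    async def main() -> None:
        word = WordMixin()
        path = "manuscript.docx"  # hypothetical sample document

        # Structured outline with chapter detection
        outline = await word.get_document_outline(
            file_path=path, include_word_counts=True, detect_chapters=True
        )
        print("missing chapters:", outline["summary"]["missing_chapters"])

        # Style/consistency report with a 0-100 health score
        report = await word.check_style_consistency(file_path=path)
        print("health:", report["health_score"])

        # Case-insensitive search with surrounding context
        hits = await word.search_document(
            file_path=path, query="chapter", context_chars=100, max_results=20
        )
        print("matches:", hits["total_matches"])

    asyncio.run(main())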