Add document navigation tools: outline, style check, search

New tools for easier document navigation:
- get_document_outline: Structured view of headings with chapter detection
- check_style_consistency: Find formatting issues and missing chapters
- search_document: Search with context and chapter location

All tools tested against a 200+ page manuscript. The style check detects issues
such as Chapter 3 being styled as "Normal" instead of "Heading 1".
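
Example of registering the new tools (a minimal sketch mirroring the test setup
in this commit; both import paths are illustrative, not confirmed by the diff):

    from fastmcp import FastMCP            # assumed FastMCP import
    from office_tools.word import WordMixin  # hypothetical module path

    app = FastMCP("Office Tools")
    WordMixin().register_all(app)
    # Registers get_document_outline, check_style_consistency and search_document
    # alongside the existing convert_to_markdown, extract_word_tables and
    # analyze_word_structure tools.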
Ryan Malloy 2026-01-11 07:15:43 -07:00
parent 34e636e782
commit 1abce7f26d
5 changed files with 374 additions and 9 deletions

View File

@@ -1,9 +1,9 @@
 {
   "metadata": {
-    "start_time": "2026-01-11T05:19:25.816074",
+    "start_time": "2026-01-11T07:15:14.417108",
     "pytest_version": "9.0.2",
-    "end_time": "2026-01-11T05:19:26.468770",
+    "end_time": "2026-01-11T07:15:15.173732",
-    "duration": 0.6526906490325928,
+    "duration": 0.7566196918487549,
     "exit_status": 0
   },
   "summary": {

View File

@@ -634,4 +634,369 @@ class WordMixin(MCPMixin):
            stack.append(node)
        return tree

    # ==================== New Document Navigation Tools ====================

    @mcp_tool(
        name="get_document_outline",
        description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
    )
    @handle_office_errors("Document outline")
    async def get_document_outline(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
        detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
    ) -> dict[str, Any]:
        """Extract structured document outline with chapter detection."""
        from docx import Document
        from docx.oxml.ns import qn

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        outline = []
        current_section = None
        section_word_count = 0
        total_words = 0

        chapter_pattern = ["chapter", "section", "part", "introduction", "conclusion", "appendix", "preface", "epilogue"]

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            word_count = len(text.split()) if text else 0
            total_words += word_count

            # Check if this is a heading
            style_name = para.style.name.lower() if para.style else ""
            is_heading = "heading" in style_name or "title" in style_name

            # Determine heading level
            level = 0
            if is_heading:
                if "title" in style_name:
                    level = 0
                elif "heading 1" in style_name or style_name == "heading1":
                    level = 1
                elif "heading 2" in style_name or style_name == "heading2":
                    level = 2
                elif "heading 3" in style_name or style_name == "heading3":
                    level = 3
                elif "heading" in style_name:
                    # Try to extract number from style name
                    import re
                    match = re.search(r'heading\s*(\d+)', style_name)
                    level = int(match.group(1)) if match else 4

            if is_heading and text:
                # Save previous section's word count
                if current_section is not None and include_word_counts:
                    current_section["word_count"] = section_word_count

                # Detect if this is a chapter
                is_chapter = False
                chapter_number = None
                if detect_chapters:
                    text_lower = text.lower()
                    for pattern in chapter_pattern:
                        if pattern in text_lower:
                            is_chapter = True
                            # Try to extract chapter number
                            import re
                            match = re.search(r'(?:chapter|section|part)\s*(\d+)', text_lower)
                            if match:
                                chapter_number = int(match.group(1))
                            break

                current_section = {
                    "text": text[:150] + ("..." if len(text) > 150 else ""),
                    "level": level,
                    "style": para.style.name if para.style else "Unknown",
                    "paragraph_index": para_idx,
                    "is_chapter": is_chapter
                }
                if chapter_number is not None:
                    current_section["chapter_number"] = chapter_number

                outline.append(current_section)
                section_word_count = 0
            else:
                section_word_count += word_count

        # Don't forget last section
        if current_section is not None and include_word_counts:
            current_section["word_count"] = section_word_count

        # Build summary statistics
        chapters = [item for item in outline if item.get("is_chapter")]
        chapter_numbers = [c.get("chapter_number") for c in chapters if c.get("chapter_number")]

        # Detect missing chapters
        missing_chapters = []
        if chapter_numbers:
            expected = set(range(1, max(chapter_numbers) + 1))
            found = set(chapter_numbers)
            missing_chapters = sorted(expected - found)

        return {
            "outline": outline,
            "summary": {
                "total_headings": len(outline),
                "chapters_found": len(chapters),
                "chapter_numbers": chapter_numbers,
                "missing_chapters": missing_chapters,
                "total_words": total_words,
                "total_paragraphs": len(doc.paragraphs)
            },
            "extraction_time": round(time.time() - start_time, 3)
        }
    @mcp_tool(
        name="check_style_consistency",
        description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
    )
    @handle_office_errors("Style consistency check")
    async def check_style_consistency(
        self,
        file_path: str = Field(description="Path to Word document or URL")
    ) -> dict[str, Any]:
        """Check document for style and formatting consistency issues."""
        from docx import Document

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        issues = []
        warnings = []

        # Track heading styles and chapter detection
        heading_styles = {}
        chapters_by_style = {"heading": [], "other": []}
        chapter_numbers_found = []

        import re
        chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            style_name = para.style.name if para.style else "None"
            style_lower = style_name.lower()

            # Track style usage
            heading_styles[style_name] = heading_styles.get(style_name, 0) + 1

            # Check for chapter-like text
            chapter_match = chapter_pattern.match(text)
            if chapter_match:
                chapter_num = int(chapter_match.group(1))
                chapter_numbers_found.append(chapter_num)

                is_heading_style = "heading" in style_lower
                if is_heading_style:
                    chapters_by_style["heading"].append({
                        "chapter": chapter_num,
                        "text": text[:80],
                        "style": style_name,
                        "paragraph": para_idx
                    })
                else:
                    chapters_by_style["other"].append({
                        "chapter": chapter_num,
                        "text": text[:80],
                        "style": style_name,
                        "paragraph": para_idx
                    })
                    issues.append({
                        "type": "inconsistent_chapter_style",
                        "severity": "warning",
                        "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style",
                        "paragraph": para_idx,
                        "text": text[:80]
                    })

            # Check for potential headings that aren't styled as headings
            if text and len(text) < 100 and not text.endswith('.'):
                is_heading_style = "heading" in style_lower or "title" in style_lower
                looks_like_heading = any(word in text.lower() for word in
                                         ["chapter", "section", "part", "introduction", "conclusion", "appendix"])
                if looks_like_heading and not is_heading_style:
                    warnings.append({
                        "type": "potential_heading_not_styled",
                        "message": f"Text looks like a heading but uses '{style_name}' style",
                        "paragraph": para_idx,
                        "text": text[:80]
                    })

        # Check for missing chapters in sequence
        missing_chapters = []
        if chapter_numbers_found:
            chapter_numbers_found.sort()
            expected = set(range(1, max(chapter_numbers_found) + 1))
            found = set(chapter_numbers_found)
            missing_chapters = sorted(expected - found)

        for missing in missing_chapters:
            issues.append({
                "type": "missing_chapter",
                "severity": "error",
                "message": f"Chapter {missing} appears to be missing from sequence",
                "expected_between": f"Chapter {missing-1} and Chapter {missing+1}" if missing > 1 else f"Before Chapter {missing+1}"
            })

        # Check for duplicate chapter numbers
        from collections import Counter
        chapter_counts = Counter(chapter_numbers_found)
        duplicates = {num: count for num, count in chapter_counts.items() if count > 1}
        for chapter_num, count in duplicates.items():
            issues.append({
                "type": "duplicate_chapter",
                "severity": "warning",
                "message": f"Chapter {chapter_num} appears {count} times"
            })

        # Summary of heading style usage
        heading_summary = {k: v for k, v in heading_styles.items()
                           if "heading" in k.lower() or "title" in k.lower()}

        return {
            "issues": issues,
            "warnings": warnings,
            "chapter_analysis": {
                "total_chapters": len(chapter_numbers_found),
                "chapters_with_heading_style": len(chapters_by_style["heading"]),
                "chapters_without_heading_style": len(chapters_by_style["other"]),
                "missing_chapters": missing_chapters,
                "duplicate_chapters": list(duplicates.keys()),
                "chapter_details": chapters_by_style
            },
            "style_usage": heading_summary,
            "health_score": self._calculate_doc_health_score(issues, warnings),
            "analysis_time": round(time.time() - start_time, 3)
        }
    def _calculate_doc_health_score(self, issues: list, warnings: list) -> dict:
        """Calculate document health score based on issues found."""
        score = 100
        for issue in issues:
            if issue.get("severity") == "error":
                score -= 10
            elif issue.get("severity") == "warning":
                score -= 5
        for _ in warnings:
            score -= 2

        score = max(0, min(100, score))

        if score >= 90:
            rating = "excellent"
        elif score >= 70:
            rating = "good"
        elif score >= 50:
            rating = "fair"
        else:
            rating = "needs attention"

        return {"score": score, "rating": rating}
    @mcp_tool(
        name="search_document",
        description="Search for text within a Word document and return matches with surrounding context and location information."
    )
    @handle_office_errors("Document search")
    async def search_document(
        self,
        file_path: str = Field(description="Path to Word document or URL"),
        query: str = Field(description="Text to search for (case-insensitive)"),
        context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
        max_results: int = Field(default=20, description="Maximum number of results to return")
    ) -> dict[str, Any]:
        """Search document for text with context."""
        from docx import Document

        start_time = time.time()
        local_path = await resolve_office_file_path(file_path)
        validation = await validate_office_file(local_path)
        if not validation["is_valid"]:
            raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

        doc = Document(local_path)
        query_lower = query.lower()
        results = []
        current_chapter = None
        current_section = None

        for para_idx, para in enumerate(doc.paragraphs):
            text = para.text
            style_name = para.style.name if para.style else ""
            style_lower = style_name.lower()

            # Track current chapter/section for context
            if "heading" in style_lower or "title" in style_lower:
                if "1" in style_name or "title" in style_lower:
                    current_chapter = text.strip()[:80]
                    current_section = None
                else:
                    current_section = text.strip()[:80]

            # Search for matches
            text_lower = text.lower()
            search_start = 0
            while True:
                pos = text_lower.find(query_lower, search_start)
                if pos == -1:
                    break
                if len(results) >= max_results:
                    break

                # Extract context
                context_start = max(0, pos - context_chars)
                context_end = min(len(text), pos + len(query) + context_chars)
                context = text[context_start:context_end]
                if context_start > 0:
                    context = "..." + context
                if context_end < len(text):
                    context = context + "..."

                results.append({
                    "paragraph_index": para_idx,
                    "position": pos,
                    "context": context,
                    "chapter": current_chapter,
                    "section": current_section,
                    "style": style_name
                })
                search_start = pos + 1

            if len(results) >= max_results:
                break

        return {
            "query": query,
            "total_matches": len(results),
            "results": results,
            "search_time": round(time.time() - start_time, 3),
            "truncated": len(results) >= max_results
        }
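
For the Chapter 3 case called out in the commit message, check_style_consistency
reports an entry of this shape (keys taken from the issues.append call above;
the paragraph index and text are illustrative):

    {
        "type": "inconsistent_chapter_style",
        "severity": "warning",
        "message": "Chapter 3 uses 'Normal' instead of a Heading style",
        "paragraph": 42,
        "text": "Chapter 3"
    }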

View File

@@ -64,7 +64,7 @@ class TestMixinArchitecture:
         word = WordMixin()
         word.register_all(app)
         word_tools = len(app._tool_manager._tools) - initial_tool_count - universal_tools
-        assert word_tools == 3  # convert_to_markdown, extract_word_tables, analyze_word_structure
+        assert word_tools == 6  # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
         excel = ExcelMixin()
         excel.register_all(app)

View File

@@ -149,8 +149,8 @@ class TestMixinIntegration:
         # Verify no duplicates
         assert len(tool_names) == len(set(tool_names)), "Tool names should be unique"
-        # Verify expected count: 6 universal + 3 word + 3 excel = 12
+        # Verify expected count: 6 universal + 6 word + 3 excel = 15
-        assert len(tool_names) == 12, f"Expected 12 tools, got {len(tool_names)}: {list(tool_names.keys())}"
+        assert len(tool_names) == 15, f"Expected 15 tools, got {len(tool_names)}: {list(tool_names.keys())}"
 if __name__ == "__main__":

View File

@@ -28,14 +28,14 @@ class TestWordMixinRegistration:
         mixin.register_all(app)
         assert mixin is not None
-        assert len(app._tool_manager._tools) == 3  # convert_to_markdown, extract_word_tables, analyze_word_structure
+        assert len(app._tool_manager._tools) == 6  # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
     def test_tool_names_registered(self):
         """Test that Word-specific tools are registered."""
         app = FastMCP("Test Word")
         WordMixin().register_all(app)
-        expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure"}
+        expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure", "get_document_outline", "check_style_consistency", "search_document"}
         registered_tools = set(app._tool_manager._tools.keys())
         assert expected_tools.issubset(registered_tools)