Add document navigation tools: outline, style check, search
New tools for easier document navigation: - get_document_outline: Structured view of headings with chapter detection - check_style_consistency: Find formatting issues and missing chapters - search_document: Search with context and chapter location All tools tested with 200+ page manuscript. Detects issues like Chapter 3 being styled as "normal" instead of "Heading 1".
This commit is contained in:
parent
34e636e782
commit
1abce7f26d
@ -1,9 +1,9 @@
|
||||
{
|
||||
"metadata": {
|
||||
"start_time": "2026-01-11T05:19:25.816074",
|
||||
"start_time": "2026-01-11T07:15:14.417108",
|
||||
"pytest_version": "9.0.2",
|
||||
"end_time": "2026-01-11T05:19:26.468770",
|
||||
"duration": 0.6526906490325928,
|
||||
"end_time": "2026-01-11T07:15:15.173732",
|
||||
"duration": 0.7566196918487549,
|
||||
"exit_status": 0
|
||||
},
|
||||
"summary": {
|
||||
|
||||
@ -634,4 +634,369 @@ class WordMixin(MCPMixin):
|
||||
|
||||
stack.append(node)
|
||||
|
||||
return tree
|
||||
return tree
|
||||
|
||||
# ==================== New Document Navigation Tools ====================
|
||||
|
||||
@mcp_tool(
    name="get_document_outline",
    description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
)
@handle_office_errors("Document outline")
async def get_document_outline(
    self,
    file_path: str = Field(description="Path to Word document or URL"),
    include_word_counts: bool = Field(default=True, description="Include estimated word count per section"),
    detect_chapters: bool = Field(default=True, description="Detect and flag chapter headings specifically")
) -> dict[str, Any]:
    """Extract a structured outline of a Word document with chapter detection.

    Walks every paragraph once: heading/title-styled paragraphs become outline
    entries (truncated text, level, style name, paragraph index); body text
    between headings is attributed to the preceding entry as a word count when
    *include_word_counts* is set; headings containing chapter-like keywords are
    flagged and, where possible, assigned a numeric chapter number so gaps in
    the 1..max sequence can be reported.

    Returns:
        dict with "outline" (list of heading entries), "summary" statistics
        (counts, chapter numbers, missing chapters), and "extraction_time".

    Raises:
        OfficeFileError: if the resolved file fails validation.
    """
    import re

    from docx import Document

    start_time = time.time()
    local_path = await resolve_office_file_path(file_path)

    validation = await validate_office_file(local_path)
    if not validation["is_valid"]:
        raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

    doc = Document(local_path)

    outline = []
    current_section = None
    section_word_count = 0
    total_words = 0
    # Keywords that suggest a heading introduces a chapter-like division.
    chapter_keywords = ["chapter", "section", "part", "introduction", "conclusion", "appendix", "preface", "epilogue"]
    # Compiled once here; the original re-imported `re` and re-compiled these
    # inside the paragraph loop on every heading.
    heading_level_re = re.compile(r'heading\s*(\d+)')
    chapter_number_re = re.compile(r'(?:chapter|section|part)\s*(\d+)')

    for para_idx, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        word_count = len(text.split()) if text else 0
        total_words += word_count

        # Check if this is a heading
        style_name = para.style.name.lower() if para.style else ""
        is_heading = "heading" in style_name or "title" in style_name

        # Determine heading level: 0 = title, 1-3 matched explicitly,
        # otherwise parsed from the style name (fallback 4).
        level = 0
        if is_heading:
            if "title" in style_name:
                level = 0
            elif "heading 1" in style_name or style_name == "heading1":
                level = 1
            elif "heading 2" in style_name or style_name == "heading2":
                level = 2
            elif "heading 3" in style_name or style_name == "heading3":
                level = 3
            elif "heading" in style_name:
                # Try to extract the numeric level from the style name.
                match = heading_level_re.search(style_name)
                level = int(match.group(1)) if match else 4

        if is_heading and text:
            # Save the previous section's accumulated word count.
            if current_section is not None and include_word_counts:
                current_section["word_count"] = section_word_count

            # Detect whether this heading looks like a chapter division.
            is_chapter = False
            chapter_number = None
            if detect_chapters:
                text_lower = text.lower()
                for keyword in chapter_keywords:
                    if keyword in text_lower:
                        is_chapter = True
                        # Try to extract an explicit chapter number.
                        match = chapter_number_re.search(text_lower)
                        if match:
                            chapter_number = int(match.group(1))
                        break

            current_section = {
                "text": text[:150] + ("..." if len(text) > 150 else ""),
                "level": level,
                "style": para.style.name if para.style else "Unknown",
                "paragraph_index": para_idx,
                "is_chapter": is_chapter
            }

            if chapter_number is not None:
                current_section["chapter_number"] = chapter_number

            outline.append(current_section)
            section_word_count = 0
        else:
            section_word_count += word_count

    # Don't forget the last section's word count.
    if current_section is not None and include_word_counts:
        current_section["word_count"] = section_word_count

    # Build summary statistics
    chapters = [item for item in outline if item.get("is_chapter")]
    chapter_numbers = [c.get("chapter_number") for c in chapters if c.get("chapter_number")]

    # Detect gaps in the 1..max chapter-number sequence.
    missing_chapters = []
    if chapter_numbers:
        expected = set(range(1, max(chapter_numbers) + 1))
        found = set(chapter_numbers)
        missing_chapters = sorted(expected - found)

    return {
        "outline": outline,
        "summary": {
            "total_headings": len(outline),
            "chapters_found": len(chapters),
            "chapter_numbers": chapter_numbers,
            "missing_chapters": missing_chapters,
            "total_words": total_words,
            "total_paragraphs": len(doc.paragraphs)
        },
        "extraction_time": round(time.time() - start_time, 3)
    }
|
||||
|
||||
@mcp_tool(
    name="check_style_consistency",
    description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
)
@handle_office_errors("Style consistency check")
async def check_style_consistency(
    self,
    file_path: str = Field(description="Path to Word document or URL")
) -> dict[str, Any]:
    """Check a Word document for style and formatting consistency issues.

    Scans every paragraph for "Chapter N" lines, recording which style each
    uses; chapters not using a Heading style become issues, gaps and
    duplicates in the chapter-number sequence become issues, and short
    heading-like lines (e.g. "Introduction") that are not heading-styled
    become warnings. A health score summarizes the findings.

    Returns:
        dict with "issues", "warnings", "chapter_analysis", "style_usage",
        "health_score", and "analysis_time".

    Raises:
        OfficeFileError: if the resolved file fails validation.
    """
    import re
    from collections import Counter

    from docx import Document

    start_time = time.time()
    local_path = await resolve_office_file_path(file_path)

    validation = await validate_office_file(local_path)
    if not validation["is_valid"]:
        raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

    doc = Document(local_path)

    issues = []
    warnings = []

    # Track heading styles and chapter detection
    heading_styles = {}
    chapters_by_style = {"heading": [], "other": []}
    chapter_numbers_found = []

    chapter_pattern = re.compile(r'^chapter\s*(\d+)', re.IGNORECASE)

    for para_idx, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style_name = para.style.name if para.style else "None"
        style_lower = style_name.lower()

        # Track style usage (every style, filtered to headings at the end)
        heading_styles[style_name] = heading_styles.get(style_name, 0) + 1

        # Check for chapter-like text
        chapter_match = chapter_pattern.match(text)
        if chapter_match:
            chapter_num = int(chapter_match.group(1))
            chapter_numbers_found.append(chapter_num)

            is_heading_style = "heading" in style_lower

            if is_heading_style:
                chapters_by_style["heading"].append({
                    "chapter": chapter_num,
                    "text": text[:80],
                    "style": style_name,
                    "paragraph": para_idx
                })
            else:
                chapters_by_style["other"].append({
                    "chapter": chapter_num,
                    "text": text[:80],
                    "style": style_name,
                    "paragraph": para_idx
                })
                issues.append({
                    "type": "inconsistent_chapter_style",
                    "severity": "warning",
                    "message": f"Chapter {chapter_num} uses '{style_name}' instead of a Heading style",
                    "paragraph": para_idx,
                    "text": text[:80]
                })

        # Check for potential headings that aren't styled as headings
        if text and len(text) < 100 and not text.endswith('.'):
            is_heading_style = "heading" in style_lower or "title" in style_lower
            looks_like_heading = any(word in text.lower() for word in
                ["chapter", "section", "part", "introduction", "conclusion", "appendix"])

            # Mis-styled "Chapter N" lines were already reported as an
            # inconsistent_chapter_style issue above; skip them here so the
            # same paragraph is not double-reported (which also double-counted
            # against the health score).
            if looks_like_heading and not is_heading_style and chapter_match is None:
                warnings.append({
                    "type": "potential_heading_not_styled",
                    "message": f"Text looks like a heading but uses '{style_name}' style",
                    "paragraph": para_idx,
                    "text": text[:80]
                })

    # Check for missing chapters in sequence
    missing_chapters = []
    if chapter_numbers_found:
        chapter_numbers_found.sort()
        expected = set(range(1, max(chapter_numbers_found) + 1))
        found = set(chapter_numbers_found)
        missing_chapters = sorted(expected - found)

        for missing in missing_chapters:
            issues.append({
                "type": "missing_chapter",
                "severity": "error",
                "message": f"Chapter {missing} appears to be missing from sequence",
                "expected_between": f"Chapter {missing-1} and Chapter {missing+1}" if missing > 1 else f"Before Chapter {missing+1}"
            })

    # Check for duplicate chapter numbers
    chapter_counts = Counter(chapter_numbers_found)
    duplicates = {num: count for num, count in chapter_counts.items() if count > 1}
    for chapter_num, count in duplicates.items():
        issues.append({
            "type": "duplicate_chapter",
            "severity": "warning",
            "message": f"Chapter {chapter_num} appears {count} times"
        })

    # Summary of heading style usage
    heading_summary = {k: v for k, v in heading_styles.items()
                       if "heading" in k.lower() or "title" in k.lower()}

    return {
        "issues": issues,
        "warnings": warnings,
        "chapter_analysis": {
            "total_chapters": len(chapter_numbers_found),
            "chapters_with_heading_style": len(chapters_by_style["heading"]),
            "chapters_without_heading_style": len(chapters_by_style["other"]),
            "missing_chapters": missing_chapters,
            "duplicate_chapters": list(duplicates.keys()),
            "chapter_details": chapters_by_style
        },
        "style_usage": heading_summary,
        "health_score": self._calculate_doc_health_score(issues, warnings),
        "analysis_time": round(time.time() - start_time, 3)
    }
|
||||
|
||||
def _calculate_doc_health_score(self, issues: list, warnings: list) -> dict:
|
||||
"""Calculate document health score based on issues found."""
|
||||
score = 100
|
||||
|
||||
for issue in issues:
|
||||
if issue.get("severity") == "error":
|
||||
score -= 10
|
||||
elif issue.get("severity") == "warning":
|
||||
score -= 5
|
||||
|
||||
for _ in warnings:
|
||||
score -= 2
|
||||
|
||||
score = max(0, min(100, score))
|
||||
|
||||
if score >= 90:
|
||||
rating = "excellent"
|
||||
elif score >= 70:
|
||||
rating = "good"
|
||||
elif score >= 50:
|
||||
rating = "fair"
|
||||
else:
|
||||
rating = "needs attention"
|
||||
|
||||
return {"score": score, "rating": rating}
|
||||
|
||||
@mcp_tool(
    name="search_document",
    description="Search for text within a Word document and return matches with surrounding context and location information."
)
@handle_office_errors("Document search")
async def search_document(
    self,
    file_path: str = Field(description="Path to Word document or URL"),
    query: str = Field(description="Text to search for (case-insensitive)"),
    context_chars: int = Field(default=100, description="Number of characters of context before and after match"),
    max_results: int = Field(default=20, description="Maximum number of results to return")
) -> dict[str, Any]:
    """Case-insensitively search every paragraph of a Word document.

    Each match is reported with an ellipsis-trimmed context window, the
    paragraph index and character offset, and the most recent chapter
    (Heading 1 / Title) and section headings encountered before it.
    Collection stops once *max_results* matches have been gathered.
    """
    from docx import Document

    started = time.time()
    local_path = await resolve_office_file_path(file_path)

    validation = await validate_office_file(local_path)
    if not validation["is_valid"]:
        raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

    document = Document(local_path)
    needle = query.lower()

    matches = []
    chapter_heading = None
    section_heading = None

    for idx, paragraph in enumerate(document.paragraphs):
        body = paragraph.text
        style = paragraph.style.name if paragraph.style else ""
        style_lc = style.lower()

        # Remember the nearest enclosing chapter/section so matches can
        # report where in the document they were found.
        if "heading" in style_lc or "title" in style_lc:
            if "1" in style or "title" in style_lc:
                chapter_heading = body.strip()[:80]
                section_heading = None
            else:
                section_heading = body.strip()[:80]

        # Scan this paragraph for (possibly overlapping) matches.
        body_lc = body.lower()
        offset = body_lc.find(needle)
        while offset != -1 and len(matches) < max_results:
            # Build an ellipsis-trimmed context window around the match.
            lo = max(0, offset - context_chars)
            hi = min(len(body), offset + len(query) + context_chars)
            snippet = body[lo:hi]
            if lo > 0:
                snippet = "..." + snippet
            if hi < len(body):
                snippet = snippet + "..."

            matches.append({
                "paragraph_index": idx,
                "position": offset,
                "context": snippet,
                "chapter": chapter_heading,
                "section": section_heading,
                "style": style
            })
            offset = body_lc.find(needle, offset + 1)

        if len(matches) >= max_results:
            break

    return {
        "query": query,
        "total_matches": len(matches),
        "results": matches,
        "search_time": round(time.time() - started, 3),
        "truncated": len(matches) >= max_results
    }
|
||||
@ -64,7 +64,7 @@ class TestMixinArchitecture:
|
||||
word = WordMixin()
|
||||
word.register_all(app)
|
||||
word_tools = len(app._tool_manager._tools) - initial_tool_count - universal_tools
|
||||
assert word_tools == 3 # convert_to_markdown, extract_word_tables, analyze_word_structure
|
||||
assert word_tools == 6 # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
|
||||
|
||||
excel = ExcelMixin()
|
||||
excel.register_all(app)
|
||||
|
||||
@ -149,8 +149,8 @@ class TestMixinIntegration:
|
||||
# Verify no duplicates
|
||||
assert len(tool_names) == len(set(tool_names)), "Tool names should be unique"
|
||||
|
||||
# Verify expected count: 6 universal + 3 word + 3 excel = 12
|
||||
assert len(tool_names) == 12, f"Expected 12 tools, got {len(tool_names)}: {list(tool_names.keys())}"
|
||||
# Verify expected count: 6 universal + 6 word + 3 excel = 15
|
||||
assert len(tool_names) == 15, f"Expected 15 tools, got {len(tool_names)}: {list(tool_names.keys())}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -28,14 +28,14 @@ class TestWordMixinRegistration:
|
||||
mixin.register_all(app)
|
||||
|
||||
assert mixin is not None
|
||||
assert len(app._tool_manager._tools) == 3 # convert_to_markdown, extract_word_tables, analyze_word_structure
|
||||
assert len(app._tool_manager._tools) == 6 # convert_to_markdown, extract_word_tables, analyze_word_structure, get_document_outline, check_style_consistency, search_document
|
||||
|
||||
def test_tool_names_registered(self):
|
||||
"""Test that Word-specific tools are registered."""
|
||||
app = FastMCP("Test Word")
|
||||
WordMixin().register_all(app)
|
||||
|
||||
expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure"}
|
||||
expected_tools = {"convert_to_markdown", "extract_word_tables", "analyze_word_structure", "get_document_outline", "check_style_consistency", "search_document"}
|
||||
registered_tools = set(app._tool_manager._tools.keys())
|
||||
assert expected_tools.issubset(registered_tools)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user