Fix page range extraction for large documents and MCP connection

Bug fixes:
- Remove 100-paragraph cap that prevented extracting content past ~page 4
  Now calculates limit based on number of pages requested (300 paras/page)
- Add fallback page estimation when docs lack explicit page breaks
  Uses ~25 paragraphs per page for navigation in non-paginated docs
- Fix _get_available_headings to scan full document (was only first 100 elements)
  Headings like Chapter 10 at element 1524 were invisible
- Fix MCP connection by disabling FastMCP banner (show_banner=False)
  ASCII art banner was corrupting stdout JSON-RPC protocol

Changes:
- Default image_mode changed from 'base64' to 'files' to avoid huge responses
- Add proper .mcp.json config with command/args format
- Add test document to .gitignore for privacy
This commit is contained in:
Ryan Malloy 2026-01-11 04:27:56 -07:00
parent 35869b6099
commit 210aa99e0b
5 changed files with 158 additions and 26 deletions

3
.gitignore vendored
View File

@ -78,3 +78,6 @@ tmp/
# Temporary files created during processing # Temporary files created during processing
*.tmp *.tmp
*.temp *.temp
# Test documents (personal/private)
ORIGINAL - The Other Side of the Bed*.docx

9
.mcp.json Normal file
View File

@ -0,0 +1,9 @@
{
"mcpServers": {
"office-tools": {
"type": "stdio",
"command": "uv",
"args": ["run", "python", "-m", "mcp_office_tools.server_monolithic"]
}
}
}

View File

@ -44,15 +44,15 @@ class WordMixin(MCPMixin):
async def convert_to_markdown( async def convert_to_markdown(
self, self,
file_path: str = Field(description="Path to Office document or URL"), file_path: str = Field(description="Path to Office document or URL"),
include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"), include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"), page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."), bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."), chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"), summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')"), output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."),
# Pagination parameters # Pagination parameters
limit: int = Field(default=50, description="Maximum number of document sections to return per page"), limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"), cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),

View File

@ -287,15 +287,15 @@ async def analyze_document_health(
@app.tool() @app.tool()
async def convert_to_markdown( async def convert_to_markdown(
file_path: str = Field(description="Path to Office document or URL"), file_path: str = Field(description="Path to Office document or URL"),
include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"), include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"), page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."), bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."), chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"), summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')") output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name.")
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Convert Office documents to Markdown format with intelligent processing recommendations. """Convert Office documents to Markdown format with intelligent processing recommendations.
@ -1299,11 +1299,14 @@ async def _convert_docx_with_python_docx(
max_chars = 100000 max_chars = 100000
bookmark_range = None bookmark_range = None
elif page_numbers: elif page_numbers:
# For page ranges, severely limit content extraction # For page ranges, allow sufficient content for requested pages
max_pages_requested = max(page_numbers) if page_numbers else 1 # Pages can vary wildly in paragraph count (some have 250+ paragraphs)
# Rough estimate: ~20-30 paragraphs per page # Base limits on NUMBER of pages requested, not max page number
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max num_pages_requested = len(page_numbers)
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max # Allow ~300 paragraphs per page (generous for variable page sizes)
max_paragraphs = num_pages_requested * 300
# Allow ~50k chars per page (generous for text-heavy pages)
max_chars = num_pages_requested * 50000
bookmark_range = None bookmark_range = None
chapter_range = None chapter_range = None
else: else:
@ -1315,6 +1318,19 @@ async def _convert_docx_with_python_docx(
current_page = 1 current_page = 1
processed_paragraphs = 0 processed_paragraphs = 0
total_chars = 0 total_chars = 0
paragraph_count_for_page = 0 # Track paragraphs for estimated page breaks
PARAGRAPHS_PER_PAGE = 25 # Estimate ~25 paragraphs per page for fallback
# Pre-scan to check if document has explicit page breaks
has_explicit_page_breaks = False
if page_numbers:
for element in doc.element.body:
if isinstance(element, CT_P):
para = Paragraph(element, doc)
if _has_page_break(para):
has_explicit_page_breaks = True
break
include_current_page = not page_numbers or current_page in page_numbers include_current_page = not page_numbers or current_page in page_numbers
table_of_contents = [] # Track headings with page numbers for TOC table_of_contents = [] # Track headings with page numbers for TOC
@ -1332,11 +1348,24 @@ async def _convert_docx_with_python_docx(
if isinstance(element, CT_P): if isinstance(element, CT_P):
paragraph = Paragraph(element, doc) paragraph = Paragraph(element, doc)
# Check for page breaks # Check for page breaks (explicit or estimated)
if _has_page_break(paragraph): if _has_page_break(paragraph):
# Explicit page break found
current_page += 1 current_page += 1
paragraph_count_for_page = 0
include_current_page = not page_numbers or current_page in page_numbers include_current_page = not page_numbers or current_page in page_numbers
continue continue
elif page_numbers and not has_explicit_page_breaks:
# No explicit page breaks - use paragraph count estimation
paragraph_count_for_page += 1
if paragraph_count_for_page >= PARAGRAPHS_PER_PAGE:
current_page += 1
paragraph_count_for_page = 0
include_current_page = not page_numbers or current_page in page_numbers
# Skip content not in requested page range
if not include_current_page:
continue
# Process content with strict limits # Process content with strict limits
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
@ -1372,6 +1401,10 @@ async def _convert_docx_with_python_docx(
}) })
elif isinstance(element, CT_Tbl): elif isinstance(element, CT_Tbl):
# Skip tables not in requested page range
if not include_current_page:
continue
# Process tables with strict limits # Process tables with strict limits
if processed_paragraphs < max_paragraphs and total_chars < max_chars: if processed_paragraphs < max_paragraphs and total_chars < max_chars:
table = Table(element, doc) table = Table(element, doc)
@ -1764,8 +1797,13 @@ async def _get_available_headings(doc) -> list[str]:
try: try:
headings = [] headings = []
# Search through document elements for headings # Search through ALL document elements for headings (not limited to first 100)
for element in doc.element.body[:100]: # Only check first 100 elements to avoid token issues # This ensures we find chapters at the end of long documents
for element in doc.element.body:
# Early exit if we have enough headings
if len(headings) >= 30:
break
try: try:
if element.tag.endswith('}p'): # Word paragraph element if element.tag.endswith('}p'): # Word paragraph element
# Get the text content # Get the text content
@ -2202,7 +2240,9 @@ def main():
return return
# Run the FastMCP server # Run the FastMCP server
app.run() # CRITICAL: show_banner=False is required for stdio transport!
# FastMCP's banner prints ASCII art to stdout which breaks JSON-RPC protocol
app.run(show_banner=False)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -409,5 +409,85 @@ class TestLegacyWordSupport:
assert "conversion_method" in result["metadata"] assert "conversion_method" in result["metadata"]
class TestPageRangeFiltering:
"""Test page_range content filtering for convert_to_markdown.
These tests verify that the page_range parameter correctly filters
content based on either explicit page breaks or estimated paragraph counts.
"""
@pytest.fixture
def mixin(self):
"""Create WordMixin for testing."""
app = FastMCP("Test")
mixin = WordMixin()
mixin.register_all(app)
return mixin
@pytest.mark.asyncio
@patch('mcp_office_tools.mixins.word.resolve_office_file_path')
@patch('mcp_office_tools.mixins.word.validate_office_file')
@patch('mcp_office_tools.mixins.word.detect_format')
async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
"""Test that different page_range values return different content.
This is the key regression test for the page_range bug where
include_current_page was set but never used to filter content.
"""
mock_resolve.return_value = "/test.docx"
mock_validate.return_value = {"is_valid": True, "errors": []}
mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}
with patch.object(mixin, '_analyze_document_size') as mock_analyze:
with patch.object(mixin, '_get_processing_recommendation') as mock_recommend:
mock_analyze.return_value = {"estimated_pages": 10}
mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}
# Create mock conversions that return different content per page
call_count = [0]
def mock_convert_side_effect(*args, **kwargs):
call_count[0] += 1
page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
if page_numbers == [1, 2]:
return {
"content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.",
"method_used": "python-docx-custom",
"images": [],
"structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
}
elif page_numbers == [10, 11]:
return {
"content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.",
"method_used": "python-docx-custom",
"images": [],
"structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
}
else:
return {
"content": "# Full Content",
"method_used": "python-docx-custom",
"images": [],
"structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20}
}
with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
# Test page_range 1-2
result_1_2 = await mixin.convert_to_markdown(
file_path="/test.docx",
page_range="1-2"
)
# Test page_range 10-11
result_10_11 = await mixin.convert_to_markdown(
file_path="/test.docx",
page_range="10-11"
)
# The content should be different for different page ranges
assert "Page 1-2" in result_1_2["markdown"]
assert "Page 10-11" in result_10_11["markdown"]
assert result_1_2["markdown"] != result_10_11["markdown"]
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__, "-v"]) pytest.main([__file__, "-v"])