Fix page range extraction for large documents and MCP connection
Bug fixes:
- Remove the 100-paragraph cap that prevented extracting content past ~page 4.
  Limits are now calculated from the number of pages requested (~300 paras/page).
- Add fallback page estimation when docs lack explicit page breaks, using
  ~25 paragraphs per page for navigation in non-paginated docs.
- Fix _get_available_headings to scan the full document (it previously scanned
  only the first 100 elements, so headings like "Chapter 10" at element 1524
  were invisible).
- Fix the MCP connection by disabling the FastMCP banner (show_banner=False);
  the ASCII-art banner was corrupting the stdout JSON-RPC protocol.

Changes:
- Default image_mode changed from 'base64' to 'files' to avoid huge responses.
- Add a proper .mcp.json config in command/args format.
- Add the test document to .gitignore for privacy.
parent 35869b6099
commit 210aa99e0b
.gitignore (vendored, +3)
@@ -78,3 +78,6 @@ tmp/
 # Temporary files created during processing
 *.tmp
 *.temp
+
+# Test documents (personal/private)
+ORIGINAL - The Other Side of the Bed*.docx
.mcp.json (new file, +9)
@@ -0,0 +1,9 @@
+{
+  "mcpServers": {
+    "office-tools": {
+      "type": "stdio",
+      "command": "uv",
+      "args": ["run", "python", "-m", "mcp_office_tools.server_monolithic"]
+    }
+  }
+}
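
For orientation, a hedged sketch of how a stdio MCP client consumes this config: it spawns the command with the given args and exchanges JSON-RPC over the pipes. The initialize payload below is abbreviated and illustrative, not the full MCP handshake.

import json
import subprocess

# Spawn the server exactly as .mcp.json describes: command + args, stdio transport.
proc = subprocess.Popen(
    ["uv", "run", "python", "-m", "mcp_office_tools.server_monolithic"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)

# Abbreviated initialize request (real clients send protocolVersion, capabilities, etc.).
proc.stdin.write(json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}}) + "\n")
proc.stdin.flush()

# The first line back must be pure JSON, which is exactly what the banner fix below protects.
response = json.loads(proc.stdout.readline())
print(response)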
@@ -44,15 +44,15 @@ class WordMixin(MCPMixin):
     async def convert_to_markdown(
         self,
         file_path: str = Field(description="Path to Office document or URL"),
-        include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
-        image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
-        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
+        include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
+        image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
+        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
         preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
         page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
         bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
         chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
         summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
-        output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')"),
+        output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."),
         # Pagination parameters
         limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
         cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),
@@ -287,15 +287,15 @@ async def analyze_document_health(
 @app.tool()
 async def convert_to_markdown(
     file_path: str = Field(description="Path to Office document or URL"),
-    include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
-    image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
-    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
+    include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
+    image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
+    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
     page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
     bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
     chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
     summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
-    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
+    output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name.")
 ) -> dict[str, Any]:
     """Convert Office documents to Markdown format with intelligent processing recommendations.
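
The default flip from 'base64' to 'files' is easiest to justify with back-of-envelope arithmetic: base64 encodes every 3 input bytes as 4 output characters, so a single image near the 1 MB max_image_size default adds roughly 1.4 million characters to the tool response.

# Base64 output size: ceil(n / 3) * 4 characters for n input bytes (~33% overhead).
image_bytes = 1024 * 1024                # one image at the max_image_size default
base64_chars = -(-image_bytes // 3) * 4  # ceiling division via the negation trick
print(base64_chars)                      # 1398104 characters embedded in the markdown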
@@ -1299,11 +1299,14 @@ async def _convert_docx_with_python_docx(
         max_chars = 100000
         bookmark_range = None
     elif page_numbers:
-        # For page ranges, severely limit content extraction
-        max_pages_requested = max(page_numbers) if page_numbers else 1
-        # Rough estimate: ~20-30 paragraphs per page
-        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
-        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
+        # For page ranges, allow sufficient content for requested pages
+        # Pages can vary wildly in paragraph count (some have 250+ paragraphs)
+        # Base limits on NUMBER of pages requested, not max page number
+        num_pages_requested = len(page_numbers)
+        # Allow ~300 paragraphs per page (generous for variable page sizes)
+        max_paragraphs = num_pages_requested * 300
+        # Allow ~50k chars per page (generous for text-heavy pages)
+        max_chars = num_pages_requested * 50000
         bookmark_range = None
         chapter_range = None
     else:
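
A worked example of the arithmetic this hunk changes (helper names are hypothetical; the numbers come from the diff). The old cap of min(max_page * 25, 100) meant the paragraph budget was exhausted after 100 / 25 = 4 pages, so requests for later pages returned nothing new:

# Old: limits keyed to the HIGHEST page number, hard-capped at 100 paragraphs.
def old_max_paragraphs(page_numbers: list[int]) -> int:
    max_page = max(page_numbers) if page_numbers else 1
    return min(max_page * 25, 100)

# New: limits keyed to HOW MANY pages were requested, with no hard cap.
def new_max_paragraphs(page_numbers: list[int]) -> int:
    return len(page_numbers) * 300

print(old_max_paragraphs([10, 11]))  # 100 -> budget gone long before page 10
print(new_max_paragraphs([10, 11]))  # 600 -> ample room for the two requested pages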
@@ -1315,6 +1318,19 @@ async def _convert_docx_with_python_docx(
     current_page = 1
     processed_paragraphs = 0
     total_chars = 0
+    paragraph_count_for_page = 0  # Track paragraphs for estimated page breaks
+    PARAGRAPHS_PER_PAGE = 25  # Estimate ~25 paragraphs per page for fallback
+
+    # Pre-scan to check if document has explicit page breaks
+    has_explicit_page_breaks = False
+    if page_numbers:
+        for element in doc.element.body:
+            if isinstance(element, CT_P):
+                para = Paragraph(element, doc)
+                if _has_page_break(para):
+                    has_explicit_page_breaks = True
+                    break
+
     include_current_page = not page_numbers or current_page in page_numbers
     table_of_contents = []  # Track headings with page numbers for TOC
@@ -1332,11 +1348,24 @@ async def _convert_docx_with_python_docx(
         if isinstance(element, CT_P):
             paragraph = Paragraph(element, doc)

-            # Check for page breaks
+            # Check for page breaks (explicit or estimated)
             if _has_page_break(paragraph):
+                # Explicit page break found
                 current_page += 1
+                paragraph_count_for_page = 0
                 include_current_page = not page_numbers or current_page in page_numbers
                 continue
+            elif page_numbers and not has_explicit_page_breaks:
+                # No explicit page breaks - use paragraph count estimation
+                paragraph_count_for_page += 1
+                if paragraph_count_for_page >= PARAGRAPHS_PER_PAGE:
+                    current_page += 1
+                    paragraph_count_for_page = 0
+                    include_current_page = not page_numbers or current_page in page_numbers

             # Skip content not in requested page range
             if not include_current_page:
                 continue

             # Process content with strict limits
             markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
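
The fallback estimation above reduces to a simple closed form; a standalone sketch (not the project's code) of the paragraph-to-page mapping it implies:

PARAGRAPHS_PER_PAGE = 25  # same estimate the diff uses for non-paginated docs

def estimated_page(paragraph_index: int) -> int:
    """Map a 0-based paragraph index to a 1-based estimated page number."""
    return paragraph_index // PARAGRAPHS_PER_PAGE + 1

assert estimated_page(0) == 1    # paragraphs 0-24 land on estimated page 1
assert estimated_page(24) == 1
assert estimated_page(25) == 2   # paragraph 25 opens estimated page 2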
@@ -1372,6 +1401,10 @@ async def _convert_docx_with_python_docx(
                 })

         elif isinstance(element, CT_Tbl):
+            # Skip tables not in requested page range
+            if not include_current_page:
+                continue
+
             # Process tables with strict limits
             if processed_paragraphs < max_paragraphs and total_chars < max_chars:
                 table = Table(element, doc)
@@ -1764,8 +1797,13 @@ async def _get_available_headings(doc) -> list[str]:
     try:
         headings = []

-        # Search through document elements for headings
-        for element in doc.element.body[:100]:  # Only check first 100 elements to avoid token issues
+        # Search through ALL document elements for headings (not limited to first 100)
+        # This ensures we find chapters at the end of long documents
+        for element in doc.element.body:
+            # Early exit if we have enough headings
+            if len(headings) >= 30:
+                break
+
             try:
                 if element.tag.endswith('}p'):  # Word paragraph element
                     # Get the text content
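
The same full-scan-with-early-exit shape, sketched against python-docx's public API for comparison (the project walks raw doc.element.body XML instead; this variant is illustrative, not a drop-in):

from docx import Document

def available_headings(path: str, max_headings: int = 30) -> list[str]:
    """Collect up to max_headings heading texts from the ENTIRE document."""
    headings = []
    for para in Document(path).paragraphs:    # no [:100] slice: scan everything
        if len(headings) >= max_headings:     # early exit keeps the response small
            break
        if para.style.name.startswith("Heading") and para.text.strip():
            headings.append(para.text.strip())
    return headings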
@@ -2202,7 +2240,9 @@ def main():
         return

-    # Run the FastMCP server
-    app.run()
+    # CRITICAL: show_banner=False is required for stdio transport!
+    # FastMCP's banner prints ASCII art to stdout which breaks JSON-RPC protocol
+    app.run(show_banner=False)


 if __name__ == "__main__":
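
To make the failure mode concrete: a stdio client reads stdout line by line and parses each line as JSON-RPC, so a single line of ASCII art aborts the handshake. A minimal illustration:

import json

protocol_line = '{"jsonrpc": "2.0", "id": 1, "result": {}}'
banner_line = "  _____         _   __  __  ____  "  # ASCII art, not JSON

json.loads(protocol_line)  # parses fine
try:
    json.loads(banner_line)
except json.JSONDecodeError:
    print("client sees non-JSON on stdout and drops the connection")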
@@ -409,5 +409,85 @@ class TestLegacyWordSupport:
         assert "conversion_method" in result["metadata"]


+class TestPageRangeFiltering:
+    """Test page_range content filtering for convert_to_markdown.
+
+    These tests verify that the page_range parameter correctly filters
+    content based on either explicit page breaks or estimated paragraph counts.
+    """
+
+    @pytest.fixture
+    def mixin(self):
+        """Create WordMixin for testing."""
+        app = FastMCP("Test")
+        mixin = WordMixin()
+        mixin.register_all(app)
+        return mixin
+
+    @pytest.mark.asyncio
+    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
+    @patch('mcp_office_tools.mixins.word.validate_office_file')
+    @patch('mcp_office_tools.mixins.word.detect_format')
+    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
+        """Test that different page_range values return different content.
+
+        This is the key regression test for the page_range bug where
+        include_current_page was set but never used to filter content.
+        """
+        mock_resolve.return_value = "/test.docx"
+        mock_validate.return_value = {"is_valid": True, "errors": []}
+        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}
+
+        with patch.object(mixin, '_analyze_document_size') as mock_analyze:
+            with patch.object(mixin, '_get_processing_recommendation') as mock_recommend:
+                mock_analyze.return_value = {"estimated_pages": 10}
+                mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}
+
+                # Create mock conversions that return different content per page
+                call_count = [0]
+                def mock_convert_side_effect(*args, **kwargs):
+                    call_count[0] += 1
+                    page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
+                    if page_numbers == [1, 2]:
+                        return {
+                            "content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.",
+                            "method_used": "python-docx-custom",
+                            "images": [],
+                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
+                        }
+                    elif page_numbers == [10, 11]:
+                        return {
+                            "content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.",
+                            "method_used": "python-docx-custom",
+                            "images": [],
+                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
+                        }
+                    else:
+                        return {
+                            "content": "# Full Content",
+                            "method_used": "python-docx-custom",
+                            "images": [],
+                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20}
+                        }
+
+                with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
+                    # Test page_range 1-2
+                    result_1_2 = await mixin.convert_to_markdown(
+                        file_path="/test.docx",
+                        page_range="1-2"
+                    )
+
+                    # Test page_range 10-11
+                    result_10_11 = await mixin.convert_to_markdown(
+                        file_path="/test.docx",
+                        page_range="10-11"
+                    )
+
+                # The content should be different for different page ranges
+                assert "Page 1-2" in result_1_2["markdown"]
+                assert "Page 10-11" in result_10_11["markdown"]
+                assert result_1_2["markdown"] != result_10_11["markdown"]
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])