Fix page range extraction for large documents and MCP connection
Bug fixes: - Remove 100-paragraph cap that prevented extracting content past ~page 4 Now calculates limit based on number of pages requested (300 paras/page) - Add fallback page estimation when docs lack explicit page breaks Uses ~25 paragraphs per page for navigation in non-paginated docs - Fix _get_available_headings to scan full document (was only first 100 elements) Headings like Chapter 10 at element 1524 were invisible - Fix MCP connection by disabling FastMCP banner (show_banner=False) ASCII art banner was corrupting stdout JSON-RPC protocol Changes: - Default image_mode changed from 'base64' to 'files' to avoid huge responses - Add proper .mcp.json config with command/args format - Add test document to .gitignore for privacy
This commit is contained in:
parent
35869b6099
commit
210aa99e0b
3
.gitignore
vendored
3
.gitignore
vendored
@ -78,3 +78,6 @@ tmp/
|
|||||||
# Temporary files created during processing
|
# Temporary files created during processing
|
||||||
*.tmp
|
*.tmp
|
||||||
*.temp
|
*.temp
|
||||||
|
|
||||||
|
# Test documents (personal/private)
|
||||||
|
ORIGINAL - The Other Side of the Bed*.docx
|
||||||
9
.mcp.json
Normal file
9
.mcp.json
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"office-tools": {
|
||||||
|
"type": "stdio",
|
||||||
|
"command": "uv",
|
||||||
|
"args": ["run", "python", "-m", "mcp_office_tools.server_monolithic"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -44,15 +44,15 @@ class WordMixin(MCPMixin):
|
|||||||
async def convert_to_markdown(
|
async def convert_to_markdown(
|
||||||
self,
|
self,
|
||||||
file_path: str = Field(description="Path to Office document or URL"),
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
|
include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
|
||||||
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
|
image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
|
||||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
|
||||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
||||||
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
||||||
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
|
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
|
||||||
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
||||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')"),
|
output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."),
|
||||||
# Pagination parameters
|
# Pagination parameters
|
||||||
limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
|
limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
|
||||||
cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),
|
cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),
|
||||||
|
|||||||
@ -287,15 +287,15 @@ async def analyze_document_health(
|
|||||||
@app.tool()
|
@app.tool()
|
||||||
async def convert_to_markdown(
|
async def convert_to_markdown(
|
||||||
file_path: str = Field(description="Path to Office document or URL"),
|
file_path: str = Field(description="Path to Office document or URL"),
|
||||||
include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
|
include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
|
||||||
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
|
image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
|
||||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
|
||||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
||||||
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
|
||||||
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
|
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
|
||||||
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
||||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name.")
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert Office documents to Markdown format with intelligent processing recommendations.
|
"""Convert Office documents to Markdown format with intelligent processing recommendations.
|
||||||
|
|
||||||
@ -1299,11 +1299,14 @@ async def _convert_docx_with_python_docx(
|
|||||||
max_chars = 100000
|
max_chars = 100000
|
||||||
bookmark_range = None
|
bookmark_range = None
|
||||||
elif page_numbers:
|
elif page_numbers:
|
||||||
# For page ranges, severely limit content extraction
|
# For page ranges, allow sufficient content for requested pages
|
||||||
max_pages_requested = max(page_numbers) if page_numbers else 1
|
# Pages can vary wildly in paragraph count (some have 250+ paragraphs)
|
||||||
# Rough estimate: ~20-30 paragraphs per page
|
# Base limits on NUMBER of pages requested, not max page number
|
||||||
max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max
|
num_pages_requested = len(page_numbers)
|
||||||
max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max
|
# Allow ~300 paragraphs per page (generous for variable page sizes)
|
||||||
|
max_paragraphs = num_pages_requested * 300
|
||||||
|
# Allow ~50k chars per page (generous for text-heavy pages)
|
||||||
|
max_chars = num_pages_requested * 50000
|
||||||
bookmark_range = None
|
bookmark_range = None
|
||||||
chapter_range = None
|
chapter_range = None
|
||||||
else:
|
else:
|
||||||
@ -1315,6 +1318,19 @@ async def _convert_docx_with_python_docx(
|
|||||||
current_page = 1
|
current_page = 1
|
||||||
processed_paragraphs = 0
|
processed_paragraphs = 0
|
||||||
total_chars = 0
|
total_chars = 0
|
||||||
|
paragraph_count_for_page = 0 # Track paragraphs for estimated page breaks
|
||||||
|
PARAGRAPHS_PER_PAGE = 25 # Estimate ~25 paragraphs per page for fallback
|
||||||
|
|
||||||
|
# Pre-scan to check if document has explicit page breaks
|
||||||
|
has_explicit_page_breaks = False
|
||||||
|
if page_numbers:
|
||||||
|
for element in doc.element.body:
|
||||||
|
if isinstance(element, CT_P):
|
||||||
|
para = Paragraph(element, doc)
|
||||||
|
if _has_page_break(para):
|
||||||
|
has_explicit_page_breaks = True
|
||||||
|
break
|
||||||
|
|
||||||
include_current_page = not page_numbers or current_page in page_numbers
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
table_of_contents = [] # Track headings with page numbers for TOC
|
table_of_contents = [] # Track headings with page numbers for TOC
|
||||||
|
|
||||||
@ -1332,11 +1348,24 @@ async def _convert_docx_with_python_docx(
|
|||||||
if isinstance(element, CT_P):
|
if isinstance(element, CT_P):
|
||||||
paragraph = Paragraph(element, doc)
|
paragraph = Paragraph(element, doc)
|
||||||
|
|
||||||
# Check for page breaks
|
# Check for page breaks (explicit or estimated)
|
||||||
if _has_page_break(paragraph):
|
if _has_page_break(paragraph):
|
||||||
|
# Explicit page break found
|
||||||
current_page += 1
|
current_page += 1
|
||||||
|
paragraph_count_for_page = 0
|
||||||
include_current_page = not page_numbers or current_page in page_numbers
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
continue
|
continue
|
||||||
|
elif page_numbers and not has_explicit_page_breaks:
|
||||||
|
# No explicit page breaks - use paragraph count estimation
|
||||||
|
paragraph_count_for_page += 1
|
||||||
|
if paragraph_count_for_page >= PARAGRAPHS_PER_PAGE:
|
||||||
|
current_page += 1
|
||||||
|
paragraph_count_for_page = 0
|
||||||
|
include_current_page = not page_numbers or current_page in page_numbers
|
||||||
|
|
||||||
|
# Skip content not in requested page range
|
||||||
|
if not include_current_page:
|
||||||
|
continue
|
||||||
|
|
||||||
# Process content with strict limits
|
# Process content with strict limits
|
||||||
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
|
||||||
@ -1372,6 +1401,10 @@ async def _convert_docx_with_python_docx(
|
|||||||
})
|
})
|
||||||
|
|
||||||
elif isinstance(element, CT_Tbl):
|
elif isinstance(element, CT_Tbl):
|
||||||
|
# Skip tables not in requested page range
|
||||||
|
if not include_current_page:
|
||||||
|
continue
|
||||||
|
|
||||||
# Process tables with strict limits
|
# Process tables with strict limits
|
||||||
if processed_paragraphs < max_paragraphs and total_chars < max_chars:
|
if processed_paragraphs < max_paragraphs and total_chars < max_chars:
|
||||||
table = Table(element, doc)
|
table = Table(element, doc)
|
||||||
@ -1764,8 +1797,13 @@ async def _get_available_headings(doc) -> list[str]:
|
|||||||
try:
|
try:
|
||||||
headings = []
|
headings = []
|
||||||
|
|
||||||
# Search through document elements for headings
|
# Search through ALL document elements for headings (not limited to first 100)
|
||||||
for element in doc.element.body[:100]: # Only check first 100 elements to avoid token issues
|
# This ensures we find chapters at the end of long documents
|
||||||
|
for element in doc.element.body:
|
||||||
|
# Early exit if we have enough headings
|
||||||
|
if len(headings) >= 30:
|
||||||
|
break
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if element.tag.endswith('}p'): # Word paragraph element
|
if element.tag.endswith('}p'): # Word paragraph element
|
||||||
# Get the text content
|
# Get the text content
|
||||||
@ -2202,7 +2240,9 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Run the FastMCP server
|
# Run the FastMCP server
|
||||||
app.run()
|
# CRITICAL: show_banner=False is required for stdio transport!
|
||||||
|
# FastMCP's banner prints ASCII art to stdout which breaks JSON-RPC protocol
|
||||||
|
app.run(show_banner=False)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -409,5 +409,85 @@ class TestLegacyWordSupport:
|
|||||||
assert "conversion_method" in result["metadata"]
|
assert "conversion_method" in result["metadata"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestPageRangeFiltering:
|
||||||
|
"""Test page_range content filtering for convert_to_markdown.
|
||||||
|
|
||||||
|
These tests verify that the page_range parameter correctly filters
|
||||||
|
content based on either explicit page breaks or estimated paragraph counts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mixin(self):
|
||||||
|
"""Create WordMixin for testing."""
|
||||||
|
app = FastMCP("Test")
|
||||||
|
mixin = WordMixin()
|
||||||
|
mixin.register_all(app)
|
||||||
|
return mixin
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
@patch('mcp_office_tools.mixins.word.resolve_office_file_path')
|
||||||
|
@patch('mcp_office_tools.mixins.word.validate_office_file')
|
||||||
|
@patch('mcp_office_tools.mixins.word.detect_format')
|
||||||
|
async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
|
||||||
|
"""Test that different page_range values return different content.
|
||||||
|
|
||||||
|
This is the key regression test for the page_range bug where
|
||||||
|
include_current_page was set but never used to filter content.
|
||||||
|
"""
|
||||||
|
mock_resolve.return_value = "/test.docx"
|
||||||
|
mock_validate.return_value = {"is_valid": True, "errors": []}
|
||||||
|
mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}
|
||||||
|
|
||||||
|
with patch.object(mixin, '_analyze_document_size') as mock_analyze:
|
||||||
|
with patch.object(mixin, '_get_processing_recommendation') as mock_recommend:
|
||||||
|
mock_analyze.return_value = {"estimated_pages": 10}
|
||||||
|
mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}
|
||||||
|
|
||||||
|
# Create mock conversions that return different content per page
|
||||||
|
call_count = [0]
|
||||||
|
def mock_convert_side_effect(*args, **kwargs):
|
||||||
|
call_count[0] += 1
|
||||||
|
page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
|
||||||
|
if page_numbers == [1, 2]:
|
||||||
|
return {
|
||||||
|
"content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.",
|
||||||
|
"method_used": "python-docx-custom",
|
||||||
|
"images": [],
|
||||||
|
"structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
|
||||||
|
}
|
||||||
|
elif page_numbers == [10, 11]:
|
||||||
|
return {
|
||||||
|
"content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.",
|
||||||
|
"method_used": "python-docx-custom",
|
||||||
|
"images": [],
|
||||||
|
"structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
"content": "# Full Content",
|
||||||
|
"method_used": "python-docx-custom",
|
||||||
|
"images": [],
|
||||||
|
"structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20}
|
||||||
|
}
|
||||||
|
|
||||||
|
with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
|
||||||
|
# Test page_range 1-2
|
||||||
|
result_1_2 = await mixin.convert_to_markdown(
|
||||||
|
file_path="/test.docx",
|
||||||
|
page_range="1-2"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test page_range 10-11
|
||||||
|
result_10_11 = await mixin.convert_to_markdown(
|
||||||
|
file_path="/test.docx",
|
||||||
|
page_range="10-11"
|
||||||
|
)
|
||||||
|
|
||||||
|
# The content should be different for different page ranges
|
||||||
|
assert "Page 1-2" in result_1_2["markdown"]
|
||||||
|
assert "Page 10-11" in result_10_11["markdown"]
|
||||||
|
assert result_1_2["markdown"] != result_10_11["markdown"]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pytest.main([__file__, "-v"])
|
pytest.main([__file__, "-v"])
|
||||||
Loading…
x
Reference in New Issue
Block a user