Fix page range extraction for large documents and MCP connection

Bug fixes: - Remove 100-paragraph cap that prevented extracting content past ~page 4 Now calculates limit based on number of pages requested (300 paras/page) - Add fallback page estimation when docs lack explicit page breaks Uses ~25 paragraphs per page for navigation in non-paginated docs - Fix _get_available_headings to scan full document (was only first 100 elements) Headings like Chapter 10 at element 1524 were invisible - Fix MCP connection by disabling FastMCP banner (show_banner=False) ASCII art banner was corrupting stdout JSON-RPC protocol Changes: - Default image_mode changed from 'base64' to 'files' to avoid huge responses - Add proper .mcp.json config with command/args format - Add test document to .gitignore for privacy
2026-01-11 04:27:56 -07:00 · 2026-01-11 04:27:56 -07:00 · 210aa99e0b
commit 210aa99e0b
parent 35869b6099
5 changed files with 158 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -78,3 +78,6 @@ tmp/
 # Temporary files created during processing
 *.tmp
 *.temp
 # Test documents (personal/private)
 ORIGINAL - The Other Side of the Bed*.docx
--- a/.mcp.json
+++ b/.mcp.json
@ -0,0 +1,9 @@
 {
  "mcpServers": {
    "office-tools": {
      "type": "stdio",
      "command": "uv",
      "args": ["run", "python", "-m", "mcp_office_tools.server_monolithic"]
    }
  }
 }
--- a/src/mcp_office_tools/mixins/word.py
+++ b/src/mcp_office_tools/mixins/word.py
@ -44,15 +44,15 @@ class WordMixin(MCPMixin):
    async def convert_to_markdown(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
-        include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
+        include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
-        image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
+        image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
-        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
+        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
        preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
        page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
        bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
        chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
        summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
-        output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')"),
+        output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."),
        # Pagination parameters
        limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
        cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),
--- a/src/mcp_office_tools/server_monolithic.py
+++ b/src/mcp_office_tools/server_monolithic.py
@ -287,15 +287,15 @@ async def analyze_document_health(
@app.tool()
 async def convert_to_markdown(
    file_path: str = Field(description="Path to Office document or URL"),
-    include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
+    include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
-    image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
+    image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
-    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
+    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
    preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
    bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
    chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
    summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
-    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
+    output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name.")
 ) -> dict[str, Any]:
    """Convert Office documents to Markdown format with intelligent processing recommendations.
@ -1299,11 +1299,14 @@ async def _convert_docx_with_python_docx(
        max_chars = 100000
        bookmark_range = None
    elif page_numbers:
-        # For page ranges, severely limit content extraction
+        # For page ranges, allow sufficient content for requested pages
-        max_pages_requested = max(page_numbers) if page_numbers else 1
+        # Pages can vary wildly in paragraph count (some have 250+ paragraphs)
-        # Rough estimate: ~20-30 paragraphs per page
+        # Base limits on NUMBER of pages requested, not max page number
-        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
+        num_pages_requested = len(page_numbers)
-        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
+        # Allow ~300 paragraphs per page (generous for variable page sizes)
        max_paragraphs = num_pages_requested * 300
        # Allow ~50k chars per page (generous for text-heavy pages)
        max_chars = num_pages_requested * 50000
        bookmark_range = None
        chapter_range = None
    else:
@ -1315,6 +1318,19 @@ async def _convert_docx_with_python_docx(
    current_page = 1
    processed_paragraphs = 0
    total_chars = 0
    paragraph_count_for_page = 0  # Track paragraphs for estimated page breaks
    PARAGRAPHS_PER_PAGE = 25  # Estimate ~25 paragraphs per page for fallback
    # Pre-scan to check if document has explicit page breaks
    has_explicit_page_breaks = False
    if page_numbers:
        for element in doc.element.body:
            if isinstance(element, CT_P):
                para = Paragraph(element, doc)
                if _has_page_break(para):
                    has_explicit_page_breaks = True
                    break
    include_current_page = not page_numbers or current_page in page_numbers
    table_of_contents = []  # Track headings with page numbers for TOC
@ -1332,11 +1348,24 @@ async def _convert_docx_with_python_docx(
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, doc)
-            # Check for page breaks
+            # Check for page breaks (explicit or estimated)
            if _has_page_break(paragraph):
                # Explicit page break found
                current_page += 1
                paragraph_count_for_page = 0
                include_current_page = not page_numbers or current_page in page_numbers
                continue
            elif page_numbers and not has_explicit_page_breaks:
                # No explicit page breaks - use paragraph count estimation
                paragraph_count_for_page += 1
                if paragraph_count_for_page >= PARAGRAPHS_PER_PAGE:
                    current_page += 1
                    paragraph_count_for_page = 0
                    include_current_page = not page_numbers or current_page in page_numbers
            # Skip content not in requested page range
            if not include_current_page:
                continue
            # Process content with strict limits
            markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
@ -1372,6 +1401,10 @@ async def _convert_docx_with_python_docx(
                    })
        elif isinstance(element, CT_Tbl):
            # Skip tables not in requested page range
            if not include_current_page:
                continue
            # Process tables with strict limits
            if processed_paragraphs < max_paragraphs and total_chars < max_chars:
                table = Table(element, doc)
@ -1764,8 +1797,13 @@ async def _get_available_headings(doc) -> list[str]:
    try:
        headings = []
-        # Search through document elements for headings
+        # Search through ALL document elements for headings (not limited to first 100)
-        for element in doc.element.body[:100]:  # Only check first 100 elements to avoid token issues
+        # This ensures we find chapters at the end of long documents
        for element in doc.element.body:
            # Early exit if we have enough headings
            if len(headings) >= 30:
                break
            try:
                if element.tag.endswith('}p'):  # Word paragraph element
                    # Get the text content
@ -2202,7 +2240,9 @@ def main():
        return
    # Run the FastMCP server
-    app.run()
+    # CRITICAL: show_banner=False is required for stdio transport!
    # FastMCP's banner prints ASCII art to stdout which breaks JSON-RPC protocol
    app.run(show_banner=False)
 if __name__ == "__main__":
--- a/tests/test_word_mixin.py
+++ b/tests/test_word_mixin.py
@ -409,5 +409,85 @@ class TestLegacyWordSupport:
                    assert "conversion_method" in result["metadata"]
 class TestPageRangeFiltering:
    """Test page_range content filtering for convert_to_markdown.
    These tests verify that the page_range parameter correctly filters
    content based on either explicit page breaks or estimated paragraph counts.
    """
    @pytest.fixture
    def mixin(self):
        """Create WordMixin for testing."""
        app = FastMCP("Test")
        mixin = WordMixin()
        mixin.register_all(app)
        return mixin
    @pytest.mark.asyncio
    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
    @patch('mcp_office_tools.mixins.word.validate_office_file')
    @patch('mcp_office_tools.mixins.word.detect_format')
    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
        """Test that different page_range values return different content.
        This is the key regression test for the page_range bug where
        include_current_page was set but never used to filter content.
        """
        mock_resolve.return_value = "/test.docx"
        mock_validate.return_value = {"is_valid": True, "errors": []}
        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}
        with patch.object(mixin, '_analyze_document_size') as mock_analyze:
            with patch.object(mixin, '_get_processing_recommendation') as mock_recommend:
                mock_analyze.return_value = {"estimated_pages": 10}
                mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}
                # Create mock conversions that return different content per page
                call_count = [0]
                def mock_convert_side_effect(*args, **kwargs):
                    call_count[0] += 1
                    page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
                    if page_numbers == [1, 2]:
                        return {
                            "content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.",
                            "method_used": "python-docx-custom",
                            "images": [],
                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
                        }
                    elif page_numbers == [10, 11]:
                        return {
                            "content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.",
                            "method_used": "python-docx-custom",
                            "images": [],
                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
                        }
                    else:
                        return {
                            "content": "# Full Content",
                            "method_used": "python-docx-custom",
                            "images": [],
                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20}
                        }
                with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
                    # Test page_range 1-2
                    result_1_2 = await mixin.convert_to_markdown(
                        file_path="/test.docx",
                        page_range="1-2"
                    )
                    # Test page_range 10-11
                    result_10_11 = await mixin.convert_to_markdown(
                        file_path="/test.docx",
                        page_range="10-11"
                    )
                    # The content should be different for different page ranges
                    assert "Page 1-2" in result_1_2["markdown"]
                    assert "Page 10-11" in result_10_11["markdown"]
                    assert result_1_2["markdown"] != result_10_11["markdown"]
 if __name__ == "__main__":
    pytest.main([__file__, "-v"])