Fix page range extraction for large documents and MCP connection
Bug fixes:
- Remove the 100-paragraph cap that prevented extracting content past ~page 4.
  Limits are now calculated from the number of pages requested (~300 paras/page).
- Add fallback page estimation when docs lack explicit page breaks, using
  ~25 paragraphs per page for navigation in non-paginated docs.
- Fix _get_available_headings to scan the full document (it previously scanned
  only the first 100 elements, so headings like "Chapter 10" at element 1524
  were invisible).
- Fix the MCP connection by disabling the FastMCP banner (show_banner=False);
  the ASCII-art banner was corrupting the stdout JSON-RPC protocol.

Changes:
- Default image_mode changed from 'base64' to 'files' to avoid huge responses.
- Add a proper .mcp.json config in command/args format.
- Add the test document to .gitignore for privacy.
parent 35869b6099
commit 210aa99e0b
.gitignore (vendored, +3)
@@ -78,3 +78,6 @@ tmp/
 # Temporary files created during processing
 *.tmp
 *.temp
+
+# Test documents (personal/private)
+ORIGINAL - The Other Side of the Bed*.docx
.mcp.json (new file, +9)
@@ -0,0 +1,9 @@
+{
+  "mcpServers": {
+    "office-tools": {
+      "type": "stdio",
+      "command": "uv",
+      "args": ["run", "python", "-m", "mcp_office_tools.server_monolithic"]
+    }
+  }
+}
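
For orientation, a hedged sketch of how a stdio MCP client consumes this config: it spawns the command with the given args and exchanges JSON-RPC over the pipes. The initialize payload below is abbreviated and illustrative, not the full MCP handshake.

import json
import subprocess

# Spawn the server exactly as .mcp.json describes: command + args, stdio transport.
proc = subprocess.Popen(
    ["uv", "run", "python", "-m", "mcp_office_tools.server_monolithic"],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)

# Abbreviated initialize request (real clients send protocolVersion, capabilities, etc.).
proc.stdin.write(json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}}) + "\n")
proc.stdin.flush()

# The first line back must be pure JSON, which is exactly what the banner fix below protects.
response = json.loads(proc.stdout.readline())
print(response)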
@@ -44,15 +44,15 @@ class WordMixin(MCPMixin):
     async def convert_to_markdown(
         self,
         file_path: str = Field(description="Path to Office document or URL"),
-        include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
-        image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
-        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
+        include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
+        image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
+        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
         preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
         page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
         bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
         chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
         summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
-        output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')"),
+        output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."),
         # Pagination parameters
         limit: int = Field(default=50, description="Maximum number of document sections to return per page"),
         cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"),
@@ -287,15 +287,15 @@ async def analyze_document_health(
 @app.tool()
 async def convert_to_markdown(
     file_path: str = Field(description="Path to Office document or URL"),
-    include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
-    image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
-    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
+    include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."),
+    image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"),
+    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
     page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
     bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
     chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
     summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
-    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
+    output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name.")
 ) -> dict[str, Any]:
     """Convert Office documents to Markdown format with intelligent processing recommendations.
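
The default flip from 'base64' to 'files' is easiest to justify with back-of-envelope arithmetic: base64 encodes every 3 input bytes as 4 output characters, so a single image near the 1 MB max_image_size default adds roughly 1.4 million characters to the tool response.

# Base64 output size: ceil(n / 3) * 4 characters for n input bytes (~33% overhead).
image_bytes = 1024 * 1024                # one image at the max_image_size default
base64_chars = -(-image_bytes // 3) * 4  # ceiling division via the negation trick
print(base64_chars)                      # 1398104 characters embedded in the markdown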
@@ -1299,11 +1299,14 @@ async def _convert_docx_with_python_docx(
         max_chars = 100000
         bookmark_range = None
     elif page_numbers:
-        # For page ranges, severely limit content extraction
-        max_pages_requested = max(page_numbers) if page_numbers else 1
-        # Rough estimate: ~20-30 paragraphs per page
-        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
-        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
+        # For page ranges, allow sufficient content for requested pages
+        # Pages can vary wildly in paragraph count (some have 250+ paragraphs)
+        # Base limits on NUMBER of pages requested, not max page number
+        num_pages_requested = len(page_numbers)
+        # Allow ~300 paragraphs per page (generous for variable page sizes)
+        max_paragraphs = num_pages_requested * 300
+        # Allow ~50k chars per page (generous for text-heavy pages)
+        max_chars = num_pages_requested * 50000
         bookmark_range = None
         chapter_range = None
     else:
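
A worked example of the arithmetic this hunk changes (helper names are hypothetical; the numbers come from the diff). The old cap of min(max_page * 25, 100) meant the paragraph budget was exhausted after 100 / 25 = 4 pages, so requests for later pages returned nothing new:

# Old: limits keyed to the HIGHEST page number, hard-capped at 100 paragraphs.
def old_max_paragraphs(page_numbers: list[int]) -> int:
    max_page = max(page_numbers) if page_numbers else 1
    return min(max_page * 25, 100)

# New: limits keyed to HOW MANY pages were requested, with no hard cap.
def new_max_paragraphs(page_numbers: list[int]) -> int:
    return len(page_numbers) * 300

print(old_max_paragraphs([10, 11]))  # 100 -> budget gone long before page 10
print(new_max_paragraphs([10, 11]))  # 600 -> ample room for the two requested pages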
@@ -1315,6 +1318,19 @@ async def _convert_docx_with_python_docx(
     current_page = 1
     processed_paragraphs = 0
     total_chars = 0
+    paragraph_count_for_page = 0  # Track paragraphs for estimated page breaks
+    PARAGRAPHS_PER_PAGE = 25  # Estimate ~25 paragraphs per page for fallback
+
+    # Pre-scan to check if document has explicit page breaks
+    has_explicit_page_breaks = False
+    if page_numbers:
+        for element in doc.element.body:
+            if isinstance(element, CT_P):
+                para = Paragraph(element, doc)
+                if _has_page_break(para):
+                    has_explicit_page_breaks = True
+                    break
+
     include_current_page = not page_numbers or current_page in page_numbers
     table_of_contents = []  # Track headings with page numbers for TOC
@@ -1332,11 +1348,24 @@ async def _convert_docx_with_python_docx(
         if isinstance(element, CT_P):
             paragraph = Paragraph(element, doc)

-            # Check for page breaks
+            # Check for page breaks (explicit or estimated)
             if _has_page_break(paragraph):
+                # Explicit page break found
                 current_page += 1
+                paragraph_count_for_page = 0
                 include_current_page = not page_numbers or current_page in page_numbers
                 continue
+            elif page_numbers and not has_explicit_page_breaks:
+                # No explicit page breaks - use paragraph count estimation
+                paragraph_count_for_page += 1
+                if paragraph_count_for_page >= PARAGRAPHS_PER_PAGE:
+                    current_page += 1
+                    paragraph_count_for_page = 0
+                    include_current_page = not page_numbers or current_page in page_numbers

             # Skip content not in requested page range
             if not include_current_page:
                 continue

             # Process content with strict limits
             markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
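
The fallback estimation above reduces to a simple closed form; a standalone sketch (not the project's code) of the paragraph-to-page mapping it implies:

PARAGRAPHS_PER_PAGE = 25  # same estimate the diff uses for non-paginated docs

def estimated_page(paragraph_index: int) -> int:
    """Map a 0-based paragraph index to a 1-based estimated page number."""
    return paragraph_index // PARAGRAPHS_PER_PAGE + 1

assert estimated_page(0) == 1    # paragraphs 0-24 land on estimated page 1
assert estimated_page(24) == 1
assert estimated_page(25) == 2   # paragraph 25 opens estimated page 2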
@@ -1372,6 +1401,10 @@ async def _convert_docx_with_python_docx(
                 })

         elif isinstance(element, CT_Tbl):
+            # Skip tables not in requested page range
+            if not include_current_page:
+                continue
+
             # Process tables with strict limits
             if processed_paragraphs < max_paragraphs and total_chars < max_chars:
                 table = Table(element, doc)
@@ -1764,8 +1797,13 @@ async def _get_available_headings(doc) -> list[str]:
     try:
         headings = []

-        # Search through document elements for headings
-        for element in doc.element.body[:100]:  # Only check first 100 elements to avoid token issues
+        # Search through ALL document elements for headings (not limited to first 100)
+        # This ensures we find chapters at the end of long documents
+        for element in doc.element.body:
+            # Early exit if we have enough headings
+            if len(headings) >= 30:
+                break
+
             try:
                 if element.tag.endswith('}p'):  # Word paragraph element
                     # Get the text content
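
The same full-scan-with-early-exit shape, sketched against python-docx's public API for comparison (the project walks raw doc.element.body XML instead; this variant is illustrative, not a drop-in):

from docx import Document

def available_headings(path: str, max_headings: int = 30) -> list[str]:
    """Collect up to max_headings heading texts from the ENTIRE document."""
    headings = []
    for para in Document(path).paragraphs:    # no [:100] slice: scan everything
        if len(headings) >= max_headings:     # early exit keeps the response small
            break
        if para.style.name.startswith("Heading") and para.text.strip():
            headings.append(para.text.strip())
    return headings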
@@ -2202,7 +2240,9 @@ def main():
         return

-    # Run the FastMCP server
-    app.run()
+    # CRITICAL: show_banner=False is required for stdio transport!
+    # FastMCP's banner prints ASCII art to stdout which breaks JSON-RPC protocol
+    app.run(show_banner=False)


 if __name__ == "__main__":
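
To make the failure mode concrete: a stdio client reads stdout line by line and parses each line as JSON-RPC, so a single line of ASCII art aborts the handshake. A minimal illustration:

import json

protocol_line = '{"jsonrpc": "2.0", "id": 1, "result": {}}'
banner_line = "  _____         _   __  __  ____  "  # ASCII art, not JSON

json.loads(protocol_line)  # parses fine
try:
    json.loads(banner_line)
except json.JSONDecodeError:
    print("client sees non-JSON on stdout and drops the connection")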
@@ -409,5 +409,85 @@ class TestLegacyWordSupport:
         assert "conversion_method" in result["metadata"]


+class TestPageRangeFiltering:
+    """Test page_range content filtering for convert_to_markdown.
+
+    These tests verify that the page_range parameter correctly filters
+    content based on either explicit page breaks or estimated paragraph counts.
+    """
+
+    @pytest.fixture
+    def mixin(self):
+        """Create WordMixin for testing."""
+        app = FastMCP("Test")
+        mixin = WordMixin()
+        mixin.register_all(app)
+        return mixin
+
+    @pytest.mark.asyncio
+    @patch('mcp_office_tools.mixins.word.resolve_office_file_path')
+    @patch('mcp_office_tools.mixins.word.validate_office_file')
+    @patch('mcp_office_tools.mixins.word.detect_format')
+    async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin):
+        """Test that different page_range values return different content.
+
+        This is the key regression test for the page_range bug where
+        include_current_page was set but never used to filter content.
+        """
+        mock_resolve.return_value = "/test.docx"
+        mock_validate.return_value = {"is_valid": True, "errors": []}
+        mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"}
+
+        with patch.object(mixin, '_analyze_document_size') as mock_analyze:
+            with patch.object(mixin, '_get_processing_recommendation') as mock_recommend:
+                mock_analyze.return_value = {"estimated_pages": 10}
+                mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []}
+
+                # Create mock conversions that return different content per page
+                call_count = [0]
+                def mock_convert_side_effect(*args, **kwargs):
+                    call_count[0] += 1
+                    page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers')
+                    if page_numbers == [1, 2]:
+                        return {
+                            "content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.",
+                            "method_used": "python-docx-custom",
+                            "images": [],
+                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
+                        }
+                    elif page_numbers == [10, 11]:
+                        return {
+                            "content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.",
+                            "method_used": "python-docx-custom",
+                            "images": [],
+                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5}
+                        }
+                    else:
+                        return {
+                            "content": "# Full Content",
+                            "method_used": "python-docx-custom",
+                            "images": [],
+                            "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20}
+                        }
+
+                with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect):
+                    # Test page_range 1-2
+                    result_1_2 = await mixin.convert_to_markdown(
+                        file_path="/test.docx",
+                        page_range="1-2"
+                    )
+
+                    # Test page_range 10-11
+                    result_10_11 = await mixin.convert_to_markdown(
+                        file_path="/test.docx",
+                        page_range="10-11"
+                    )
+
+                # The content should be different for different page ranges
+                assert "Page 1-2" in result_1_2["markdown"]
+                assert "Page 10-11" in result_10_11["markdown"]
+                assert result_1_2["markdown"] != result_10_11["markdown"]
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])