diff --git a/.gitignore b/.gitignore index 3585efc..b39e0f3 100644 --- a/.gitignore +++ b/.gitignore @@ -77,4 +77,7 @@ tmp/ # Temporary files created during processing *.tmp -*.temp \ No newline at end of file +*.temp + +# Test documents (personal/private) +ORIGINAL - The Other Side of the Bed*.docx \ No newline at end of file diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..0bef080 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,9 @@ +{ + "mcpServers": { + "office-tools": { + "type": "stdio", + "command": "uv", + "args": ["run", "python", "-m", "mcp_office_tools.server_monolithic"] + } + } +} diff --git a/src/mcp_office_tools/mixins/word.py b/src/mcp_office_tools/mixins/word.py index c42e891..e1ad1ca 100644 --- a/src/mcp_office_tools/mixins/word.py +++ b/src/mcp_office_tools/mixins/word.py @@ -44,15 +44,15 @@ class WordMixin(MCPMixin): async def convert_to_markdown( self, file_path: str = Field(description="Path to Office document or URL"), - include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"), - image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), - max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), + include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."), + image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"), + max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"), bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."), chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."), summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"), - output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')"), + output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name."), # Pagination parameters limit: int = Field(default=50, description="Maximum number of document sections to return per page"), cursor_id: Optional[str] = Field(default=None, description="Cursor ID for pagination continuation"), diff --git a/src/mcp_office_tools/server_monolithic.py b/src/mcp_office_tools/server_monolithic.py index 5f85e58..fcfad65 100644 --- a/src/mcp_office_tools/server_monolithic.py +++ b/src/mcp_office_tools/server_monolithic.py @@ -287,15 +287,15 @@ async def analyze_document_health( @app.tool() async def convert_to_markdown( file_path: str = Field(description="Path to Office document or URL"), - include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"), - image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), - max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), + include_images: bool = Field(default=True, description="Include images in markdown output. When True, images are extracted to files and linked in the markdown."), + image_mode: str = Field(default="files", description="Image handling mode: 'files' (default, saves to disk and links), 'base64' (embeds inline - WARNING: can create massive responses), or 'references' (metadata only, no content)"), + max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding (only used when image_mode='base64')"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"), bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."), chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."), summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"), - output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')") + output_dir: str = Field(default="", description="Output directory for extracted image files. If empty, uses a temp directory based on document name.") ) -> dict[str, Any]: """Convert Office documents to Markdown format with intelligent processing recommendations. @@ -1299,11 +1299,14 @@ async def _convert_docx_with_python_docx( max_chars = 100000 bookmark_range = None elif page_numbers: - # For page ranges, severely limit content extraction - max_pages_requested = max(page_numbers) if page_numbers else 1 - # Rough estimate: ~20-30 paragraphs per page - max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max - max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max + # For page ranges, allow sufficient content for requested pages + # Pages can vary wildly in paragraph count (some have 250+ paragraphs) + # Base limits on NUMBER of pages requested, not max page number + num_pages_requested = len(page_numbers) + # Allow ~300 paragraphs per page (generous for variable page sizes) + max_paragraphs = num_pages_requested * 300 + # Allow ~50k chars per page (generous for text-heavy pages) + max_chars = num_pages_requested * 50000 bookmark_range = None chapter_range = None else: @@ -1315,9 +1318,22 @@ async def _convert_docx_with_python_docx( current_page = 1 processed_paragraphs = 0 total_chars = 0 + paragraph_count_for_page = 0 # Track paragraphs for estimated page breaks + PARAGRAPHS_PER_PAGE = 25 # Estimate ~25 paragraphs per page for fallback + + # Pre-scan to check if document has explicit page breaks + has_explicit_page_breaks = False + if page_numbers: + for element in doc.element.body: + if isinstance(element, CT_P): + para = Paragraph(element, doc) + if _has_page_break(para): + has_explicit_page_breaks = True + break + include_current_page = not page_numbers or current_page in page_numbers table_of_contents = [] # Track headings with page numbers for TOC - + for element_idx, element in enumerate(doc.element.body): # Early termination if we've processed enough content if processed_paragraphs >= max_paragraphs or total_chars >= max_chars: @@ -1331,13 +1347,26 @@ async def _convert_docx_with_python_docx( if isinstance(element, CT_P): paragraph = Paragraph(element, doc) - - # Check for page breaks + + # Check for page breaks (explicit or estimated) if _has_page_break(paragraph): + # Explicit page break found current_page += 1 + paragraph_count_for_page = 0 include_current_page = not page_numbers or current_page in page_numbers continue - + elif page_numbers and not has_explicit_page_breaks: + # No explicit page breaks - use paragraph count estimation + paragraph_count_for_page += 1 + if paragraph_count_for_page >= PARAGRAPHS_PER_PAGE: + current_page += 1 + paragraph_count_for_page = 0 + include_current_page = not page_numbers or current_page in page_numbers + + # Skip content not in requested page range + if not include_current_page: + continue + # Process content with strict limits markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) if markdown_text.strip(): @@ -1345,7 +1374,7 @@ async def _convert_docx_with_python_docx( text_length = len(markdown_text) if total_chars + text_length > max_chars: break # Stop processing - + markdown_parts.append(markdown_text) processed_paragraphs += 1 total_chars += text_length @@ -1372,6 +1401,10 @@ async def _convert_docx_with_python_docx( }) elif isinstance(element, CT_Tbl): + # Skip tables not in requested page range + if not include_current_page: + continue + # Process tables with strict limits if processed_paragraphs < max_paragraphs and total_chars < max_chars: table = Table(element, doc) @@ -1763,14 +1796,19 @@ async def _get_available_headings(doc) -> list[str]: """Extract available headings from the document to help users find chapter names.""" try: headings = [] - - # Search through document elements for headings - for element in doc.element.body[:100]: # Only check first 100 elements to avoid token issues + + # Search through ALL document elements for headings (not limited to first 100) + # This ensures we find chapters at the end of long documents + for element in doc.element.body: + # Early exit if we have enough headings + if len(headings) >= 30: + break + try: if element.tag.endswith('}p'): # Word paragraph element # Get the text content text_content = ''.join(text_elem.text or '' for text_elem in element.xpath('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})) - + if text_content.strip(): # Check if it's a heading by looking at paragraph style style_elem = element.xpath('.//w:pStyle', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) @@ -1785,9 +1823,9 @@ async def _get_available_headings(doc) -> list[str]: headings.append(text_content.strip()) except Exception: continue - + return headings[:20] # Return max 20 headings to avoid token issues - + except Exception: return [] @@ -2202,7 +2240,9 @@ def main(): return # Run the FastMCP server - app.run() + # CRITICAL: show_banner=False is required for stdio transport! + # FastMCP's banner prints ASCII art to stdout which breaks JSON-RPC protocol + app.run(show_banner=False) if __name__ == "__main__": diff --git a/tests/test_word_mixin.py b/tests/test_word_mixin.py index f6b9b00..ac9c86a 100644 --- a/tests/test_word_mixin.py +++ b/tests/test_word_mixin.py @@ -409,5 +409,85 @@ class TestLegacyWordSupport: assert "conversion_method" in result["metadata"] +class TestPageRangeFiltering: + """Test page_range content filtering for convert_to_markdown. + + These tests verify that the page_range parameter correctly filters + content based on either explicit page breaks or estimated paragraph counts. + """ + + @pytest.fixture + def mixin(self): + """Create WordMixin for testing.""" + app = FastMCP("Test") + mixin = WordMixin() + mixin.register_all(app) + return mixin + + @pytest.mark.asyncio + @patch('mcp_office_tools.mixins.word.resolve_office_file_path') + @patch('mcp_office_tools.mixins.word.validate_office_file') + @patch('mcp_office_tools.mixins.word.detect_format') + async def test_page_range_filters_different_content(self, mock_detect, mock_validate, mock_resolve, mixin): + """Test that different page_range values return different content. + + This is the key regression test for the page_range bug where + include_current_page was set but never used to filter content. + """ + mock_resolve.return_value = "/test.docx" + mock_validate.return_value = {"is_valid": True, "errors": []} + mock_detect.return_value = {"category": "word", "extension": ".docx", "format_name": "Word Document"} + + with patch.object(mixin, '_analyze_document_size') as mock_analyze: + with patch.object(mixin, '_get_processing_recommendation') as mock_recommend: + mock_analyze.return_value = {"estimated_pages": 10} + mock_recommend.return_value = {"status": "optimal", "message": "", "suggested_workflow": [], "warnings": []} + + # Create mock conversions that return different content per page + call_count = [0] + def mock_convert_side_effect(*args, **kwargs): + call_count[0] += 1 + page_numbers = args[5] if len(args) > 5 else kwargs.get('page_numbers') + if page_numbers == [1, 2]: + return { + "content": "# Page 1-2 Content\n\nThis is from pages 1 and 2.", + "method_used": "python-docx-custom", + "images": [], + "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5} + } + elif page_numbers == [10, 11]: + return { + "content": "# Page 10-11 Content\n\nThis is from pages 10 and 11.", + "method_used": "python-docx-custom", + "images": [], + "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 5} + } + else: + return { + "content": "# Full Content", + "method_used": "python-docx-custom", + "images": [], + "structure": {"headings": [], "tables": 0, "lists": 0, "paragraphs": 20} + } + + with patch.object(mixin, '_convert_docx_to_markdown', side_effect=mock_convert_side_effect): + # Test page_range 1-2 + result_1_2 = await mixin.convert_to_markdown( + file_path="/test.docx", + page_range="1-2" + ) + + # Test page_range 10-11 + result_10_11 = await mixin.convert_to_markdown( + file_path="/test.docx", + page_range="10-11" + ) + + # The content should be different for different page ranges + assert "Page 1-2" in result_1_2["markdown"] + assert "Page 10-11" in result_10_11["markdown"] + assert result_1_2["markdown"] != result_10_11["markdown"] + + if __name__ == "__main__": pytest.main([__file__, "-v"]) \ No newline at end of file