⚡ Implement true page-range filtering for efficient processing

- Add page break detection using the Word XML structure
- Process only the specified pages instead of converting the full document and truncating the output
- Route page-range requests to python-docx for granular control
- Skip mammoth for page-specific processing (mammoth always processes the full document)
- Add page metadata to results when filtering is used
- Significantly reduce memory usage and response size for large documents
2025-08-19 13:12:19 -06:00 · 2025-08-19 13:12:19 -06:00 · a485e05759
commit a485e05759
parent f884c99bbd
1 changed files with 61 additions and 20 deletions
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@ -1011,8 +1011,16 @@ async def _convert_docx_to_markdown(
    """Convert .docx file to markdown with comprehensive feature support."""
    import base64

+    # If page_numbers is specified, we need to use python-docx for page-based extraction
+    # as mammoth processes the entire document
+    if page_numbers:
+        return await _convert_docx_with_python_docx(
+            file_path, include_images, image_mode, max_image_size,
+            preserve_structure, page_numbers, summary_only, output_dir
+        )
+    
    try:
-        # Try mammoth first for better HTML->Markdown conversion
+        # Try mammoth first for better HTML->Markdown conversion (full document only)
        import mammoth

        # Configure mammoth for markdown-friendly output
@ -1201,31 +1209,45 @@ async def _convert_docx_with_python_docx(
                    "markdown_ref": f"![Image {i+1}]({img['filename']})"
                })

-    # Process document elements
+    # Process document elements with page filtering if specified
+    current_page = 1
+    include_current_page = not page_numbers or current_page in page_numbers
+    
    for element in doc.element.body:
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, doc)
-            markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
-            if markdown_text.strip():
-                markdown_parts.append(markdown_text)
-                structure_info["paragraphs"] += 1
+            
+            # Check for page breaks
+            if _has_page_break(paragraph):
+                current_page += 1
+                include_current_page = not page_numbers or current_page in page_numbers
+                continue
+            
+            # Only process content from specified pages
+            if include_current_page:
+                markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
+                if markdown_text.strip():
+                    markdown_parts.append(markdown_text)
+                    structure_info["paragraphs"] += 1

-                # Track headings
-                if preserve_structure and markdown_text.startswith('#'):
-                    level = len(markdown_text) - len(markdown_text.lstrip('#'))
-                    heading_text = markdown_text.lstrip('# ').strip()
-                    structure_info["headings"].append({
-                        "level": level,
-                        "text": heading_text,
-                        "position": len(markdown_parts) - 1
-                    })
+                    # Track headings
+                    if preserve_structure and markdown_text.startswith('#'):
+                        level = len(markdown_text) - len(markdown_text.lstrip('#'))
+                        heading_text = markdown_text.lstrip('# ').strip()
+                        structure_info["headings"].append({
+                            "level": level,
+                            "text": heading_text,
+                            "position": len(markdown_parts) - 1
+                        })

        elif isinstance(element, CT_Tbl):
-            table = Table(element, doc)
-            table_markdown = _table_to_markdown(table)
-            if table_markdown.strip():
-                markdown_parts.append(table_markdown)
-                structure_info["tables"] += 1
+            # Only process tables from specified pages
+            if include_current_page:
+                table = Table(element, doc)
+                table_markdown = _table_to_markdown(table)
+                if table_markdown.strip():
+                    markdown_parts.append(table_markdown)
+                    structure_info["tables"] += 1

    # Add image references at the end if any
    if include_images and images_info:
@ -1240,6 +1262,11 @@ async def _convert_docx_with_python_docx(
        "method_used": "python-docx-custom",
        "images": images_info
    }
+    
+    # Add page filtering info
+    if page_numbers:
+        result["pages_processed"] = page_numbers
+        result["total_pages_in_range"] = len(page_numbers)

    # Handle summary mode
    if summary_only and len(markdown_content) > 5000:
@ -1464,6 +1491,20 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
    return structure


+def _has_page_break(paragraph) -> bool:
+    """Return True if any run of *paragraph* holds an explicit page break.
+
+    An explicit page break is a WordprocessingML ``w:br`` element whose
+    ``w:type`` attribute equals ``"page"``. Every ``w:br`` found under every
+    run is inspected, so a page break that follows a plain line break inside
+    the same run is still detected.
+
+    Args:
+        paragraph: Object exposing ``runs``; each run must expose its XML
+            element as ``_r``.
+
+    Returns:
+        True when at least one page-type ``w:br`` is found, otherwise False.
+        False is also returned when the paragraph cannot be inspected.
+    """
+    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+    br_tag = f"{w_ns}br"
+    type_attr = f"{w_ns}type"
+    try:
+        # iter() yields all matching descendants; find() would stop at the
+        # first w:br and miss a page break placed after a line break.
+        return any(
+            br.get(type_attr) == "page"
+            for run in paragraph.runs
+            for br in run._r.iter(br_tag)
+        )
+    except Exception:
+        # Deliberate best effort: an uninspectable paragraph is treated as
+        # having no page break rather than aborting the whole conversion.
+        return False
+
+
 def _parse_page_range(page_range: str) -> list[int]:
    """Parse page range string into list of page numbers.