🔥 Fix critical issue: page_range was processing the entire document

- Replace unreliable Word page detection with element-based limiting
- Cap extraction at 25 paragraphs per requested 'page' (max 100 total)
- Cap extraction at 8k chars per requested 'page' (max 40k total)
- Add early termination when limits are reached
- Add processing_limits metadata to show actual extraction stats
- Prevent 1.28M-token responses by stopping at reasonable content limits
- Single page (page_range='1') now limited to ~25 paragraphs/8k chars
2025-08-22 08:00:02 -06:00 · 2025-08-22 08:00:02 -06:00 · b2033fc239
commit b2033fc239
parent 431022e113
1 changed file with 65 additions and 28 deletions
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -1260,12 +1260,28 @@ async def _convert_docx_with_python_docx(
                    "markdown_ref": f"![Image {i+1}]({img['filename']})"
                })
-    # Process document elements with page filtering if specified
+    # Process document elements with aggressive content limiting
    # Since Word page detection is unreliable, use element-based limiting
    if page_numbers:
        # For page ranges, severely limit content extraction
        max_pages_requested = max(page_numbers) if page_numbers else 1
        # Rough estimate: ~20-30 paragraphs per page
        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
    else:
        max_paragraphs = 1000  # Large limit for full document
        max_chars = 200000
    current_page = 1
    processed_paragraphs = 0
    total_chars = 0
    include_current_page = not page_numbers or current_page in page_numbers
    table_of_contents = []  # Track headings with page numbers for TOC
    for element in doc.element.body:
        # Early termination if we've processed enough content
        if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
            break
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, doc)
@@ -1275,11 +1291,17 @@ async def _convert_docx_with_python_docx(
                include_current_page = not page_numbers or current_page in page_numbers
                continue
-            # Only process content from specified pages
+            # Process content with strict limits
            if include_current_page:
            markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
            if markdown_text.strip():
                # Check if adding this would exceed limits
                text_length = len(markdown_text)
                if total_chars + text_length > max_chars:
                    break  # Stop processing
                markdown_parts.append(markdown_text)
                processed_paragraphs += 1
                total_chars += text_length
                structure_info["paragraphs"] += 1
                # Track headings for both structure and TOC
@@ -1303,12 +1325,17 @@ async def _convert_docx_with_python_docx(
                    })
        elif isinstance(element, CT_Tbl):
-            # Only process tables from specified pages
+            # Process tables with strict limits
-            if include_current_page:
+            if processed_paragraphs < max_paragraphs and total_chars < max_chars:
                table = Table(element, doc)
                table_markdown = _table_to_markdown(table)
                if table_markdown.strip():
                    table_length = len(table_markdown)
                    if total_chars + table_length > max_chars:
                        break  # Stop processing
                    markdown_parts.append(table_markdown)
                    total_chars += table_length
                    structure_info["tables"] += 1
    # Add image references at the end if any
@@ -1329,6 +1356,16 @@ async def _convert_docx_with_python_docx(
    if table_of_contents:
        result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
    # Add processing limits info
    result["processing_limits"] = {
        "max_paragraphs_allowed": max_paragraphs,
        "max_chars_allowed": max_chars,
        "paragraphs_processed": processed_paragraphs,
        "chars_processed": total_chars,
        "content_truncated": processed_paragraphs >= max_paragraphs or total_chars >= max_chars,
        "note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
    }
    # Add page filtering info
    if page_numbers:
        result["pages_processed"] = page_numbers