From b2033fc239c6587c5850ffbec1b5cff4c8e8fa42 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Fri, 22 Aug 2025 08:00:02 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A5=20Fix=20critical=20issue:=20page?= =?UTF-8?q?=5Frange=20was=20processing=20entire=20document?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace unreliable Word page detection with element-based limiting - Cap extraction at 25 paragraphs times the highest page number requested (max 100 total) - Cap extraction at 8k chars times the highest page number requested (max 40k total) - Add early termination when limits reached - Add processing_limits metadata to show actual extraction stats - Prevent 1.28M token responses by stopping at reasonable content limits - Single page (page_range='1') now limited to ~25 paragraphs/8k chars --- src/mcp_office_tools/server.py | 93 ++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py index 62636e0..b26a578 100644 --- a/src/mcp_office_tools/server.py +++ b/src/mcp_office_tools/server.py @@ -1260,12 +1260,28 @@ async def _convert_docx_with_python_docx( "markdown_ref": f"![Image {i+1}]({img['filename']})" }) - # Process document elements with page filtering if specified + # Process document elements with aggressive content limiting + # Since Word page detection is unreliable, use element-based limiting + if page_numbers: + # For page ranges, severely limit content extraction + max_pages_requested = max(page_numbers) if page_numbers else 1 + # Rough estimate: ~20-30 paragraphs per page + max_paragraphs = min(max_pages_requested * 25, 100) # Cap at 100 paragraphs max + max_chars = min(max_pages_requested * 8000, 40000) # Cap at 40k chars max + else: + max_paragraphs = 1000 # Large limit for full document + max_chars = 200000 + current_page = 1 + processed_paragraphs = 0 + total_chars = 0 include_current_page = not page_numbers or current_page in
page_numbers table_of_contents = [] # Track headings with page numbers for TOC for element in doc.element.body: + # Early termination if we've processed enough content + if processed_paragraphs >= max_paragraphs or total_chars >= max_chars: + break if isinstance(element, CT_P): paragraph = Paragraph(element, doc) @@ -1275,40 +1291,51 @@ async def _convert_docx_with_python_docx( include_current_page = not page_numbers or current_page in page_numbers continue - # Only process content from specified pages - if include_current_page: - markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) - if markdown_text.strip(): - markdown_parts.append(markdown_text) - structure_info["paragraphs"] += 1 + # Process content with strict limits + markdown_text = _paragraph_to_markdown(paragraph, preserve_structure) + if markdown_text.strip(): + # Check if adding this would exceed limits + text_length = len(markdown_text) + if total_chars + text_length > max_chars: + break # Stop processing + + markdown_parts.append(markdown_text) + processed_paragraphs += 1 + total_chars += text_length + structure_info["paragraphs"] += 1 - # Track headings for both structure and TOC - if preserve_structure and markdown_text.startswith('#'): - level = len(markdown_text) - len(markdown_text.lstrip('#')) - heading_text = markdown_text.lstrip('# ').strip() - heading_info = { - "level": level, - "text": heading_text, - "position": len(markdown_parts) - 1, - "page": current_page - } - structure_info["headings"].append(heading_info) - - # Add to table of contents - table_of_contents.append({ - "level": level, - "title": heading_text, - "page": current_page, - "suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}" - }) + # Track headings for both structure and TOC + if preserve_structure and markdown_text.startswith('#'): + level = len(markdown_text) - len(markdown_text.lstrip('#')) + heading_text = markdown_text.lstrip('# ').strip() + heading_info = { + 
"level": level, + "text": heading_text, + "position": len(markdown_parts) - 1, + "page": current_page + } + structure_info["headings"].append(heading_info) + + # Add to table of contents + table_of_contents.append({ + "level": level, + "title": heading_text, + "page": current_page, + "suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}" + }) elif isinstance(element, CT_Tbl): - # Only process tables from specified pages - if include_current_page: + # Process tables with strict limits + if processed_paragraphs < max_paragraphs and total_chars < max_chars: table = Table(element, doc) table_markdown = _table_to_markdown(table) if table_markdown.strip(): + table_length = len(table_markdown) + if total_chars + table_length > max_chars: + break # Stop processing + markdown_parts.append(table_markdown) + total_chars += table_length structure_info["tables"] += 1 # Add image references at the end if any @@ -1329,6 +1356,16 @@ async def _convert_docx_with_python_docx( if table_of_contents: result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents) + # Add processing limits info + result["processing_limits"] = { + "max_paragraphs_allowed": max_paragraphs, + "max_chars_allowed": max_chars, + "paragraphs_processed": processed_paragraphs, + "chars_processed": total_chars, + "content_truncated": processed_paragraphs >= max_paragraphs or total_chars >= max_chars, + "note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars" + } + # Add page filtering info if page_numbers: result["pages_processed"] = page_numbers