From b2033fc239c6587c5850ffbec1b5cff4c8e8fa42 Mon Sep 17 00:00:00 2001
From: Ryan Malloy <ryan@supported.systems>
Date: Fri, 22 Aug 2025 08:00:02 -0600
Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A5=20Fix=20critical=20issue:=20page?=
 =?UTF-8?q?=5Frange=20was=20processing=20entire=20document?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Stop filtering by detected Word page (unreliable); limit by element count from the start of the document instead
- Cap extraction at 25 paragraphs per 'page', scaled by the highest page number requested (max 100 total)
- Cap extraction at 8k chars per 'page', scaled by the highest page number requested (max 40k total)
- Add early termination when limits are reached
- Add processing_limits metadata to show actual extraction stats
- Prevent 1.28M-token responses by stopping at reasonable content limits
- Single page (page_range='1') now limited to ~25 paragraphs/8k chars
---
 src/mcp_office_tools/server.py | 93 ++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py
index 62636e0..b26a578 100644
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -1260,12 +1260,28 @@ async def _convert_docx_with_python_docx(
                     "markdown_ref": f"![Image {i+1}]({img['filename']})"
                 })
 
-    # Process document elements with page filtering if specified
+    # Process document elements with aggressive content limiting
+    # Since Word page detection is unreliable, use element-based limiting
+    if page_numbers:
+        # For page ranges, severely limit content extraction
+        max_pages_requested = max(page_numbers) if page_numbers else 1
+        # Rough estimate: ~20-30 paragraphs per page
+        max_paragraphs = min(max_pages_requested * 25, 100)  # Cap at 100 paragraphs max
+        max_chars = min(max_pages_requested * 8000, 40000)  # Cap at 40k chars max
+    else:
+        max_paragraphs = 1000  # Large limit for full document
+        max_chars = 200000
+    
     current_page = 1
+    processed_paragraphs = 0
+    total_chars = 0
     include_current_page = not page_numbers or current_page in page_numbers
     table_of_contents = []  # Track headings with page numbers for TOC
     
     for element in doc.element.body:
+        # Early termination if we've processed enough content
+        if processed_paragraphs >= max_paragraphs or total_chars >= max_chars:
+            break
         if isinstance(element, CT_P):
             paragraph = Paragraph(element, doc)
             
@@ -1275,40 +1291,51 @@ async def _convert_docx_with_python_docx(
                 include_current_page = not page_numbers or current_page in page_numbers
                 continue
             
-            # Only process content from specified pages
-            if include_current_page:
-                markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
-                if markdown_text.strip():
-                    markdown_parts.append(markdown_text)
-                    structure_info["paragraphs"] += 1
+            # Process content with strict limits
+            markdown_text = _paragraph_to_markdown(paragraph, preserve_structure)
+            if markdown_text.strip():
+                # Check if adding this would exceed limits
+                text_length = len(markdown_text)
+                if total_chars + text_length > max_chars:
+                    break  # Stop processing
+                
+                markdown_parts.append(markdown_text)
+                processed_paragraphs += 1
+                total_chars += text_length
+                structure_info["paragraphs"] += 1
 
-                    # Track headings for both structure and TOC
-                    if preserve_structure and markdown_text.startswith('#'):
-                        level = len(markdown_text) - len(markdown_text.lstrip('#'))
-                        heading_text = markdown_text.lstrip('# ').strip()
-                        heading_info = {
-                            "level": level,
-                            "text": heading_text,
-                            "position": len(markdown_parts) - 1,
-                            "page": current_page
-                        }
-                        structure_info["headings"].append(heading_info)
-                        
-                        # Add to table of contents
-                        table_of_contents.append({
-                            "level": level,
-                            "title": heading_text,
-                            "page": current_page,
-                            "suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
-                        })
+                # Track headings for both structure and TOC
+                if preserve_structure and markdown_text.startswith('#'):
+                    level = len(markdown_text) - len(markdown_text.lstrip('#'))
+                    heading_text = markdown_text.lstrip('# ').strip()
+                    heading_info = {
+                        "level": level,
+                        "text": heading_text,
+                        "position": len(markdown_parts) - 1,
+                        "page": current_page
+                    }
+                    structure_info["headings"].append(heading_info)
+                    
+                    # Add to table of contents
+                    table_of_contents.append({
+                        "level": level,
+                        "title": heading_text,
+                        "page": current_page,
+                        "suggested_page_range": f"{current_page}-{current_page + _estimate_section_length(level)}"
+                    })
 
         elif isinstance(element, CT_Tbl):
-            # Only process tables from specified pages
-            if include_current_page:
+            # Process tables with strict limits
+            if processed_paragraphs < max_paragraphs and total_chars < max_chars:
                 table = Table(element, doc)
                 table_markdown = _table_to_markdown(table)
                 if table_markdown.strip():
+                    table_length = len(table_markdown)
+                    if total_chars + table_length > max_chars:
+                        break  # Stop processing
+                    
                     markdown_parts.append(table_markdown)
+                    total_chars += table_length
                     structure_info["tables"] += 1
 
     # Add image references at the end if any
@@ -1329,6 +1356,16 @@ async def _convert_docx_with_python_docx(
     if table_of_contents:
         result["table_of_contents"] = _optimize_toc_page_ranges(table_of_contents)
     
+    # Add processing limits info
+    result["processing_limits"] = {
+        "max_paragraphs_allowed": max_paragraphs,
+        "max_chars_allowed": max_chars,
+        "paragraphs_processed": processed_paragraphs,
+        "chars_processed": total_chars,
+        "content_truncated": processed_paragraphs >= max_paragraphs or total_chars >= max_chars,
+        "note": f"Processed {processed_paragraphs}/{max_paragraphs} paragraphs, {total_chars:,}/{max_chars:,} chars"
+    }
+    
     # Add page filtering info
     if page_numbers:
         result["pages_processed"] = page_numbers