From f884c99bbd3d574a0ae61dbe91555ada9e9d729d Mon Sep 17 00:00:00 2001
From: Ryan Malloy <ryan@supported.systems>
Date: Mon, 18 Aug 2025 23:32:00 -0600
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=AF=20Add=20page-range=20chunking=20an?=
 =?UTF-8?q?d=20summary=20mode=20for=20large=20documents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace character-based chunking with page-range support (e.g., '1-5', '1,3,5-10')
- Add summary_only mode to prevent large response errors (>25k tokens)
- Implement response size limiting with 5000 char truncation in summary mode
- Support selective page processing for better memory efficiency
- Keep the other existing parameters unchanged; chunk_size is replaced by page_range and summary_only
---
 src/mcp_office_tools/server.py | 117 ++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 30 deletions(-)

diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py
index a293afd..993b483 100644
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@@ -291,13 +291,14 @@ async def convert_to_markdown(
     image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
     max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
-    chunk_size: int = Field(default=0, description="Split large documents into chunks (0 = no chunking)"),
+    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
+    summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
     output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
-    """Convert Office documents to Markdown format with image support and structure preservation.
+    """Convert Office documents to Markdown format with page-range support and structure preservation.
     
-    Handles large .docx files efficiently with options for image embedding, file extraction,
-    and document chunking for very large files.
+    Supports page-based chunking for large documents and summary mode for quick overview.
+    Use page_range to process specific pages only, or summary_only=true for large documents.
     """
     start_time = time.time()
 
@@ -319,35 +320,49 @@ async def convert_to_markdown(
         if category != "word":
             raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
 
+        # Parse page range if provided
+        page_numbers = _parse_page_range(page_range) if page_range else None
+        
         # Convert to markdown based on format
         if extension == ".docx":
             markdown_result = await _convert_docx_to_markdown(
                 local_path, include_images, image_mode, max_image_size,
-                preserve_structure, chunk_size, output_dir
+                preserve_structure, page_numbers, summary_only, output_dir
             )
         else:  # .doc
             # For legacy .doc files, use mammoth if available
             markdown_result = await _convert_doc_to_markdown(
                 local_path, include_images, image_mode, max_image_size,
-                preserve_structure, chunk_size, output_dir
+                preserve_structure, page_numbers, summary_only, output_dir
             )
 
+        # Build result based on mode
         result = {
-            "markdown": markdown_result["content"],
             "metadata": {
                 "original_file": os.path.basename(local_path),
                 "format": format_info["format_name"],
                 "conversion_method": markdown_result["method_used"],
-                "character_count": len(markdown_result["content"]),
-                "word_count": len(markdown_result["content"].split()),
-                "conversion_time": round(time.time() - start_time, 3)
+                "conversion_time": round(time.time() - start_time, 3),
+                "summary_only": summary_only
             }
         }
+        
+        # Add page range info if used
+        if page_range:
+            result["metadata"]["page_range"] = page_range
+            result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0
 
-        # Add chunking info if applicable
-        if chunk_size > 0 and markdown_result.get("chunks"):
-            result["chunks"] = markdown_result["chunks"]
-            result["metadata"]["chunk_count"] = len(markdown_result["chunks"])
+        # Add content based on mode
+        if summary_only:
+            # Only include summary information for large documents
+            result["metadata"]["character_count"] = len(markdown_result["content"])
+            result["metadata"]["word_count"] = len(markdown_result["content"].split())
+            result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
+        else:
+            # Include full content for smaller documents or page ranges
+            result["markdown"] = markdown_result["content"]
+            result["metadata"]["character_count"] = len(markdown_result["content"])
+            result["metadata"]["word_count"] = len(markdown_result["content"].split())
 
         # Add image info
         if include_images and markdown_result.get("images"):
@@ -989,7 +1004,8 @@ async def _convert_docx_to_markdown(
     image_mode: str,
     max_image_size: int,
     preserve_structure: bool,
-    chunk_size: int,
+    page_numbers: list[int],
+    summary_only: bool,
     output_dir: str
 ) -> dict[str, Any]:
     """Convert .docx file to markdown with comprehensive feature support."""
@@ -1092,10 +1108,13 @@ async def _convert_docx_to_markdown(
                     "images": []
                 }
 
-            # Handle chunking if requested
-            if chunk_size > 0 and len(markdown_content) > chunk_size:
-                chunks = _chunk_markdown(markdown_content, chunk_size)
-                conversion_result["chunks"] = chunks
+            # Handle summary mode
+            if summary_only and len(markdown_content) > 5000:
+                # For summary mode, truncate large content
+                markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
+            
+            # Update the conversion result
+            conversion_result["content"] = markdown_content
 
             # Extract structure information
             if preserve_structure:
@@ -1108,13 +1127,13 @@ async def _convert_docx_to_markdown(
         # Fall back to python-docx with custom markdown conversion
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, chunk_size, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir
         )
     except Exception:
         # Fall back to python-docx
         return await _convert_docx_with_python_docx(
             file_path, include_images, image_mode, max_image_size,
-            preserve_structure, chunk_size, output_dir
+            preserve_structure, page_numbers, summary_only, output_dir
         )
 
 
@@ -1124,7 +1143,8 @@ async def _convert_docx_with_python_docx(
     image_mode: str,
     max_image_size: int,
     preserve_structure: bool,
-    chunk_size: int,
+    page_numbers: list[int],
+    summary_only: bool,
     output_dir: str
 ) -> dict[str, Any]:
     """Convert .docx using python-docx with custom markdown conversion."""
@@ -1221,10 +1241,12 @@ async def _convert_docx_with_python_docx(
         "images": images_info
     }
 
-    # Handle chunking
-    if chunk_size > 0 and len(markdown_content) > chunk_size:
-        chunks = _chunk_markdown(markdown_content, chunk_size)
-        result["chunks"] = chunks
+    # Handle summary mode
+    if summary_only and len(markdown_content) > 5000:
+        markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
+    
+    # Update the result content
+    result["content"] = markdown_content
 
     # Add structure info
     if preserve_structure:
@@ -1239,7 +1261,8 @@ async def _convert_doc_to_markdown(
     image_mode: str,
     max_image_size: int,
     preserve_structure: bool,
-    chunk_size: int,
+    page_numbers: list[int],
+    summary_only: bool,
     output_dir: str
 ) -> dict[str, Any]:
     """Convert legacy .doc file to markdown using available methods."""
@@ -1256,9 +1279,12 @@ async def _convert_doc_to_markdown(
                 "images": []  # Legacy .doc image extraction is complex
             }
 
-            if chunk_size > 0 and len(markdown_content) > chunk_size:
-                chunks = _chunk_markdown(markdown_content, chunk_size)
-                conversion_result["chunks"] = chunks
+            # Handle summary mode  
+            if summary_only and len(markdown_content) > 5000:
+                markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]"
+            
+            # Update the conversion result
+            conversion_result["content"] = markdown_content
 
             if preserve_structure:
                 structure = _extract_markdown_structure(markdown_content)
@@ -1438,6 +1464,37 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
     return structure
 
 
+def _parse_page_range(page_range: str) -> list[int]:
+    """Parse a page-range string into a sorted list of unique page numbers.
+
+    Comma-separated parts are either a single page ("3") or an inclusive
+    range ("1-5"). Page numbers are treated as 1-based, so values below 1 are
+    dropped. A reversed range such as "5-1" is treated as "1-5". Parts that
+    are not integers are skipped instead of raising (best-effort parsing).
+
+    Examples:
+        "1-5" -> [1, 2, 3, 4, 5]
+        "1,3,5" -> [1, 3, 5]
+        "1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9]
+        "0,5-3,x" -> [3, 4, 5]
+    """
+    pages: set[int] = set()
+
+    for part in page_range.split(','):
+        # partition() leaves `dash` empty for a single page such as "3".
+        start, dash, end = part.strip().partition('-')
+        try:
+            first = int(start)
+            last = int(end) if dash else first
+        except ValueError:
+            continue  # best-effort: ignore malformed parts
+        if first > last:
+            first, last = last, first  # accept "5-1" as "1-5"
+        pages.update(range(max(first, 1), last + 1))  # pages are 1-based
+
+    return sorted(pages)
+
+
 def main():
     """Main entry point for the MCP server."""
     import sys