From f884c99bbd3d574a0ae61dbe91555ada9e9d729d Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Mon, 18 Aug 2025 23:32:00 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=AF=20Add=20page-range=20chunking=20an?= =?UTF-8?q?d=20summary=20mode=20for=20large=20documents?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace character-based chunking with page-range support (e.g., '1-5', '1,3,5-10') - Add summary_only mode to prevent large response errors (>25k tokens) - Implement response size limiting with 5000 char truncation in summary mode - Support selective page processing for better memory efficiency - Maintain backward compatibility with existing parameters --- src/mcp_office_tools/server.py | 117 ++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 30 deletions(-) diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py index a293afd..993b483 100644 --- a/src/mcp_office_tools/server.py +++ b/src/mcp_office_tools/server.py @@ -291,13 +291,14 @@ async def convert_to_markdown( image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), - chunk_size: int = Field(default=0, description="Split large documents into chunks (0 = no chunking)"), + page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"), + summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"), output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')") ) -> dict[str, Any]: - """Convert Office documents to Markdown format with image support and structure preservation. + """Convert Office documents to Markdown format with page-range support and structure preservation. - Handles large .docx files efficiently with options for image embedding, file extraction, - and document chunking for very large files. + Supports page-based chunking for large documents and summary mode for quick overview. + Use page_range to process specific pages only, or summary_only=true for large documents. """ start_time = time.time() @@ -319,35 +320,49 @@ async def convert_to_markdown( if category != "word": raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}") + # Parse page range if provided + page_numbers = _parse_page_range(page_range) if page_range else None + # Convert to markdown based on format if extension == ".docx": markdown_result = await _convert_docx_to_markdown( local_path, include_images, image_mode, max_image_size, - preserve_structure, chunk_size, output_dir + preserve_structure, page_numbers, summary_only, output_dir ) else: # .doc # For legacy .doc files, use mammoth if available markdown_result = await _convert_doc_to_markdown( local_path, include_images, image_mode, max_image_size, - preserve_structure, chunk_size, output_dir + preserve_structure, page_numbers, summary_only, output_dir ) + # Build result based on mode result = { - "markdown": markdown_result["content"], "metadata": { "original_file": os.path.basename(local_path), "format": format_info["format_name"], "conversion_method": markdown_result["method_used"], - "character_count": len(markdown_result["content"]), - "word_count": len(markdown_result["content"].split()), - "conversion_time": round(time.time() - start_time, 3) + "conversion_time": round(time.time() - start_time, 3), + "summary_only": summary_only } } + + # Add page range info if used + if page_range: + result["metadata"]["page_range"] = page_range + result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0 - # Add chunking info if applicable - if chunk_size > 0 and markdown_result.get("chunks"): - result["chunks"] = markdown_result["chunks"] - result["metadata"]["chunk_count"] = len(markdown_result["chunks"]) + # Add content based on mode + if summary_only: + # Only include summary information for large documents + result["metadata"]["character_count"] = len(markdown_result["content"]) + result["metadata"]["word_count"] = len(markdown_result["content"].split()) + result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"] + else: + # Include full content for smaller documents or page ranges + result["markdown"] = markdown_result["content"] + result["metadata"]["character_count"] = len(markdown_result["content"]) + result["metadata"]["word_count"] = len(markdown_result["content"].split()) # Add image info if include_images and markdown_result.get("images"): @@ -989,7 +1004,8 @@ async def _convert_docx_to_markdown( image_mode: str, max_image_size: int, preserve_structure: bool, - chunk_size: int, + page_numbers: list[int], + summary_only: bool, output_dir: str ) -> dict[str, Any]: """Convert .docx file to markdown with comprehensive feature support.""" @@ -1092,10 +1108,13 @@ async def _convert_docx_to_markdown( "images": [] } - # Handle chunking if requested - if chunk_size > 0 and len(markdown_content) > chunk_size: - chunks = _chunk_markdown(markdown_content, chunk_size) - conversion_result["chunks"] = chunks + # Handle summary mode + if summary_only and len(markdown_content) > 5000: + # For summary mode, truncate large content + markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]" + + # Update the conversion result + conversion_result["content"] = markdown_content # Extract structure information if preserve_structure: @@ -1108,13 +1127,13 @@ async def _convert_docx_to_markdown( # Fall back to python-docx with custom markdown conversion return await _convert_docx_with_python_docx( file_path, include_images, image_mode, max_image_size, - preserve_structure, chunk_size, output_dir + preserve_structure, page_numbers, summary_only, output_dir ) except Exception: # Fall back to python-docx return await _convert_docx_with_python_docx( file_path, include_images, image_mode, max_image_size, - preserve_structure, chunk_size, output_dir + preserve_structure, page_numbers, summary_only, output_dir ) @@ -1124,7 +1143,8 @@ async def _convert_docx_with_python_docx( image_mode: str, max_image_size: int, preserve_structure: bool, - chunk_size: int, + page_numbers: list[int], + summary_only: bool, output_dir: str ) -> dict[str, Any]: """Convert .docx using python-docx with custom markdown conversion.""" @@ -1221,10 +1241,12 @@ async def _convert_docx_with_python_docx( "images": images_info } - # Handle chunking - if chunk_size > 0 and len(markdown_content) > chunk_size: - chunks = _chunk_markdown(markdown_content, chunk_size) - result["chunks"] = chunks + # Handle summary mode + if summary_only and len(markdown_content) > 5000: + markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]" + + # Update the result content + result["content"] = markdown_content # Add structure info if preserve_structure: @@ -1239,7 +1261,8 @@ async def _convert_doc_to_markdown( image_mode: str, max_image_size: int, preserve_structure: bool, - chunk_size: int, + page_numbers: list[int], + summary_only: bool, output_dir: str ) -> dict[str, Any]: """Convert legacy .doc file to markdown using available methods.""" @@ -1256,9 +1279,12 @@ async def _convert_doc_to_markdown( "images": [] # Legacy .doc image extraction is complex } - if chunk_size > 0 and len(markdown_content) > chunk_size: - chunks = _chunk_markdown(markdown_content, chunk_size) - conversion_result["chunks"] = chunks + # Handle summary mode + if summary_only and len(markdown_content) > 5000: + markdown_content = markdown_content[:5000] + "\n\n[Content truncated - use summary_only=false for full content]" + + # Update the conversion result + conversion_result["content"] = markdown_content if preserve_structure: structure = _extract_markdown_structure(markdown_content) @@ -1438,6 +1464,37 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]: return structure +def _parse_page_range(page_range: str) -> list[int]: + """Parse page range string into list of page numbers. + + Examples: + "1-5" -> [1, 2, 3, 4, 5] + "1,3,5" -> [1, 3, 5] + "1-3,5,7-9" -> [1, 2, 3, 5, 7, 8, 9] + """ + pages = set() + + for part in page_range.split(','): + part = part.strip() + if '-' in part: + # Handle range like "1-5" + start, end = part.split('-', 1) + try: + start_num = int(start.strip()) + end_num = int(end.strip()) + pages.update(range(start_num, end_num + 1)) + except ValueError: + continue + else: + # Handle single page like "3" + try: + pages.add(int(part)) + except ValueError: + continue + + return sorted(list(pages)) + + def main(): """Main entry point for the MCP server.""" import sys