diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py index 2fbaf44..83fe70c 100644 --- a/src/mcp_office_tools/server.py +++ b/src/mcp_office_tools/server.py @@ -291,14 +291,18 @@ async def convert_to_markdown( image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), - page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"), - summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"), + page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"), + summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"), output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')") ) -> dict[str, Any]: - """Convert Office documents to Markdown format with page-range support and structure preservation. + """Convert Office documents to Markdown format with intelligent processing recommendations. - Supports page-based chunking for large documents and summary mode for quick overview. - Use page_range to process specific pages only, or summary_only=true for large documents. + ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages): + 1. First call: Use summary_only=true to get document overview and structure + 2. Then: Use page_range (e.g., "1-10", "15-25") to process specific sections + + This prevents response size errors and provides efficient processing. + Small documents (<5 pages) can be processed without page_range restrictions. """ start_time = time.time() @@ -320,6 +324,12 @@ async def convert_to_markdown( if category != "word": raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}") + # Analyze document size and provide intelligent recommendations + doc_analysis = await _analyze_document_size(local_path, extension) + processing_recommendation = _get_processing_recommendation( + doc_analysis, page_range, summary_only + ) + # Parse page range if provided page_numbers = _parse_page_range(page_range) if page_range else None @@ -343,7 +353,9 @@ async def convert_to_markdown( "format": format_info["format_name"], "conversion_method": markdown_result["method_used"], "conversion_time": round(time.time() - start_time, 3), - "summary_only": summary_only + "summary_only": summary_only, + "document_analysis": doc_analysis, + "processing_recommendation": processing_recommendation } } @@ -1536,6 +1548,128 @@ def _parse_page_range(page_range: str) -> list[int]: return sorted(list(pages)) +async def _analyze_document_size(file_path: str, extension: str) -> dict[str, Any]: + """Analyze document to estimate size and complexity.""" + analysis = { + "estimated_pages": 1, + "file_size_mb": 0, + "complexity": "simple", + "estimated_content_size": "small" + } + + try: + # Get file size + from pathlib import Path + file_size = Path(file_path).stat().st_size + analysis["file_size_mb"] = round(file_size / (1024 * 1024), 2) + + if extension == ".docx": + try: + import docx + doc = docx.Document(file_path) + + # Estimate pages based on content + paragraph_count = len(doc.paragraphs) + table_count = len(doc.tables) + + # Rough estimation: ~40 paragraphs per page + estimated_pages = max(1, paragraph_count // 40) + analysis["estimated_pages"] = estimated_pages + + # Determine complexity + if table_count > 10 or paragraph_count > 500: + analysis["complexity"] = "complex" + elif table_count > 5 or paragraph_count > 200: + analysis["complexity"] = "moderate" + + # Estimate content size + if estimated_pages > 20: + analysis["estimated_content_size"] = "very_large" + elif estimated_pages > 10: + analysis["estimated_content_size"] = "large" + elif estimated_pages > 5: + analysis["estimated_content_size"] = "medium" + + except Exception: + # Fallback to file size estimation + if file_size > 5 * 1024 * 1024: # 5MB + analysis["estimated_pages"] = 50 + analysis["estimated_content_size"] = "very_large" + elif file_size > 1 * 1024 * 1024: # 1MB + analysis["estimated_pages"] = 20 + analysis["estimated_content_size"] = "large" + elif file_size > 500 * 1024: # 500KB + analysis["estimated_pages"] = 10 + analysis["estimated_content_size"] = "medium" + + except Exception: + pass + + return analysis + + +def _get_processing_recommendation( + doc_analysis: dict[str, Any], + page_range: str, + summary_only: bool +) -> dict[str, Any]: + """Generate intelligent processing recommendations based on document analysis.""" + + estimated_pages = doc_analysis["estimated_pages"] + content_size = doc_analysis["estimated_content_size"] + + recommendation = { + "status": "optimal", + "message": "", + "suggested_workflow": [], + "warnings": [] + } + + # Large document recommendations + if content_size in ["large", "very_large"] and not page_range and not summary_only: + recommendation["status"] = "suboptimal" + recommendation["message"] = ( + f"⚠️ Large document detected ({estimated_pages} estimated pages). " + "Consider using recommended workflow for better performance." + ) + recommendation["suggested_workflow"] = [ + "1. First: Call with summary_only=true to get document overview", + "2. Then: Use page_range to process specific sections (e.g., '1-10', '20-30')", + "3. Alternative: Process in chunks of 10-15 pages to avoid response limits" + ] + recommendation["warnings"] = [ + "Full document processing may hit 25k token response limit", + "Large responses may be slow and consume significant resources" + ] + + # Medium document recommendations + elif content_size == "medium" and not page_range and not summary_only: + recommendation["status"] = "caution" + recommendation["message"] = ( + f"Medium document detected ({estimated_pages} estimated pages). " + "Consider summary_only=true first if you encounter response size issues." + ) + recommendation["suggested_workflow"] = [ + "Option 1: Try full processing (current approach)", + "Option 2: Use summary_only=true first, then page_range if needed" + ] + + # Optimal usage patterns + elif summary_only: + recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis." + recommendation["suggested_workflow"] = [ + "After reviewing summary, use page_range to extract specific sections of interest" + ] + + elif page_range and content_size in ["large", "very_large"]: + recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction." + + elif content_size == "small": + recommendation["message"] = "✅ Small document - full processing is optimal." + + return recommendation + + def main(): """Main entry point for the MCP server.""" import sys