🧠 Add intelligent processing recommendations for optimal workflow

- Analyze document size and complexity before processing
- Provide clear workflow recommendations in response metadata
- Strongly recommend summary_only + page_range for large documents (>10 pages)
- Add warning system for suboptimal usage patterns
- Update parameter descriptions with best practice guidance
- Help users avoid 25k token response limits proactively
2025-08-19 13:16:48 -06:00 · 2025-08-19 13:16:48 -06:00 · d94bd39da6
commit d94bd39da6
parent a485e05759
1 changed files with 140 additions and 6 deletions
--- a/src/mcp_office_tools/server.py
+++ b/src/mcp_office_tools/server.py
@ -291,14 +291,18 @@ async def convert_to_markdown(
    image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
    max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
    preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
-    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
-    summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
+    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
+    summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
    output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
-    """Convert Office documents to Markdown format with page-range support and structure preservation.
+    """Convert Office documents to Markdown format with intelligent processing recommendations.
    
-    Supports page-based chunking for large documents and summary mode for quick overview.
-    Use page_range to process specific pages only, or summary_only=true for large documents.
+    ⚠️  RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages):
+    1. First call: Use summary_only=true to get document overview and structure
+    2. Then: Use page_range (e.g., "1-10", "15-25") to process specific sections
+    
+    This prevents response size errors and provides efficient processing.
+    Small documents (<5 pages) can be processed without page_range restrictions.
    """
    start_time = time.time()

@ -320,6 +324,12 @@ async def convert_to_markdown(
        if category != "word":
            raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")

+        # Analyze document size and provide intelligent recommendations
+        doc_analysis = await _analyze_document_size(local_path, extension)
+        processing_recommendation = _get_processing_recommendation(
+            doc_analysis, page_range, summary_only
+        )
+        
        # Parse page range if provided
        page_numbers = _parse_page_range(page_range) if page_range else None
        
@ -343,7 +353,9 @@ async def convert_to_markdown(
                "format": format_info["format_name"],
                "conversion_method": markdown_result["method_used"],
                "conversion_time": round(time.time() - start_time, 3),
-                "summary_only": summary_only
+                "summary_only": summary_only,
+                "document_analysis": doc_analysis,
+                "processing_recommendation": processing_recommendation
            }
        }
        
@ -1536,6 +1548,128 @@ def _parse_page_range(page_range: str) -> list[int]:
    return sorted(list(pages))


+async def _analyze_document_size(file_path: str, extension: str) -> dict[str, Any]:
+    """Estimate a document's page count, size class and complexity.
+
+    Best-effort: a file that cannot be stat'ed or parsed never raises here;
+    the default analysis (1 page, "simple", "small") is returned instead,
+    filled in as far as the analysis got.
+
+    For ``.docx`` files the estimate is derived from the paragraph and table
+    counts reported by the ``docx`` package (python-docx). For every other
+    extension, and whenever ``docx`` is unavailable or cannot open the file,
+    the estimate falls back to file-size thresholds so that a large
+    non-``.docx`` file is not reported as "small".
+
+    Args:
+        file_path: Path of the local document to inspect.
+        extension: File extension including the leading dot (e.g. ".docx");
+            compared case-insensitively.
+
+    Returns:
+        Dict with the keys ``estimated_pages`` (int, at least 1),
+        ``file_size_mb`` (size in MiB rounded to 2 decimals, 0 if unknown),
+        ``complexity`` ("simple", "moderate" or "complex") and
+        ``estimated_content_size`` ("small", "medium", "large" or
+        "very_large").
+    """
+    import logging
+    from pathlib import Path
+
+    log = logging.getLogger(__name__)
+
+    analysis: dict[str, Any] = {
+        "estimated_pages": 1,
+        "file_size_mb": 0,
+        "complexity": "simple",
+        "estimated_content_size": "small",
+    }
+
+    try:
+        file_size = Path(file_path).stat().st_size
+    except (OSError, TypeError, ValueError) as exc:
+        # The analysis is advisory only: report defaults rather than fail the
+        # conversion, but leave a trace instead of swallowing silently.
+        log.debug("Could not stat %r for size analysis: %s", file_path, exc)
+        return analysis
+
+    analysis["file_size_mb"] = round(file_size / (1024 * 1024), 2)
+
+    if (extension or "").lower() == ".docx":
+        try:
+            import docx
+
+            doc = docx.Document(file_path)
+            paragraph_count = len(doc.paragraphs)
+            table_count = len(doc.tables)
+        except Exception as exc:
+            # Broad on purpose: covers a missing ``docx`` install as well as
+            # any failure to open/parse the file. Fall through to the
+            # file-size heuristic below.
+            log.debug("docx analysis failed for %r: %s", file_path, exc)
+        else:
+            # Rough estimation: ~40 paragraphs per page
+            estimated_pages = max(1, paragraph_count // 40)
+            analysis["estimated_pages"] = estimated_pages
+
+            if table_count > 10 or paragraph_count > 500:
+                analysis["complexity"] = "complex"
+            elif table_count > 5 or paragraph_count > 200:
+                analysis["complexity"] = "moderate"
+
+            if estimated_pages > 20:
+                analysis["estimated_content_size"] = "very_large"
+            elif estimated_pages > 10:
+                analysis["estimated_content_size"] = "large"
+            elif estimated_pages > 5:
+                analysis["estimated_content_size"] = "medium"
+
+            return analysis
+
+    # File-size heuristic, largest threshold first; the first match wins.
+    # Files at or below 500 KB keep the "small" / 1-page defaults.
+    size_fallbacks = (
+        (5 * 1024 * 1024, 50, "very_large"),  # > 5MB
+        (1 * 1024 * 1024, 20, "large"),  # > 1MB
+        (500 * 1024, 10, "medium"),  # > 500KB
+    )
+    for min_bytes, pages, size_label in size_fallbacks:
+        if file_size > min_bytes:
+            analysis["estimated_pages"] = pages
+            analysis["estimated_content_size"] = size_label
+            break
+
+    return analysis
+
+
+def _get_processing_recommendation(
+    doc_analysis: dict[str, Any],
+    page_range: str,
+    summary_only: bool,
+) -> dict[str, Any]:
+    """Build a workflow recommendation from the size analysis and call options.
+
+    Args:
+        doc_analysis: Analysis dict; must contain ``estimated_pages`` and
+            ``estimated_content_size`` ("small", "medium", "large" or
+            "very_large").
+        page_range: Requested page range string; empty means none was given.
+        summary_only: Whether the caller asked for summary mode.
+
+    Returns:
+        Dict with ``status`` ("optimal", "caution" or "suboptimal"),
+        ``message`` (str), ``suggested_workflow`` (list of str) and
+        ``warnings`` (list of str).
+
+    Raises:
+        KeyError: If ``doc_analysis`` lacks one of the required keys.
+    """
+    estimated_pages = doc_analysis["estimated_pages"]
+    content_size = doc_analysis["estimated_content_size"]
+
+    is_large = content_size in ("large", "very_large")
+    # True when the caller asked for the whole document in one response.
+    unrestricted = not page_range and not summary_only
+
+    recommendation: dict[str, Any] = {
+        "status": "optimal",
+        "message": "",
+        "suggested_workflow": [],
+        "warnings": [],
+    }
+
+    if is_large and unrestricted:
+        # Large document requested in full: steer towards chunked processing.
+        recommendation["status"] = "suboptimal"
+        recommendation["message"] = (
+            f"⚠️  Large document detected ({estimated_pages} estimated pages). "
+            "Consider using recommended workflow for better performance."
+        )
+        recommendation["suggested_workflow"] = [
+            "1. First: Call with summary_only=true to get document overview",
+            "2. Then: Use page_range to process specific sections (e.g., '1-10', '20-30')",
+            "3. Alternative: Process in chunks of 10-15 pages to avoid response limits",
+        ]
+        recommendation["warnings"] = [
+            "Full document processing may hit 25k token response limit",
+            "Large responses may be slow and consume significant resources",
+        ]
+
+    elif content_size == "medium" and unrestricted:
+        # Medium document requested in full: allowed, but flag the risk.
+        recommendation["status"] = "caution"
+        recommendation["message"] = (
+            f"Medium document detected ({estimated_pages} estimated pages). "
+            "Consider summary_only=true first if you encounter response size issues."
+        )
+        recommendation["suggested_workflow"] = [
+            "Option 1: Try full processing (current approach)",
+            "Option 2: Use summary_only=true first, then page_range if needed",
+        ]
+
+    elif summary_only:
+        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
+        recommendation["suggested_workflow"] = [
+            "After reviewing summary, use page_range to extract specific sections of interest"
+        ]
+
+    elif page_range and (is_large or content_size == "medium"):
+        # Covers medium documents too: limiting this branch to
+        # large/very_large would leave "medium + page_range" matching no
+        # branch and returning an empty message.
+        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
+
+    elif content_size == "small":
+        recommendation["message"] = "✅ Small document - full processing is optimal."
+
+    return recommendation
+
+
 def main():
    """Main entry point for the MCP server."""
    import sys