🧠 Add intelligent processing recommendations for optimal workflow
- Analyze document size and complexity before processing
- Provide clear workflow recommendations in response metadata
- Strongly recommend summary_only + page_range for large documents (>10 pages)
- Add warning system for suboptimal usage patterns
- Update parameter descriptions with best practice guidance
- Help users avoid 25k token response limits proactively
parent a485e05759
commit d94bd39da6
@@ -291,14 +291,18 @@ async def convert_to_markdown(
     image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
     max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
-    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
+    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
-    summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
+    summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
     output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
-    """Convert Office documents to Markdown format with page-range support and structure preservation.
+    """Convert Office documents to Markdown format with intelligent processing recommendations.
 
-    Supports page-based chunking for large documents and summary mode for quick overview.
-    Use page_range to process specific pages only, or summary_only=true for large documents.
+    ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages):
+    1. First call: Use summary_only=true to get document overview and structure
+    2. Then: Use page_range (e.g., "1-10", "15-25") to process specific sections
+
+    This prevents response size errors and provides efficient processing.
+
+    Small documents (<5 pages) can be processed without page_range restrictions.
     """
     start_time = time.time()
 
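A note on usage (not part of the diff): the new docstring describes a two-step workflow, sketched below as a caller might follow it. Only page_range and summary_only are visible in this hunk; the file_path argument name is an assumption for illustration.

# Sketch of the recommended workflow; file_path is a hypothetical
# argument name, not confirmed by this hunk.
async def process_large_report() -> None:
    # Step 1: summary_only=true returns metadata and structure only,
    # staying well under the 25k token response limit.
    overview = await convert_to_markdown(file_path="report.docx", summary_only=True)

    # Step 2: extract only the sections of interest by page range.
    intro = await convert_to_markdown(file_path="report.docx", page_range="1-10")
    results = await convert_to_markdown(file_path="report.docx", page_range="15-25")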
@@ -320,6 +324,12 @@ async def convert_to_markdown(
     if category != "word":
         raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
 
+    # Analyze document size and provide intelligent recommendations
+    doc_analysis = await _analyze_document_size(local_path, extension)
+    processing_recommendation = _get_processing_recommendation(
+        doc_analysis, page_range, summary_only
+    )
+
     # Parse page range if provided
     page_numbers = _parse_page_range(page_range) if page_range else None
 
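For context: this hunk calls _parse_page_range, whose body falls outside the diff except for its final return statement (see the last hunk). Given the documented format ('1-5', '3', '1,3,5-10'), a parser along these lines would match; this is a sketch, not the repository's implementation.

# Sketch of a page-range parser consistent with the documented format;
# the real implementation is not shown in this commit.
def _parse_page_range(page_range: str) -> list[int]:
    pages: set[int] = set()
    for part in page_range.split(","):
        part = part.strip()
        if "-" in part:
            # Range segment, e.g. "5-10" expands to 5..10 inclusive.
            start, end = part.split("-", 1)
            pages.update(range(int(start), int(end) + 1))
        else:
            # Single page segment, e.g. "3".
            pages.add(int(part))
    return sorted(list(pages))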
@@ -343,7 +353,9 @@ async def convert_to_markdown(
             "format": format_info["format_name"],
             "conversion_method": markdown_result["method_used"],
             "conversion_time": round(time.time() - start_time, 3),
-            "summary_only": summary_only
+            "summary_only": summary_only,
+            "document_analysis": doc_analysis,
+            "processing_recommendation": processing_recommendation
         }
     }
 
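A note on consuming the new fields (not part of the diff): callers can branch on the recommendation before requesting more content. The enclosing response structure is only partially visible here, so the "metadata" access path below is an assumption.

# Assumption: the entries above sit under result["metadata"]; the
# surrounding structure is not fully shown in this hunk.
rec = result["metadata"]["processing_recommendation"]
if rec["status"] != "optimal":
    print(rec["message"])
    for step in rec["suggested_workflow"]:
        print(step)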
@@ -1536,6 +1548,128 @@ def _parse_page_range(page_range: str) -> list[int]:
     return sorted(list(pages))
 
 
+async def _analyze_document_size(file_path: str, extension: str) -> dict[str, Any]:
+    """Analyze document to estimate size and complexity."""
+    analysis = {
+        "estimated_pages": 1,
+        "file_size_mb": 0,
+        "complexity": "simple",
+        "estimated_content_size": "small"
+    }
+
+    try:
+        # Get file size
+        from pathlib import Path
+        file_size = Path(file_path).stat().st_size
+        analysis["file_size_mb"] = round(file_size / (1024 * 1024), 2)
+
+        if extension == ".docx":
+            try:
+                import docx
+                doc = docx.Document(file_path)
+
+                # Estimate pages based on content
+                paragraph_count = len(doc.paragraphs)
+                table_count = len(doc.tables)
+
+                # Rough estimation: ~40 paragraphs per page
+                estimated_pages = max(1, paragraph_count // 40)
+                analysis["estimated_pages"] = estimated_pages
+
+                # Determine complexity
+                if table_count > 10 or paragraph_count > 500:
+                    analysis["complexity"] = "complex"
+                elif table_count > 5 or paragraph_count > 200:
+                    analysis["complexity"] = "moderate"
+
+                # Estimate content size
+                if estimated_pages > 20:
+                    analysis["estimated_content_size"] = "very_large"
+                elif estimated_pages > 10:
+                    analysis["estimated_content_size"] = "large"
+                elif estimated_pages > 5:
+                    analysis["estimated_content_size"] = "medium"
+
+            except Exception:
+                # Fallback to file size estimation
+                if file_size > 5 * 1024 * 1024:  # 5MB
+                    analysis["estimated_pages"] = 50
+                    analysis["estimated_content_size"] = "very_large"
+                elif file_size > 1 * 1024 * 1024:  # 1MB
+                    analysis["estimated_pages"] = 20
+                    analysis["estimated_content_size"] = "large"
+                elif file_size > 500 * 1024:  # 500KB
+                    analysis["estimated_pages"] = 10
+                    analysis["estimated_content_size"] = "medium"
+
+    except Exception:
+        pass
+
+    return analysis
+
+
+def _get_processing_recommendation(
+    doc_analysis: dict[str, Any],
+    page_range: str,
+    summary_only: bool
+) -> dict[str, Any]:
+    """Generate intelligent processing recommendations based on document analysis."""
+
+    estimated_pages = doc_analysis["estimated_pages"]
+    content_size = doc_analysis["estimated_content_size"]
+
+    recommendation = {
+        "status": "optimal",
+        "message": "",
+        "suggested_workflow": [],
+        "warnings": []
+    }
+
+    # Large document recommendations
+    if content_size in ["large", "very_large"] and not page_range and not summary_only:
+        recommendation["status"] = "suboptimal"
+        recommendation["message"] = (
+            f"⚠️ Large document detected ({estimated_pages} estimated pages). "
+            "Consider using recommended workflow for better performance."
+        )
+        recommendation["suggested_workflow"] = [
+            "1. First: Call with summary_only=true to get document overview",
+            "2. Then: Use page_range to process specific sections (e.g., '1-10', '20-30')",
+            "3. Alternative: Process in chunks of 10-15 pages to avoid response limits"
+        ]
+        recommendation["warnings"] = [
+            "Full document processing may hit 25k token response limit",
+            "Large responses may be slow and consume significant resources"
+        ]
+
+    # Medium document recommendations
+    elif content_size == "medium" and not page_range and not summary_only:
+        recommendation["status"] = "caution"
+        recommendation["message"] = (
+            f"Medium document detected ({estimated_pages} estimated pages). "
+            "Consider summary_only=true first if you encounter response size issues."
+        )
+        recommendation["suggested_workflow"] = [
+            "Option 1: Try full processing (current approach)",
+            "Option 2: Use summary_only=true first, then page_range if needed"
+        ]
+
+    # Optimal usage patterns
+    elif summary_only:
+        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
+        recommendation["suggested_workflow"] = [
+            "After reviewing summary, use page_range to extract specific sections of interest"
+        ]
+
+    elif page_range and content_size in ["large", "very_large"]:
+        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
+
+    elif content_size == "small":
+        recommendation["message"] = "✅ Small document - full processing is optimal."
+
+    return recommendation
+
+
 def main():
     """Main entry point for the MCP server."""
     import sys
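Since _get_processing_recommendation is a pure function over its inputs, its branching is easy to check directly. The expected values below follow from the code as added in this commit.

# Exercising the helper added above with a hand-built analysis dict.
doc_analysis = {
    "estimated_pages": 25,
    "file_size_mb": 3.2,
    "complexity": "complex",
    "estimated_content_size": "very_large",
}

# No page_range or summary_only on a very large doc triggers the warning path.
rec = _get_processing_recommendation(doc_analysis, page_range="", summary_only=False)
assert rec["status"] == "suboptimal"
assert "25k token" in rec["warnings"][0]

# Supplying a page_range flips the same document to an optimal pattern.
rec = _get_processing_recommendation(doc_analysis, page_range="1-10", summary_only=False)
assert rec["status"] == "optimal"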