🧠 Add intelligent processing recommendations for optimal workflow

- Analyze document size and complexity before processing
- Provide clear workflow recommendations in response metadata
- Strongly recommend summary_only + page_range for large documents (>10 pages)
- Add warning system for suboptimal usage patterns
- Update parameter descriptions with best practice guidance
- Help users proactively avoid the 25k-token response limit
This commit is contained in:
Ryan Malloy 2025-08-19 13:16:48 -06:00
parent a485e05759
commit d94bd39da6

View File

@ -291,14 +291,18 @@ async def convert_to_markdown(
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
) -> dict[str, Any]:
"""Convert Office documents to Markdown format with page-range support and structure preservation.
"""Convert Office documents to Markdown format with intelligent processing recommendations.
Supports page-based chunking for large documents and summary mode for quick overview.
Use page_range to process specific pages only, or summary_only=true for large documents.
RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages):
1. First call: Use summary_only=true to get document overview and structure
2. Then: Use page_range (e.g., "1-10", "15-25") to process specific sections
This prevents response size errors and provides efficient processing.
Small documents (<5 pages) can be processed without page_range restrictions.
"""
start_time = time.time()
@ -320,6 +324,12 @@ async def convert_to_markdown(
if category != "word":
raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
# Analyze document size and provide intelligent recommendations
doc_analysis = await _analyze_document_size(local_path, extension)
processing_recommendation = _get_processing_recommendation(
doc_analysis, page_range, summary_only
)
# Parse page range if provided
page_numbers = _parse_page_range(page_range) if page_range else None
@ -343,7 +353,9 @@ async def convert_to_markdown(
"format": format_info["format_name"],
"conversion_method": markdown_result["method_used"],
"conversion_time": round(time.time() - start_time, 3),
"summary_only": summary_only
"summary_only": summary_only,
"document_analysis": doc_analysis,
"processing_recommendation": processing_recommendation
}
}
@ -1536,6 +1548,128 @@ def _parse_page_range(page_range: str) -> list[int]:
return sorted(list(pages))
async def _analyze_document_size(file_path: str, extension: str) -> dict[str, Any]:
    """Estimate a document's page count, complexity and content size.

    The analysis is best-effort and never raises: when the file cannot be
    inspected, the conservative defaults below are returned unchanged.

    For ``.docx`` files (matched case-insensitively) the estimate is derived
    from the paragraph and table counts reported by ``python-docx``. For any
    other extension, or when ``python-docx`` is unavailable or fails to open
    the file, the estimate falls back to a coarse heuristic based on the
    file size on disk.

    NOTE: despite being a coroutine, this function performs its file access
    synchronously (there is no ``await`` in the body).

    Args:
        file_path: Path of the local document to inspect.
        extension: File extension including the leading dot (e.g. ``".docx"``).

    Returns:
        A dict with the keys ``estimated_pages`` (int), ``file_size_mb``
        (float, rounded to two decimals), ``complexity`` (``"simple"``,
        ``"moderate"`` or ``"complex"``) and ``estimated_content_size``
        (``"small"``, ``"medium"``, ``"large"`` or ``"very_large"``).
    """
    analysis = {
        "estimated_pages": 1,
        "file_size_mb": 0,
        "complexity": "simple",
        "estimated_content_size": "small"
    }

    try:
        from pathlib import Path
        file_size = Path(file_path).stat().st_size
    except Exception:
        # Deliberately best-effort: an unreadable or missing path must not
        # abort the caller, so the defaults are reported instead.
        return analysis

    analysis["file_size_mb"] = round(file_size / (1024 * 1024), 2)

    if str(extension or "").lower() == ".docx":
        try:
            import docx
            doc = docx.Document(file_path)
            paragraph_count = len(doc.paragraphs)
            table_count = len(doc.tables)
        except Exception:
            # python-docx missing or the file is not a readable .docx:
            # fall through to the file-size heuristic below.
            pass
        else:
            # Rough estimation: ~40 paragraphs per page
            estimated_pages = max(1, paragraph_count // 40)
            analysis["estimated_pages"] = estimated_pages

            # Determine complexity
            if table_count > 10 or paragraph_count > 500:
                analysis["complexity"] = "complex"
            elif table_count > 5 or paragraph_count > 200:
                analysis["complexity"] = "moderate"

            # Estimate content size
            if estimated_pages > 20:
                analysis["estimated_content_size"] = "very_large"
            elif estimated_pages > 10:
                analysis["estimated_content_size"] = "large"
            elif estimated_pages > 5:
                analysis["estimated_content_size"] = "medium"
            return analysis

    # File-size heuristic, used whenever a paragraph-based estimate is not
    # available (non-.docx input, or .docx parsing failed). Without this a
    # large legacy .doc would be reported as a one-page "small" document.
    if file_size > 5 * 1024 * 1024:  # 5MB
        analysis["estimated_pages"] = 50
        analysis["estimated_content_size"] = "very_large"
    elif file_size > 1 * 1024 * 1024:  # 1MB
        analysis["estimated_pages"] = 20
        analysis["estimated_content_size"] = "large"
    elif file_size > 500 * 1024:  # 500KB
        analysis["estimated_pages"] = 10
        analysis["estimated_content_size"] = "medium"

    return analysis
def _get_processing_recommendation(
    doc_analysis: dict[str, Any],
    page_range: str,
    summary_only: bool
) -> dict[str, Any]:
    """Build workflow guidance for a conversion request.

    Args:
        doc_analysis: Size analysis dict; the ``estimated_pages`` and
            ``estimated_content_size`` keys are read.
        page_range: The caller-supplied page range (empty when not used).
        summary_only: Whether the caller asked for summary mode.

    Returns:
        A dict with ``status`` (``"optimal"``, ``"caution"`` or
        ``"suboptimal"``), ``message``, ``suggested_workflow`` and
        ``warnings``.
    """
    pages = doc_analysis["estimated_pages"]
    size_label = doc_analysis["estimated_content_size"]

    is_large = size_label in ("large", "very_large")
    # True when the caller restricted the request in neither way.
    unrestricted = not (page_range or summary_only)

    result = {
        "status": "optimal",
        "message": "",
        "suggested_workflow": [],
        "warnings": []
    }

    # Whole-document processing of a large file: flag as suboptimal.
    if is_large and unrestricted:
        result["status"] = "suboptimal"
        result["message"] = (
            f"⚠️ Large document detected ({pages} estimated pages). "
            "Consider using recommended workflow for better performance."
        )
        result["suggested_workflow"] = [
            "1. First: Call with summary_only=true to get document overview",
            "2. Then: Use page_range to process specific sections (e.g., '1-10', '20-30')",
            "3. Alternative: Process in chunks of 10-15 pages to avoid response limits"
        ]
        result["warnings"] = [
            "Full document processing may hit 25k token response limit",
            "Large responses may be slow and consume significant resources"
        ]
        return result

    # Whole-document processing of a medium file: allowed, with a caution.
    if size_label == "medium" and unrestricted:
        result["status"] = "caution"
        result["message"] = (
            f"Medium document detected ({pages} estimated pages). "
            "Consider summary_only=true first if you encounter response size issues."
        )
        result["suggested_workflow"] = [
            "Option 1: Try full processing (current approach)",
            "Option 2: Use summary_only=true first, then page_range if needed"
        ]
        return result

    # Remaining cases keep the "optimal" status and only differ in message.
    if summary_only:
        result["message"] = "✅ Excellent! Using summary mode for initial document analysis."
        result["suggested_workflow"] = [
            "After reviewing summary, use page_range to extract specific sections of interest"
        ]
        return result

    if page_range and is_large:
        result["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
    elif size_label == "small":
        result["message"] = "✅ Small document - full processing is optimal."

    return result
def main():
"""Main entry point for the MCP server."""
import sys