🧠 Add intelligent processing recommendations for optimal workflow
- Analyze document size and complexity before processing
- Provide clear workflow recommendations in response metadata
- Strongly recommend summary_only + page_range for large documents (>10 pages)
- Add warning system for suboptimal usage patterns
- Update parameter descriptions with best practice guidance
- Help users avoid 25k token response limits proactively
This commit is contained in:
parent
a485e05759
commit
d94bd39da6
@ -291,14 +291,18 @@ async def convert_to_markdown(
|
||||
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
|
||||
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
|
||||
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
|
||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
|
||||
summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
|
||||
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
|
||||
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
|
||||
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
|
||||
) -> dict[str, Any]:
|
||||
"""Convert Office documents to Markdown format with page-range support and structure preservation.
|
||||
"""Convert Office documents to Markdown format with intelligent processing recommendations.
|
||||
|
||||
Supports page-based chunking for large documents and summary mode for quick overview.
|
||||
Use page_range to process specific pages only, or summary_only=true for large documents.
|
||||
⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages):
|
||||
1. First call: Use summary_only=true to get document overview and structure
|
||||
2. Then: Use page_range (e.g., "1-10", "15-25") to process specific sections
|
||||
|
||||
This prevents response size errors and provides efficient processing.
|
||||
Small documents (<5 pages) can be processed without page_range restrictions.
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
@ -320,6 +324,12 @@ async def convert_to_markdown(
|
||||
if category != "word":
|
||||
raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
|
||||
|
||||
# Analyze document size and provide intelligent recommendations
|
||||
doc_analysis = await _analyze_document_size(local_path, extension)
|
||||
processing_recommendation = _get_processing_recommendation(
|
||||
doc_analysis, page_range, summary_only
|
||||
)
|
||||
|
||||
# Parse page range if provided
|
||||
page_numbers = _parse_page_range(page_range) if page_range else None
|
||||
|
||||
@ -343,7 +353,9 @@ async def convert_to_markdown(
|
||||
"format": format_info["format_name"],
|
||||
"conversion_method": markdown_result["method_used"],
|
||||
"conversion_time": round(time.time() - start_time, 3),
|
||||
"summary_only": summary_only
|
||||
"summary_only": summary_only,
|
||||
"document_analysis": doc_analysis,
|
||||
"processing_recommendation": processing_recommendation
|
||||
}
|
||||
}
|
||||
|
||||
@ -1536,6 +1548,128 @@ def _parse_page_range(page_range: str) -> list[int]:
|
||||
return sorted(list(pages))
|
||||
|
||||
|
||||
async def _analyze_document_size(file_path: str, extension: str) -> dict[str, Any]:
|
||||
"""Analyze document to estimate size and complexity."""
|
||||
analysis = {
|
||||
"estimated_pages": 1,
|
||||
"file_size_mb": 0,
|
||||
"complexity": "simple",
|
||||
"estimated_content_size": "small"
|
||||
}
|
||||
|
||||
try:
|
||||
# Get file size
|
||||
from pathlib import Path
|
||||
file_size = Path(file_path).stat().st_size
|
||||
analysis["file_size_mb"] = round(file_size / (1024 * 1024), 2)
|
||||
|
||||
if extension == ".docx":
|
||||
try:
|
||||
import docx
|
||||
doc = docx.Document(file_path)
|
||||
|
||||
# Estimate pages based on content
|
||||
paragraph_count = len(doc.paragraphs)
|
||||
table_count = len(doc.tables)
|
||||
|
||||
# Rough estimation: ~40 paragraphs per page
|
||||
estimated_pages = max(1, paragraph_count // 40)
|
||||
analysis["estimated_pages"] = estimated_pages
|
||||
|
||||
# Determine complexity
|
||||
if table_count > 10 or paragraph_count > 500:
|
||||
analysis["complexity"] = "complex"
|
||||
elif table_count > 5 or paragraph_count > 200:
|
||||
analysis["complexity"] = "moderate"
|
||||
|
||||
# Estimate content size
|
||||
if estimated_pages > 20:
|
||||
analysis["estimated_content_size"] = "very_large"
|
||||
elif estimated_pages > 10:
|
||||
analysis["estimated_content_size"] = "large"
|
||||
elif estimated_pages > 5:
|
||||
analysis["estimated_content_size"] = "medium"
|
||||
|
||||
except Exception:
|
||||
# Fallback to file size estimation
|
||||
if file_size > 5 * 1024 * 1024: # 5MB
|
||||
analysis["estimated_pages"] = 50
|
||||
analysis["estimated_content_size"] = "very_large"
|
||||
elif file_size > 1 * 1024 * 1024: # 1MB
|
||||
analysis["estimated_pages"] = 20
|
||||
analysis["estimated_content_size"] = "large"
|
||||
elif file_size > 500 * 1024: # 500KB
|
||||
analysis["estimated_pages"] = 10
|
||||
analysis["estimated_content_size"] = "medium"
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return analysis
|
||||
|
||||
|
||||
def _get_processing_recommendation(
|
||||
doc_analysis: dict[str, Any],
|
||||
page_range: str,
|
||||
summary_only: bool
|
||||
) -> dict[str, Any]:
|
||||
"""Generate intelligent processing recommendations based on document analysis."""
|
||||
|
||||
estimated_pages = doc_analysis["estimated_pages"]
|
||||
content_size = doc_analysis["estimated_content_size"]
|
||||
|
||||
recommendation = {
|
||||
"status": "optimal",
|
||||
"message": "",
|
||||
"suggested_workflow": [],
|
||||
"warnings": []
|
||||
}
|
||||
|
||||
# Large document recommendations
|
||||
if content_size in ["large", "very_large"] and not page_range and not summary_only:
|
||||
recommendation["status"] = "suboptimal"
|
||||
recommendation["message"] = (
|
||||
f"⚠️ Large document detected ({estimated_pages} estimated pages). "
|
||||
"Consider using recommended workflow for better performance."
|
||||
)
|
||||
recommendation["suggested_workflow"] = [
|
||||
"1. First: Call with summary_only=true to get document overview",
|
||||
"2. Then: Use page_range to process specific sections (e.g., '1-10', '20-30')",
|
||||
"3. Alternative: Process in chunks of 10-15 pages to avoid response limits"
|
||||
]
|
||||
recommendation["warnings"] = [
|
||||
"Full document processing may hit 25k token response limit",
|
||||
"Large responses may be slow and consume significant resources"
|
||||
]
|
||||
|
||||
# Medium document recommendations
|
||||
elif content_size == "medium" and not page_range and not summary_only:
|
||||
recommendation["status"] = "caution"
|
||||
recommendation["message"] = (
|
||||
f"Medium document detected ({estimated_pages} estimated pages). "
|
||||
"Consider summary_only=true first if you encounter response size issues."
|
||||
)
|
||||
recommendation["suggested_workflow"] = [
|
||||
"Option 1: Try full processing (current approach)",
|
||||
"Option 2: Use summary_only=true first, then page_range if needed"
|
||||
]
|
||||
|
||||
# Optimal usage patterns
|
||||
elif summary_only:
|
||||
recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
|
||||
recommendation["suggested_workflow"] = [
|
||||
"After reviewing summary, use page_range to extract specific sections of interest"
|
||||
]
|
||||
|
||||
elif page_range and content_size in ["large", "very_large"]:
|
||||
recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
|
||||
|
||||
elif content_size == "small":
|
||||
recommendation["message"] = "✅ Small document - full processing is optimal."
|
||||
|
||||
return recommendation
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point for the MCP server."""
|
||||
import sys
|
||||
|
Loading…
x
Reference in New Issue
Block a user