🧠 Add intelligent processing recommendations for optimal workflow
- Analyze document size and complexity before processing
- Provide clear workflow recommendations in response metadata
- Strongly recommend summary_only + page_range for large documents (>10 pages)
- Add warning system for suboptimal usage patterns
- Update parameter descriptions with best practice guidance
- Help users avoid 25k token response limits proactively
parent a485e05759
commit d94bd39da6
@@ -291,14 +291,18 @@ async def convert_to_markdown(
     image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
     max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
     preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
-    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). Empty = all pages"),
+    page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
-    summary_only: bool = Field(default=False, description="Return only metadata and structure summary (for large docs)"),
+    summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
     output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
 ) -> dict[str, Any]:
-    """Convert Office documents to Markdown format with page-range support and structure preservation.
+    """Convert Office documents to Markdown format with intelligent processing recommendations.
 
-    Supports page-based chunking for large documents and summary mode for quick overview.
-    Use page_range to process specific pages only, or summary_only=true for large documents.
+    ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages):
+    1. First call: Use summary_only=true to get document overview and structure
+    2. Then: Use page_range (e.g., "1-10", "15-25") to process specific sections
+
+    This prevents response size errors and provides efficient processing.
+
+    Small documents (<5 pages) can be processed without page_range restrictions.
     """
     start_time = time.time()
 
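A note on usage (not part of the diff): the new docstring describes a two-step workflow, sketched below as a caller might follow it. Only page_range and summary_only are visible in this hunk; the file_path argument name is an assumption for illustration.

# Sketch of the recommended workflow; file_path is a hypothetical
# argument name, not confirmed by this hunk.
async def process_large_report() -> None:
    # Step 1: summary_only=true returns metadata and structure only,
    # staying well under the 25k token response limit.
    overview = await convert_to_markdown(file_path="report.docx", summary_only=True)

    # Step 2: extract only the sections of interest by page range.
    intro = await convert_to_markdown(file_path="report.docx", page_range="1-10")
    results = await convert_to_markdown(file_path="report.docx", page_range="15-25")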
@@ -320,6 +324,12 @@ async def convert_to_markdown(
     if category != "word":
         raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")
 
+    # Analyze document size and provide intelligent recommendations
+    doc_analysis = await _analyze_document_size(local_path, extension)
+    processing_recommendation = _get_processing_recommendation(
+        doc_analysis, page_range, summary_only
+    )
+
     # Parse page range if provided
     page_numbers = _parse_page_range(page_range) if page_range else None
 
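For context: this hunk calls _parse_page_range, whose body falls outside the diff except for its final return statement (see the last hunk). Given the documented format ('1-5', '3', '1,3,5-10'), a parser along these lines would match; this is a sketch, not the repository's implementation.

# Sketch of a page-range parser consistent with the documented format;
# the real implementation is not shown in this commit.
def _parse_page_range(page_range: str) -> list[int]:
    pages: set[int] = set()
    for part in page_range.split(","):
        part = part.strip()
        if "-" in part:
            # Range segment, e.g. "5-10" expands to 5..10 inclusive.
            start, end = part.split("-", 1)
            pages.update(range(int(start), int(end) + 1))
        else:
            # Single page segment, e.g. "3".
            pages.add(int(part))
    return sorted(list(pages))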
@@ -343,7 +353,9 @@ async def convert_to_markdown(
             "format": format_info["format_name"],
             "conversion_method": markdown_result["method_used"],
             "conversion_time": round(time.time() - start_time, 3),
-            "summary_only": summary_only
+            "summary_only": summary_only,
+            "document_analysis": doc_analysis,
+            "processing_recommendation": processing_recommendation
         }
     }
 
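A note on consuming the new fields (not part of the diff): callers can branch on the recommendation before requesting more content. The enclosing response structure is only partially visible here, so the "metadata" access path below is an assumption.

# Assumption: the entries above sit under result["metadata"]; the
# surrounding structure is not fully shown in this hunk.
rec = result["metadata"]["processing_recommendation"]
if rec["status"] != "optimal":
    print(rec["message"])
    for step in rec["suggested_workflow"]:
        print(step)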
@@ -1536,6 +1548,128 @@ def _parse_page_range(page_range: str) -> list[int]:
     return sorted(list(pages))
 
 
+async def _analyze_document_size(file_path: str, extension: str) -> dict[str, Any]:
+    """Analyze document to estimate size and complexity."""
+    analysis = {
+        "estimated_pages": 1,
+        "file_size_mb": 0,
+        "complexity": "simple",
+        "estimated_content_size": "small"
+    }
+
+    try:
+        # Get file size
+        from pathlib import Path
+        file_size = Path(file_path).stat().st_size
+        analysis["file_size_mb"] = round(file_size / (1024 * 1024), 2)
+
+        if extension == ".docx":
+            try:
+                import docx
+                doc = docx.Document(file_path)
+
+                # Estimate pages based on content
+                paragraph_count = len(doc.paragraphs)
+                table_count = len(doc.tables)
+
+                # Rough estimation: ~40 paragraphs per page
+                estimated_pages = max(1, paragraph_count // 40)
+                analysis["estimated_pages"] = estimated_pages
+
+                # Determine complexity
+                if table_count > 10 or paragraph_count > 500:
+                    analysis["complexity"] = "complex"
+                elif table_count > 5 or paragraph_count > 200:
+                    analysis["complexity"] = "moderate"
+
+                # Estimate content size
+                if estimated_pages > 20:
+                    analysis["estimated_content_size"] = "very_large"
+                elif estimated_pages > 10:
+                    analysis["estimated_content_size"] = "large"
+                elif estimated_pages > 5:
+                    analysis["estimated_content_size"] = "medium"
+
+            except Exception:
+                # Fallback to file size estimation
+                if file_size > 5 * 1024 * 1024:  # 5MB
+                    analysis["estimated_pages"] = 50
+                    analysis["estimated_content_size"] = "very_large"
+                elif file_size > 1 * 1024 * 1024:  # 1MB
+                    analysis["estimated_pages"] = 20
+                    analysis["estimated_content_size"] = "large"
+                elif file_size > 500 * 1024:  # 500KB
+                    analysis["estimated_pages"] = 10
+                    analysis["estimated_content_size"] = "medium"
+
+    except Exception:
+        pass
+
+    return analysis
+
+
+def _get_processing_recommendation(
+    doc_analysis: dict[str, Any],
+    page_range: str,
+    summary_only: bool
+) -> dict[str, Any]:
+    """Generate intelligent processing recommendations based on document analysis."""
+
+    estimated_pages = doc_analysis["estimated_pages"]
+    content_size = doc_analysis["estimated_content_size"]
+
+    recommendation = {
+        "status": "optimal",
+        "message": "",
+        "suggested_workflow": [],
+        "warnings": []
+    }
+
+    # Large document recommendations
+    if content_size in ["large", "very_large"] and not page_range and not summary_only:
+        recommendation["status"] = "suboptimal"
+        recommendation["message"] = (
+            f"⚠️ Large document detected ({estimated_pages} estimated pages). "
+            "Consider using recommended workflow for better performance."
+        )
+        recommendation["suggested_workflow"] = [
+            "1. First: Call with summary_only=true to get document overview",
+            "2. Then: Use page_range to process specific sections (e.g., '1-10', '20-30')",
+            "3. Alternative: Process in chunks of 10-15 pages to avoid response limits"
+        ]
+        recommendation["warnings"] = [
+            "Full document processing may hit 25k token response limit",
+            "Large responses may be slow and consume significant resources"
+        ]
+
+    # Medium document recommendations
+    elif content_size == "medium" and not page_range and not summary_only:
+        recommendation["status"] = "caution"
+        recommendation["message"] = (
+            f"Medium document detected ({estimated_pages} estimated pages). "
+            "Consider summary_only=true first if you encounter response size issues."
+        )
+        recommendation["suggested_workflow"] = [
+            "Option 1: Try full processing (current approach)",
+            "Option 2: Use summary_only=true first, then page_range if needed"
+        ]
+
+    # Optimal usage patterns
+    elif summary_only:
+        recommendation["message"] = "✅ Excellent! Using summary mode for initial document analysis."
+        recommendation["suggested_workflow"] = [
+            "After reviewing summary, use page_range to extract specific sections of interest"
+        ]
+
+    elif page_range and content_size in ["large", "very_large"]:
+        recommendation["message"] = "✅ Perfect! Using page-range processing for efficient extraction."
+
+    elif content_size == "small":
+        recommendation["message"] = "✅ Small document - full processing is optimal."
+
+    return recommendation
+
+
 def main():
     """Main entry point for the MCP server."""
     import sys
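Since _get_processing_recommendation is a pure function over its inputs, its branching is easy to check directly. The expected values below follow from the code as added in this commit.

# Exercising the helper added above with a hand-built analysis dict.
doc_analysis = {
    "estimated_pages": 25,
    "file_size_mb": 3.2,
    "complexity": "complex",
    "estimated_content_size": "very_large",
}

# No page_range or summary_only on a very large doc triggers the warning path.
rec = _get_processing_recommendation(doc_analysis, page_range="", summary_only=False)
assert rec["status"] == "suboptimal"
assert "25k token" in rec["warnings"][0]

# Supplying a page_range flips the same document to an optimal pattern.
rec = _get_processing_recommendation(doc_analysis, page_range="1-10", summary_only=False)
assert rec["status"] == "optimal"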