🚀 Add ultra-fast summary mode to prevent massive 1M+ token responses
- Bypass all complex processing in summary_only mode
- Extract from only the first 50 paragraphs: max 10 headings, 5 content paragraphs
- Add bookmark detection for chapter navigation hints
- Limit summary content to 2000 chars max
- Prevent 1,282,370 token responses with surgical precision
- Show bookmark names as chapter start indicators
This commit is contained in:
parent
3dffce6904
commit
431022e113
@ -366,14 +366,28 @@ async def convert_to_markdown(
|
|||||||
|
|
||||||
# Add content based on mode
|
# Add content based on mode
|
||||||
if summary_only:
|
if summary_only:
|
||||||
# Only include summary information for large documents
|
# VERY restrictive summary mode to prevent massive responses
|
||||||
result["metadata"]["character_count"] = len(markdown_result["content"])
|
result["metadata"]["character_count"] = len(markdown_result["content"])
|
||||||
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
result["metadata"]["word_count"] = len(markdown_result["content"].split())
|
||||||
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
|
|
||||||
|
|
||||||
# Add table of contents with page ranges for navigation
|
# Ultra-short summary (only 500 chars max)
|
||||||
|
result["summary"] = markdown_result["content"][:500] + "..." if len(markdown_result["content"]) > 500 else markdown_result["content"]
|
||||||
|
|
||||||
|
# Severely limit table of contents to prevent 1M+ token responses
|
||||||
if "table_of_contents" in markdown_result:
|
if "table_of_contents" in markdown_result:
|
||||||
result["table_of_contents"] = markdown_result["table_of_contents"]
|
toc = markdown_result["table_of_contents"]
|
||||||
|
if "sections" in toc and len(toc["sections"]) > 20:
|
||||||
|
# Limit to first 20 sections only
|
||||||
|
limited_toc = {
|
||||||
|
"sections": toc["sections"][:20],
|
||||||
|
"total_sections": len(toc["sections"]),
|
||||||
|
"showing_first": 20,
|
||||||
|
"note": f"Showing first 20 of {len(toc['sections'])} sections. Use page_range to extract specific sections.",
|
||||||
|
"suggested_chunking": toc.get("suggested_chunking", [])[:10] # Limit chunking suggestions too
|
||||||
|
}
|
||||||
|
result["table_of_contents"] = limited_toc
|
||||||
|
else:
|
||||||
|
result["table_of_contents"] = toc
|
||||||
else:
|
else:
|
||||||
# Include content with automatic size limiting to prevent MCP errors
|
# Include content with automatic size limiting to prevent MCP errors
|
||||||
content = markdown_result["content"]
|
content = markdown_result["content"]
|
||||||
@ -1044,6 +1058,10 @@ async def _convert_docx_to_markdown(
|
|||||||
"""Convert .docx file to markdown with comprehensive feature support."""
|
"""Convert .docx file to markdown with comprehensive feature support."""
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
|
# ULTRA-FAST summary mode - skip all complex processing
|
||||||
|
if summary_only:
|
||||||
|
return await _get_ultra_fast_summary(file_path)
|
||||||
|
|
||||||
# If page_numbers is specified, we need to use python-docx for page-based extraction
|
# If page_numbers is specified, we need to use python-docx for page-based extraction
|
||||||
# as mammoth processes the entire document
|
# as mammoth processes the entire document
|
||||||
if page_numbers:
|
if page_numbers:
|
||||||
@ -1539,6 +1557,78 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
|
|||||||
return structure
|
return structure
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
|
||||||
|
"""Ultra-fast summary that extracts minimal data to prevent MCP token limits."""
|
||||||
|
try:
|
||||||
|
import docx
|
||||||
|
doc = docx.Document(file_path)
|
||||||
|
|
||||||
|
# Extract only the first few paragraphs and major headings
|
||||||
|
content_parts = []
|
||||||
|
heading_count = 0
|
||||||
|
paragraph_count = 0
|
||||||
|
max_content_length = 2000 # Very short limit
|
||||||
|
current_length = 0
|
||||||
|
|
||||||
|
# Get basic structure info quickly
|
||||||
|
total_paragraphs = len(doc.paragraphs)
|
||||||
|
total_tables = len(doc.tables)
|
||||||
|
|
||||||
|
# Extract bookmarks (chapter markers)
|
||||||
|
bookmarks = []
|
||||||
|
try:
|
||||||
|
# Access document's bookmarks through the XML
|
||||||
|
for bookmark in doc.element.xpath('//w:bookmarkStart', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
|
||||||
|
bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
|
||||||
|
if bookmark_name and not bookmark_name.startswith('_'): # Skip system bookmarks
|
||||||
|
bookmarks.append(bookmark_name)
|
||||||
|
except Exception:
|
||||||
|
pass # Bookmarks extraction failed, continue without
|
||||||
|
|
||||||
|
# Extract just a few key headings and the start of content
|
||||||
|
for para in doc.paragraphs[:50]: # Only check first 50 paragraphs
|
||||||
|
text = para.text.strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if it's a heading (simple heuristic)
|
||||||
|
is_heading = (para.style and "heading" in para.style.name.lower()) or len(text) < 100
|
||||||
|
|
||||||
|
if is_heading and heading_count < 10: # Max 10 headings
|
||||||
|
content_parts.append(f"# {text}")
|
||||||
|
heading_count += 1
|
||||||
|
current_length += len(text) + 3
|
||||||
|
elif paragraph_count < 5 and current_length < max_content_length: # Max 5 paragraphs
|
||||||
|
content_parts.append(text)
|
||||||
|
paragraph_count += 1
|
||||||
|
current_length += len(text)
|
||||||
|
|
||||||
|
if current_length > max_content_length:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Create very basic summary
|
||||||
|
summary_content = "\n\n".join(content_parts)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"content": summary_content,
|
||||||
|
"method_used": "ultra-fast-summary",
|
||||||
|
"table_of_contents": {
|
||||||
|
"note": "Use full document processing for detailed TOC",
|
||||||
|
"basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
|
||||||
|
"bookmarks": bookmarks[:20] if bookmarks else [], # Limit to first 20 bookmarks
|
||||||
|
"bookmark_count": len(bookmarks),
|
||||||
|
"bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return {
|
||||||
|
"content": f"Error creating summary: {str(e)}",
|
||||||
|
"method_used": "error-fallback",
|
||||||
|
"table_of_contents": {"note": "Summary generation failed"}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _smart_truncate_content(content: str, max_chars: int) -> str:
|
def _smart_truncate_content(content: str, max_chars: int) -> str:
|
||||||
"""Intelligently truncate content while preserving structure and readability."""
|
"""Intelligently truncate content while preserving structure and readability."""
|
||||||
if len(content) <= max_chars:
|
if len(content) <= max_chars:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user