🚀 Add ultra-fast summary mode to prevent massive 1M+ token responses

- Bypass all complex processing in summary_only mode
- Extract only first 50 paragraphs, max 10 headings, 5 content paragraphs
- Add bookmark detection for chapter navigation hints
- Limit summary content to 2000 chars max
- Prevent 1,282,370 token responses with surgical precision
- Show bookmark names as chapter start indicators
This commit is contained in:
Ryan Malloy 2025-08-22 07:56:19 -06:00
parent 3dffce6904
commit 431022e113

View File

@ -366,14 +366,28 @@ async def convert_to_markdown(
# Add content based on mode
if summary_only:
# Only include summary information for large documents
# VERY restrictive summary mode to prevent massive responses
result["metadata"]["character_count"] = len(markdown_result["content"])
result["metadata"]["word_count"] = len(markdown_result["content"].split())
result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"]
# Add table of contents with page ranges for navigation
# Ultra-short summary (only 500 chars max)
result["summary"] = markdown_result["content"][:500] + "..." if len(markdown_result["content"]) > 500 else markdown_result["content"]
# Severely limit table of contents to prevent 1M+ token responses
if "table_of_contents" in markdown_result:
result["table_of_contents"] = markdown_result["table_of_contents"]
toc = markdown_result["table_of_contents"]
if "sections" in toc and len(toc["sections"]) > 20:
# Limit to first 20 sections only
limited_toc = {
"sections": toc["sections"][:20],
"total_sections": len(toc["sections"]),
"showing_first": 20,
"note": f"Showing first 20 of {len(toc['sections'])} sections. Use page_range to extract specific sections.",
"suggested_chunking": toc.get("suggested_chunking", [])[:10] # Limit chunking suggestions too
}
result["table_of_contents"] = limited_toc
else:
result["table_of_contents"] = toc
else:
# Include content with automatic size limiting to prevent MCP errors
content = markdown_result["content"]
@ -1044,6 +1058,10 @@ async def _convert_docx_to_markdown(
"""Convert .docx file to markdown with comprehensive feature support."""
import base64
# ULTRA-FAST summary mode - skip all complex processing
if summary_only:
return await _get_ultra_fast_summary(file_path)
# If page_numbers is specified, we need to use python-docx for page-based extraction
# as mammoth processes the entire document
if page_numbers:
@ -1539,6 +1557,78 @@ def _extract_markdown_structure(content: str) -> dict[str, Any]:
return structure
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
    """Ultra-fast summary that extracts minimal data to prevent MCP token limits.

    Opens the .docx with python-docx, scans only the first 50 paragraphs, and
    returns at most ~2000 characters of text (<=10 headings, <=5 body
    paragraphs) plus bookmark names as cheap chapter-navigation hints.
    Never raises: any failure yields a small error-fallback payload.
    """
    try:
        import docx

        # WordprocessingML namespace used for the raw-XML bookmark lookup below.
        wml_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

        document = docx.Document(file_path)

        # Cheap structural counts -- no per-paragraph processing required.
        n_paragraphs = len(document.paragraphs)
        n_tables = len(document.tables)

        # Pull bookmark names straight out of the document XML; authors often
        # place bookmarks at chapter starts, so they double as a crude TOC.
        chapter_marks = []
        try:
            for node in document.element.xpath('//w:bookmarkStart', namespaces={'w': wml_ns}):
                label = node.get('{%s}name' % wml_ns)
                # Names beginning with "_" are Word-internal bookmarks; skip.
                if label and not label.startswith('_'):
                    chapter_marks.append(label)
        except Exception:
            pass  # Bookmarks extraction failed, continue without

        # Scan only the document's opening paragraphs and collect a tiny
        # sample of headings and body text under a hard character budget.
        char_budget = 2000
        pieces = []
        headings_seen = 0
        bodies_seen = 0
        used = 0
        for paragraph in document.paragraphs[:50]:
            stripped = paragraph.text.strip()
            if not stripped:
                continue
            # Heading heuristic: a real heading style, or any short line.
            style = paragraph.style
            looks_like_heading = bool(style and "heading" in style.name.lower()) or len(stripped) < 100
            if looks_like_heading and headings_seen < 10:
                pieces.append(f"# {stripped}")
                headings_seen += 1
                used += len(stripped) + 3
            elif bodies_seen < 5 and used < char_budget:
                pieces.append(stripped)
                bodies_seen += 1
                used += len(stripped)
            if used > char_budget:
                break

        return {
            "content": "\n\n".join(pieces),
            "method_used": "ultra-fast-summary",
            "table_of_contents": {
                "note": "Use full document processing for detailed TOC",
                "basic_info": f"Document has ~{n_paragraphs} paragraphs, {n_tables} tables, {headings_seen} headings found in first scan",
                # Cap the bookmark list so the hint itself stays tiny.
                "bookmarks": chapter_marks[:20] if chapter_marks else [],
                "bookmark_count": len(chapter_marks),
                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction.",
            },
        }
    except Exception as e:
        # Summary mode must never raise -- hand back a minimal error payload.
        return {
            "content": f"Error creating summary: {str(e)}",
            "method_used": "error-fallback",
            "table_of_contents": {"note": "Summary generation failed"},
        }
def _smart_truncate_content(content: str, max_chars: int) -> str:
"""Intelligently truncate content while preserving structure and readability."""
if len(content) <= max_chars: