From 431022e113d618fad92af86efff4d337c3848ec9 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Fri, 22 Aug 2025 07:56:19 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=80=20Add=20ultra-fast=20summary=20mod?= =?UTF-8?q?e=20to=20prevent=20massive=201M+=20token=20responses?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bypass all complex processing in summary_only mode - Extract only first 50 paragraphs, max 10 headings, 5 content paragraphs - Add bookmark detection for chapter navigation hints - Limit summary content to 2000 chars max - Prevent 1,282,370 token responses with surgical precision - Show bookmark names as chapter start indicators --- src/mcp_office_tools/server.py | 98 ++++++++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 4 deletions(-) diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py index fea7319..62636e0 100644 --- a/src/mcp_office_tools/server.py +++ b/src/mcp_office_tools/server.py @@ -366,14 +366,28 @@ async def convert_to_markdown( # Add content based on mode if summary_only: - # Only include summary information for large documents + # VERY restrictive summary mode to prevent massive responses result["metadata"]["character_count"] = len(markdown_result["content"]) result["metadata"]["word_count"] = len(markdown_result["content"].split()) - result["summary"] = markdown_result["content"][:1000] + "..." if len(markdown_result["content"]) > 1000 else markdown_result["content"] - # Add table of contents with page ranges for navigation + # Ultra-short summary (only 500 chars max) + result["summary"] = markdown_result["content"][:500] + "..." 
async def _get_ultra_fast_summary(file_path: str) -> dict[str, Any]:
    """Build a minimal summary of a .docx file without full conversion.

    Scans only the first 50 paragraphs, keeps at most 10 headings and 5 body
    paragraphs (capped at roughly 2000 characters total), and collects
    bookmark names as chapter-navigation hints.  Used by summary_only mode to
    keep responses far below MCP token limits.

    Args:
        file_path: Path to the .docx file on disk.

    Returns:
        Dict with "content" (short markdown-ish summary), "method_used", and
        a "table_of_contents" dict holding basic counts plus bookmark hints.
        On any failure a small error-fallback dict is returned instead of
        raising, so summary mode never crashes the caller.
    """
    try:
        import docx

        doc = docx.Document(file_path)

        content_parts: list[str] = []
        heading_count = 0
        paragraph_count = 0
        max_content_length = 2000  # hard cap on summary text
        current_length = 0

        # Cheap whole-document stats -- no per-paragraph processing needed.
        total_paragraphs = len(doc.paragraphs)
        total_tables = len(doc.tables)

        # Bookmarks frequently mark chapter starts; collect their names as
        # navigation hints for later page_range extraction.
        bookmarks = []
        try:
            # python-docx oxml elements pre-register the "w:" prefix and their
            # xpath() takes ONLY the expression -- passing a namespaces= kwarg
            # raised TypeError, which the except below swallowed, so bookmarks
            # were always empty.  Call without the kwarg.
            for bookmark in doc.element.xpath('//w:bookmarkStart'):
                bookmark_name = bookmark.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name')
                if bookmark_name and not bookmark_name.startswith('_'):  # skip system bookmarks
                    bookmarks.append(bookmark_name)
        except Exception:
            pass  # best-effort: continue without bookmarks

        # Pull a handful of headings plus the opening body paragraphs.
        for para in doc.paragraphs[:50]:  # only scan the first 50 paragraphs
            text = para.text.strip()
            if not text:
                continue

            # Decide by paragraph style, not text length: the previous
            # "len(text) < 100" heuristic misclassified short body sentences
            # as headings and rendered them with a leading "#".
            is_heading = bool(para.style and "heading" in para.style.name.lower())

            if is_heading and heading_count < 10:  # max 10 headings
                content_parts.append(f"# {text}")
                heading_count += 1
                current_length += len(text) + 3
            elif paragraph_count < 5 and current_length < max_content_length:  # max 5 paragraphs
                content_parts.append(text)
                paragraph_count += 1
                current_length += len(text)

            if current_length > max_content_length:
                break

        summary_content = "\n\n".join(content_parts)

        return {
            "content": summary_content,
            "method_used": "ultra-fast-summary",
            "table_of_contents": {
                "note": "Use full document processing for detailed TOC",
                "basic_info": f"Document has ~{total_paragraphs} paragraphs, {total_tables} tables, {heading_count} headings found in first scan",
                "bookmarks": bookmarks[:20] if bookmarks else [],  # limit to first 20 bookmarks
                "bookmark_count": len(bookmarks),
                "bookmark_note": "Bookmarks often indicate chapter starts. Use these as navigation hints for page_range extraction.",
            },
        }

    except Exception as e:
        # Never raise out of summary mode; report the failure in-band.
        return {
            "content": f"Error creating summary: {str(e)}",
            "method_used": "error-fallback",
            "table_of_contents": {"note": "Summary generation failed"},
        }