"""Word Document Tools Mixin - Specialized tools for Word document processing.""" import os import time from typing import Any from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool from pydantic import Field from ..utils import OfficeFileError, resolve_office_file_path, validate_office_file, detect_format class WordMixin(MCPMixin): """Mixin containing Word-specific tools for advanced document processing.""" @mcp_tool( name="convert_to_markdown", description="Convert Office documents to Markdown format with intelligent processing recommendations. ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages): 1. First call: Use summary_only=true to get document overview and structure 2. Then: Use page_range (e.g., '1-10', '15-25') to process specific sections. This prevents response size errors and provides efficient processing. Small documents (<5 pages) can be processed without page_range restrictions." ) async def convert_to_markdown( self, file_path: str = Field(description="Path to Office document or URL"), include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"), image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"), max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"), preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"), page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"), bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."), chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."), summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"), output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')") ) -> dict[str, Any]: start_time = time.time() try: # Resolve file path local_path = await resolve_office_file_path(file_path) # Validate file validation = await validate_office_file(local_path) if not validation["is_valid"]: raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}") # Get format info format_info = await detect_format(local_path) category = format_info["category"] extension = format_info["extension"] # Currently focused on Word documents for markdown conversion if category != "word": raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}") # Analyze document size and provide intelligent recommendations doc_analysis = await self._analyze_document_size(local_path, extension) processing_recommendation = self._get_processing_recommendation( doc_analysis, page_range, summary_only ) # Parse page range if provided page_numbers = self._parse_page_range(page_range) if page_range else None # Prioritize bookmark/chapter extraction over page ranges if bookmark_name or chapter_name: page_numbers = None # Ignore page ranges when bookmark or chapter is specified # Convert to markdown based on format if extension == ".docx": markdown_result = await self._convert_docx_to_markdown( local_path, include_images, image_mode, max_image_size, preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name ) else: # .doc # For legacy .doc files, use mammoth if available markdown_result = await self._convert_doc_to_markdown( local_path, include_images, image_mode, max_image_size, preserve_structure, page_numbers, summary_only, output_dir ) # Build result based on mode result = { "metadata": { "original_file": os.path.basename(local_path), "format": format_info["format_name"], "conversion_method": markdown_result["method_used"], "conversion_time": round(time.time() - start_time, 3), "summary_only": summary_only, "document_analysis": doc_analysis, "processing_recommendation": processing_recommendation } } # Add page range info if used if page_range: result["metadata"]["page_range"] = page_range result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0 # Add content based on mode if summary_only: # VERY restrictive summary mode to prevent massive responses result["metadata"]["character_count"] = len(markdown_result["content"]) result["metadata"]["word_count"] = len(markdown_result["content"].split()) # Ultra-short summary (only 500 chars max) result["summary"] = markdown_result["content"][:500] + "..." if len(markdown_result["content"]) > 500 else markdown_result["content"] # Severely limit table of contents to prevent 1M+ token responses if "table_of_contents" in markdown_result: toc = markdown_result["table_of_contents"] if isinstance(toc, dict): # Keep only essential TOC info, severely truncated result["table_of_contents"] = { "note": toc.get("note", ""), "basic_info": toc.get("basic_info", "")[:200], # Limit to 200 chars } # Add bookmark/heading info if available (limit to first 5 items) if "bookmarks" in toc: result["table_of_contents"]["bookmarks"] = toc["bookmarks"][:5] result["table_of_contents"]["bookmark_count"] = toc.get("bookmark_count", 0) if "available_headings" in toc: result["table_of_contents"]["available_headings"] = toc["available_headings"][:5] result["table_of_contents"]["heading_count"] = toc.get("heading_count", 0) else: result["table_of_contents"] = {"note": "Summary mode - use full processing for detailed TOC"} else: # Full content mode result["markdown"] = markdown_result["content"] result["content_truncated"] = len(markdown_result["content"]) >= 200000 # Warn if near limit # Add images info if "images" in markdown_result: result["images"] = markdown_result["images"] # Add structure info if "structure" in markdown_result: result["structure"] = markdown_result["structure"] # Add table of contents if available if "table_of_contents" in markdown_result: result["table_of_contents"] = markdown_result["table_of_contents"] return result except OfficeFileError: raise except Exception as e: raise OfficeFileError(f"Markdown conversion failed: {str(e)}") # Helper methods - import from monolithic server async def _analyze_document_size(self, file_path: str, extension: str) -> dict[str, Any]: """Analyze document size for processing recommendations.""" from ..server_monolithic import _analyze_document_size return await _analyze_document_size(file_path, extension) def _get_processing_recommendation(self, doc_analysis: dict[str, Any], page_range: str, summary_only: bool) -> dict[str, Any]: """Get processing recommendations based on document analysis.""" from ..server_monolithic import _get_processing_recommendation return _get_processing_recommendation(doc_analysis, page_range, summary_only) def _parse_page_range(self, page_range: str) -> list[int]: """Parse page range string into list of page numbers.""" from ..server_monolithic import _parse_page_range return _parse_page_range(page_range) async def _convert_docx_to_markdown( self, file_path: str, include_images: bool, image_mode: str, max_image_size: int, preserve_structure: bool, page_numbers: list[int], summary_only: bool, output_dir: str, bookmark_name: str = "", chapter_name: str = "" ) -> dict[str, Any]: """Convert .docx to markdown.""" from ..server_monolithic import _convert_docx_to_markdown return await _convert_docx_to_markdown( file_path, include_images, image_mode, max_image_size, preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name ) async def _convert_doc_to_markdown( self, file_path: str, include_images: bool, image_mode: str, max_image_size: int, preserve_structure: bool, page_numbers: list[int], summary_only: bool, output_dir: str ) -> dict[str, Any]: """Convert legacy .doc to markdown.""" from ..server_monolithic import _convert_doc_to_markdown return await _convert_doc_to_markdown( file_path, include_images, image_mode, max_image_size, preserve_structure, page_numbers, summary_only, output_dir )