Use app.run_stdio_async() instead of the deprecated stdio_server import; this aligns with the FastMCP 2.11.3 API. The server now starts correctly with `uv run mcp-office-tools`, and all MCPMixin functionality and tool registration is maintained. (185 lines, 10 KiB, Python)
"""Word Document Tools Mixin - Specialized tools for Word document processing."""
|
|
|
|
import os
|
|
import time
|
|
from typing import Any
|
|
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
from pydantic import Field
|
|
|
|
from ..utils import OfficeFileError, resolve_office_file_path, validate_office_file, detect_format
|
|
|
|
|
|
class WordMixin(MCPMixin):
    """Mixin containing Word-specific tools for advanced document processing."""

    @mcp_tool(
        name="convert_to_markdown",
        description="Convert Office documents to Markdown format with intelligent processing recommendations. ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages): 1. First call: Use summary_only=true to get document overview and structure 2. Then: Use page_range (e.g., '1-10', '15-25') to process specific sections. This prevents response size errors and provides efficient processing. Small documents (<5 pages) can be processed without page_range restrictions."
    )
    async def convert_to_markdown(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
        image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
        preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
        page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
        bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
        chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
        summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
        output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
    ) -> dict[str, Any]:
        """Convert a Word document (.docx or legacy .doc) to Markdown.

        Resolves and validates ``file_path``, confirms the file is a Word
        document, analyzes its size to produce processing recommendations,
        then delegates to a format-specific converter. Bookmark/chapter
        extraction, when requested, takes priority over ``page_range``.

        Returns:
            dict with a ``metadata`` section (format, timing, analysis,
            recommendations) plus either a truncated ``summary`` and limited
            ``table_of_contents`` (summary_only mode) or the full
            ``markdown`` content with optional ``images``, ``structure``
            and ``table_of_contents`` sections.

        Raises:
            OfficeFileError: if the file is invalid, is not a Word document,
                or any stage of the conversion fails.
        """
        start_time = time.time()

        try:
            # Resolve file path (handles both local paths and URLs).
            local_path = await resolve_office_file_path(file_path)

            # Validate file before doing any expensive work.
            validation = await validate_office_file(local_path)
            if not validation["is_valid"]:
                raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

            # Get format info
            format_info = await detect_format(local_path)
            category = format_info["category"]
            extension = format_info["extension"]

            # Currently focused on Word documents for markdown conversion
            if category != "word":
                raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")

            # Analyze document size and provide intelligent recommendations
            doc_analysis = await self._analyze_document_size(local_path, extension)
            processing_recommendation = self._get_processing_recommendation(
                doc_analysis, page_range, summary_only
            )

            # Parse page range if provided
            page_numbers = self._parse_page_range(page_range) if page_range else None

            # Prioritize bookmark/chapter extraction over page ranges
            if bookmark_name or chapter_name:
                page_numbers = None  # Ignore page ranges when bookmark or chapter is specified

            # Convert to markdown based on format
            if extension == ".docx":
                markdown_result = await self._convert_docx_to_markdown(
                    local_path, include_images, image_mode, max_image_size,
                    preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
                )
            else:  # .doc
                # For legacy .doc files, use mammoth if available
                markdown_result = await self._convert_doc_to_markdown(
                    local_path, include_images, image_mode, max_image_size,
                    preserve_structure, page_numbers, summary_only, output_dir
                )

            # Build result based on mode
            result = {
                "metadata": {
                    "original_file": os.path.basename(local_path),
                    "format": format_info["format_name"],
                    "conversion_method": markdown_result["method_used"],
                    "conversion_time": round(time.time() - start_time, 3),
                    "summary_only": summary_only,
                    "document_analysis": doc_analysis,
                    "processing_recommendation": processing_recommendation
                }
            }

            # Add page range info if used
            if page_range:
                result["metadata"]["page_range"] = page_range
                result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0

            # Add content based on mode
            if summary_only:
                # VERY restrictive summary mode to prevent massive responses
                result["metadata"]["character_count"] = len(markdown_result["content"])
                result["metadata"]["word_count"] = len(markdown_result["content"].split())

                # Ultra-short summary (only 500 chars max)
                result["summary"] = (
                    markdown_result["content"][:500] + "..."
                    if len(markdown_result["content"]) > 500
                    else markdown_result["content"]
                )

                # Severely limit table of contents to prevent 1M+ token responses
                if "table_of_contents" in markdown_result:
                    toc = markdown_result["table_of_contents"]
                    if isinstance(toc, dict):
                        # Keep only essential TOC info, severely truncated
                        result["table_of_contents"] = {
                            "note": toc.get("note", ""),
                            "basic_info": toc.get("basic_info", "")[:200],  # Limit to 200 chars
                        }
                        # Add bookmark/heading info if available (limit to first 5 items)
                        if "bookmarks" in toc:
                            result["table_of_contents"]["bookmarks"] = toc["bookmarks"][:5]
                            result["table_of_contents"]["bookmark_count"] = toc.get("bookmark_count", 0)
                        if "available_headings" in toc:
                            result["table_of_contents"]["available_headings"] = toc["available_headings"][:5]
                            result["table_of_contents"]["heading_count"] = toc.get("heading_count", 0)
                    else:
                        result["table_of_contents"] = {"note": "Summary mode - use full processing for detailed TOC"}
            else:
                # Full content mode
                result["markdown"] = markdown_result["content"]
                result["content_truncated"] = len(markdown_result["content"]) >= 200000  # Warn if near limit

                # Add images info
                if "images" in markdown_result:
                    result["images"] = markdown_result["images"]

                # Add structure info
                if "structure" in markdown_result:
                    result["structure"] = markdown_result["structure"]

                # Add table of contents if available
                if "table_of_contents" in markdown_result:
                    result["table_of_contents"] = markdown_result["table_of_contents"]

            return result

        except OfficeFileError:
            raise
        except Exception as e:
            # Chain the original exception so the root cause is preserved
            # in tracebacks (was previously discarded).
            raise OfficeFileError(f"Markdown conversion failed: {str(e)}") from e

    # Helper methods - import from monolithic server

    async def _analyze_document_size(self, file_path: str, extension: str) -> dict[str, Any]:
        """Analyze document size for processing recommendations."""
        from ..server_monolithic import _analyze_document_size
        return await _analyze_document_size(file_path, extension)

    def _get_processing_recommendation(self, doc_analysis: dict[str, Any], page_range: str, summary_only: bool) -> dict[str, Any]:
        """Get processing recommendations based on document analysis."""
        from ..server_monolithic import _get_processing_recommendation
        return _get_processing_recommendation(doc_analysis, page_range, summary_only)

    def _parse_page_range(self, page_range: str) -> list[int]:
        """Parse page range string into list of page numbers."""
        from ..server_monolithic import _parse_page_range
        return _parse_page_range(page_range)

    async def _convert_docx_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int] | None, summary_only: bool, output_dir: str,
        bookmark_name: str = "", chapter_name: str = ""
    ) -> dict[str, Any]:
        """Convert .docx to markdown.

        ``page_numbers`` is None when no page range was requested (or when a
        bookmark/chapter extraction overrides it).
        """
        from ..server_monolithic import _convert_docx_to_markdown
        return await _convert_docx_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )

    async def _convert_doc_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int] | None, summary_only: bool, output_dir: str
    ) -> dict[str, Any]:
        """Convert legacy .doc to markdown.

        ``page_numbers`` is None when no page range was requested (or when a
        bookmark/chapter extraction overrides it).
        """
        from ..server_monolithic import _convert_doc_to_markdown
        return await _convert_doc_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir
        )