Use app.run_stdio_async() instead of the deprecated stdio_server import; this aligns with the FastMCP 2.11.3 API. The server now starts correctly with `uv run mcp-office-tools`, and all MCPMixin functionality and tool registration is maintained. (185 lines, 10 KiB, Python)
"""Word Document Tools Mixin - Specialized tools for Word document processing."""
|
|
|
|
import os
|
|
import time
|
|
from typing import Any
|
|
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
from pydantic import Field
|
|
|
|
from ..utils import OfficeFileError, resolve_office_file_path, validate_office_file, detect_format
|
|
|
|
|
|
class WordMixin(MCPMixin):
    """Mixin containing Word-specific tools for advanced document processing."""

    @mcp_tool(
        name="convert_to_markdown",
        description="Convert Office documents to Markdown format with intelligent processing recommendations. ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages): 1. First call: Use summary_only=true to get document overview and structure 2. Then: Use page_range (e.g., '1-10', '15-25') to process specific sections. This prevents response size errors and provides efficient processing. Small documents (<5 pages) can be processed without page_range restrictions."
    )
    async def convert_to_markdown(
        self,
        file_path: str = Field(description="Path to Office document or URL"),
        include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
        image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
        max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
        preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
        page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
        bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
        chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
        summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
        output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
    ) -> dict[str, Any]:
        """Convert a Word document (.docx or legacy .doc) to Markdown.

        Resolves and validates ``file_path``, confirms the file is a Word
        document, analyzes its size to produce processing recommendations,
        then delegates to a format-specific converter. Bookmark/chapter
        extraction, when requested, takes priority over ``page_range``.

        Returns:
            dict with a ``metadata`` section (format, timing, analysis,
            recommendations) plus either a truncated ``summary`` and limited
            ``table_of_contents`` (summary_only mode) or the full
            ``markdown`` content with optional ``images``, ``structure``
            and ``table_of_contents`` sections.

        Raises:
            OfficeFileError: if the file is invalid, is not a Word document,
                or any stage of the conversion fails.
        """
        start_time = time.time()

        try:
            # Resolve file path (handles both local paths and URLs).
            local_path = await resolve_office_file_path(file_path)

            # Validate file before doing any expensive work.
            validation = await validate_office_file(local_path)
            if not validation["is_valid"]:
                raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

            # Get format info
            format_info = await detect_format(local_path)
            category = format_info["category"]
            extension = format_info["extension"]

            # Currently focused on Word documents for markdown conversion
            if category != "word":
                raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")

            # Analyze document size and provide intelligent recommendations
            doc_analysis = await self._analyze_document_size(local_path, extension)
            processing_recommendation = self._get_processing_recommendation(
                doc_analysis, page_range, summary_only
            )

            # Parse page range if provided
            page_numbers = self._parse_page_range(page_range) if page_range else None

            # Prioritize bookmark/chapter extraction over page ranges
            if bookmark_name or chapter_name:
                page_numbers = None  # Ignore page ranges when bookmark or chapter is specified

            # Convert to markdown based on format
            if extension == ".docx":
                markdown_result = await self._convert_docx_to_markdown(
                    local_path, include_images, image_mode, max_image_size,
                    preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
                )
            else:  # .doc
                # For legacy .doc files, use mammoth if available
                markdown_result = await self._convert_doc_to_markdown(
                    local_path, include_images, image_mode, max_image_size,
                    preserve_structure, page_numbers, summary_only, output_dir
                )

            # Build result based on mode
            result = {
                "metadata": {
                    "original_file": os.path.basename(local_path),
                    "format": format_info["format_name"],
                    "conversion_method": markdown_result["method_used"],
                    "conversion_time": round(time.time() - start_time, 3),
                    "summary_only": summary_only,
                    "document_analysis": doc_analysis,
                    "processing_recommendation": processing_recommendation
                }
            }

            # Add page range info if used
            if page_range:
                result["metadata"]["page_range"] = page_range
                result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0

            # Add content based on mode
            if summary_only:
                # VERY restrictive summary mode to prevent massive responses
                result["metadata"]["character_count"] = len(markdown_result["content"])
                result["metadata"]["word_count"] = len(markdown_result["content"].split())

                # Ultra-short summary (only 500 chars max)
                result["summary"] = (
                    markdown_result["content"][:500] + "..."
                    if len(markdown_result["content"]) > 500
                    else markdown_result["content"]
                )

                # Severely limit table of contents to prevent 1M+ token responses
                if "table_of_contents" in markdown_result:
                    toc = markdown_result["table_of_contents"]
                    if isinstance(toc, dict):
                        # Keep only essential TOC info, severely truncated
                        result["table_of_contents"] = {
                            "note": toc.get("note", ""),
                            "basic_info": toc.get("basic_info", "")[:200],  # Limit to 200 chars
                        }
                        # Add bookmark/heading info if available (limit to first 5 items)
                        if "bookmarks" in toc:
                            result["table_of_contents"]["bookmarks"] = toc["bookmarks"][:5]
                            result["table_of_contents"]["bookmark_count"] = toc.get("bookmark_count", 0)
                        if "available_headings" in toc:
                            result["table_of_contents"]["available_headings"] = toc["available_headings"][:5]
                            result["table_of_contents"]["heading_count"] = toc.get("heading_count", 0)
                    else:
                        result["table_of_contents"] = {"note": "Summary mode - use full processing for detailed TOC"}
            else:
                # Full content mode
                result["markdown"] = markdown_result["content"]
                result["content_truncated"] = len(markdown_result["content"]) >= 200000  # Warn if near limit

                # Add images info
                if "images" in markdown_result:
                    result["images"] = markdown_result["images"]

                # Add structure info
                if "structure" in markdown_result:
                    result["structure"] = markdown_result["structure"]

                # Add table of contents if available
                if "table_of_contents" in markdown_result:
                    result["table_of_contents"] = markdown_result["table_of_contents"]

            return result

        except OfficeFileError:
            raise
        except Exception as e:
            # Chain the original exception so the root cause is preserved
            # in tracebacks (was previously discarded).
            raise OfficeFileError(f"Markdown conversion failed: {str(e)}") from e

    # Helper methods - import from monolithic server

    async def _analyze_document_size(self, file_path: str, extension: str) -> dict[str, Any]:
        """Analyze document size for processing recommendations."""
        from ..server_monolithic import _analyze_document_size
        return await _analyze_document_size(file_path, extension)

    def _get_processing_recommendation(self, doc_analysis: dict[str, Any], page_range: str, summary_only: bool) -> dict[str, Any]:
        """Get processing recommendations based on document analysis."""
        from ..server_monolithic import _get_processing_recommendation
        return _get_processing_recommendation(doc_analysis, page_range, summary_only)

    def _parse_page_range(self, page_range: str) -> list[int]:
        """Parse page range string into list of page numbers."""
        from ..server_monolithic import _parse_page_range
        return _parse_page_range(page_range)

    async def _convert_docx_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int] | None, summary_only: bool, output_dir: str,
        bookmark_name: str = "", chapter_name: str = ""
    ) -> dict[str, Any]:
        """Convert .docx to markdown.

        ``page_numbers`` is None when no page range was requested (or when a
        bookmark/chapter extraction overrides it).
        """
        from ..server_monolithic import _convert_docx_to_markdown
        return await _convert_docx_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )

    async def _convert_doc_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int] | None, summary_only: bool, output_dir: str
    ) -> dict[str, Any]:
        """Convert legacy .doc to markdown.

        ``page_numbers`` is None when no page range was requested (or when a
        bookmark/chapter extraction overrides it).
        """
        from ..server_monolithic import _convert_doc_to_markdown
        return await _convert_doc_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir
        )