Ryan Malloy 0748eec48d Fix FastMCP stdio server import
- Use app.run_stdio_async() instead of deprecated stdio_server import
- Aligns with FastMCP 2.11.3 API
- Server now starts correctly with uv run mcp-office-tools
- Maintains all MCPMixin functionality and tool registration
2025-09-26 15:49:00 -06:00
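A minimal sketch of the entry-point pattern this commit describes, assuming a console script named mcp-office-tools that wraps a FastMCP app; the names here are illustrative and this snippet is not part of the file below:

import asyncio

from fastmcp import FastMCP

app = FastMCP("mcp-office-tools")  # assumed app name

def main() -> None:
    # FastMCP 2.11.x exposes the stdio transport directly on the app object,
    # replacing the deprecated `stdio_server` import this commit removes.
    asyncio.run(app.run_stdio_async())

if __name__ == "__main__":
    main()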


"""Word Document Tools Mixin - Specialized tools for Word document processing."""
import os
import time
from typing import Any
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
from pydantic import Field
from ..utils import OfficeFileError, resolve_office_file_path, validate_office_file, detect_format
class WordMixin(MCPMixin):
"""Mixin containing Word-specific tools for advanced document processing."""
@mcp_tool(
name="convert_to_markdown",
description="Convert Office documents to Markdown format with intelligent processing recommendations. ⚠️ RECOMMENDED WORKFLOW FOR LARGE DOCUMENTS (>5 pages): 1. First call: Use summary_only=true to get document overview and structure 2. Then: Use page_range (e.g., '1-10', '15-25') to process specific sections. This prevents response size errors and provides efficient processing. Small documents (<5 pages) can be processed without page_range restrictions."
)
async def convert_to_markdown(
self,
file_path: str = Field(description="Path to Office document or URL"),
include_images: bool = Field(default=True, description="Include images in markdown with base64 encoding or file references"),
image_mode: str = Field(default="base64", description="Image handling mode: 'base64', 'files', or 'references'"),
max_image_size: int = Field(default=1024*1024, description="Maximum image size in bytes for base64 encoding"),
preserve_structure: bool = Field(default=True, description="Preserve document structure (headings, lists, tables)"),
page_range: str = Field(default="", description="Page range to convert (e.g., '1-5', '3', '1,3,5-10'). RECOMMENDED for large documents. Empty = all pages"),
bookmark_name: str = Field(default="", description="Extract content for a specific bookmark/chapter (e.g., 'Chapter1_Start'). More reliable than page ranges."),
chapter_name: str = Field(default="", description="Extract content for a chapter by heading text (e.g., 'Chapter 1', 'Introduction'). Works when bookmarks aren't available."),
summary_only: bool = Field(default=False, description="Return only metadata and truncated summary. STRONGLY RECOMMENDED for large docs (>10 pages)"),
output_dir: str = Field(default="", description="Output directory for image files (if image_mode='files')")
) -> dict[str, Any]:
        start_time = time.time()

        try:
            # Resolve file path
            local_path = await resolve_office_file_path(file_path)

            # Validate file
            validation = await validate_office_file(local_path)
            if not validation["is_valid"]:
                raise OfficeFileError(f"Invalid file: {', '.join(validation['errors'])}")

            # Get format info
            format_info = await detect_format(local_path)
            category = format_info["category"]
            extension = format_info["extension"]

            # Currently focused on Word documents for markdown conversion
            if category != "word":
                raise OfficeFileError(f"Markdown conversion currently only supports Word documents, got: {category}")

            # Analyze document size and provide intelligent recommendations
            doc_analysis = await self._analyze_document_size(local_path, extension)
            processing_recommendation = self._get_processing_recommendation(
                doc_analysis, page_range, summary_only
            )

            # Parse page range if provided
            page_numbers = self._parse_page_range(page_range) if page_range else None

            # Prioritize bookmark/chapter extraction over page ranges
            if bookmark_name or chapter_name:
                page_numbers = None  # Ignore page ranges when bookmark or chapter is specified

            # Convert to markdown based on format
            if extension == ".docx":
                markdown_result = await self._convert_docx_to_markdown(
                    local_path, include_images, image_mode, max_image_size,
                    preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
                )
            else:  # .doc
                # For legacy .doc files, use mammoth if available
                markdown_result = await self._convert_doc_to_markdown(
                    local_path, include_images, image_mode, max_image_size,
                    preserve_structure, page_numbers, summary_only, output_dir
                )

            # Build result based on mode
            result = {
                "metadata": {
                    "original_file": os.path.basename(local_path),
                    "format": format_info["format_name"],
                    "conversion_method": markdown_result["method_used"],
                    "conversion_time": round(time.time() - start_time, 3),
                    "summary_only": summary_only,
                    "document_analysis": doc_analysis,
                    "processing_recommendation": processing_recommendation
                }
            }

            # Add page range info if used
            if page_range:
                result["metadata"]["page_range"] = page_range
                result["metadata"]["pages_processed"] = len(page_numbers) if page_numbers else 0

            # Add content based on mode
            if summary_only:
                # VERY restrictive summary mode to prevent massive responses
                result["metadata"]["character_count"] = len(markdown_result["content"])
                result["metadata"]["word_count"] = len(markdown_result["content"].split())

                # Ultra-short summary (only 500 chars max)
                result["summary"] = markdown_result["content"][:500] + "..." if len(markdown_result["content"]) > 500 else markdown_result["content"]

                # Severely limit table of contents to prevent 1M+ token responses
                if "table_of_contents" in markdown_result:
                    toc = markdown_result["table_of_contents"]
                    if isinstance(toc, dict):
                        # Keep only essential TOC info, severely truncated
                        result["table_of_contents"] = {
                            "note": toc.get("note", ""),
                            "basic_info": toc.get("basic_info", "")[:200],  # Limit to 200 chars
                        }
                        # Add bookmark/heading info if available (limit to first 5 items)
                        if "bookmarks" in toc:
                            result["table_of_contents"]["bookmarks"] = toc["bookmarks"][:5]
                            result["table_of_contents"]["bookmark_count"] = toc.get("bookmark_count", 0)
                        if "available_headings" in toc:
                            result["table_of_contents"]["available_headings"] = toc["available_headings"][:5]
                            result["table_of_contents"]["heading_count"] = toc.get("heading_count", 0)
                    else:
                        result["table_of_contents"] = {"note": "Summary mode - use full processing for detailed TOC"}
            else:
                # Full content mode
                result["markdown"] = markdown_result["content"]
                result["content_truncated"] = len(markdown_result["content"]) >= 200000  # Warn if near limit

                # Add images info
                if "images" in markdown_result:
                    result["images"] = markdown_result["images"]

                # Add structure info
                if "structure" in markdown_result:
                    result["structure"] = markdown_result["structure"]

                # Add table of contents if available
                if "table_of_contents" in markdown_result:
                    result["table_of_contents"] = markdown_result["table_of_contents"]

            return result

        except OfficeFileError:
            raise
        except Exception as e:
            raise OfficeFileError(f"Markdown conversion failed: {str(e)}")

    # Helper methods - import from monolithic server

    async def _analyze_document_size(self, file_path: str, extension: str) -> dict[str, Any]:
        """Analyze document size for processing recommendations."""
        from ..server_monolithic import _analyze_document_size
        return await _analyze_document_size(file_path, extension)

    def _get_processing_recommendation(self, doc_analysis: dict[str, Any], page_range: str, summary_only: bool) -> dict[str, Any]:
        """Get processing recommendations based on document analysis."""
        from ..server_monolithic import _get_processing_recommendation
        return _get_processing_recommendation(doc_analysis, page_range, summary_only)

    def _parse_page_range(self, page_range: str) -> list[int]:
        """Parse page range string into list of page numbers."""
        from ..server_monolithic import _parse_page_range
        return _parse_page_range(page_range)
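
    # Page-range grammar handled by _parse_page_range (delegated to
    # server_monolithic); the expansions shown are illustrative:
    #   "3"        -> [3]
    #   "1-5"      -> [1, 2, 3, 4, 5]
    #   "1,3,5-10" -> [1, 3, 5, 6, 7, 8, 9, 10]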

    async def _convert_docx_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int] | None, summary_only: bool, output_dir: str,
        bookmark_name: str = "", chapter_name: str = ""
    ) -> dict[str, Any]:
        """Convert .docx to markdown."""
        from ..server_monolithic import _convert_docx_to_markdown
        return await _convert_docx_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir, bookmark_name, chapter_name
        )

    async def _convert_doc_to_markdown(
        self, file_path: str, include_images: bool, image_mode: str, max_image_size: int,
        preserve_structure: bool, page_numbers: list[int] | None, summary_only: bool, output_dir: str
    ) -> dict[str, Any]:
        """Convert legacy .doc to markdown."""
        from ..server_monolithic import _convert_doc_to_markdown
        return await _convert_doc_to_markdown(
            file_path, include_images, image_mode, max_image_size,
            preserve_structure, page_numbers, summary_only, output_dir
        )
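
For context on how these tools reach the server, here is a sketch of the MCPMixin registration flow the commit says is preserved; the import path and app wiring are assumptions, and register_all is the hook provided by fastmcp.contrib.mcp_mixin:

from fastmcp import FastMCP
from mcp_office_tools.tools.word import WordMixin  # assumed import path

app = FastMCP("mcp-office-tools")

# register_all() registers every @mcp_tool-decorated method on the mixin
# (such as convert_to_markdown above) as a tool on the FastMCP app.
WordMixin().register_all(app)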