Ryan Malloy b681cb030b Initial commit: MCP Office Tools v0.1.0
- Comprehensive Microsoft Office document processing server
- Support for Word (.docx, .doc), Excel (.xlsx, .xls), PowerPoint (.pptx, .ppt), CSV
- 6 universal tools: extract_text, extract_images, extract_metadata, detect_office_format, analyze_document_health, get_supported_formats
- Multi-library fallback system for robust processing
- URL support with intelligent caching
- Legacy Office format support (97-2003)
- FastMCP integration with async architecture
- Production ready with comprehensive documentation

🤖 Generated with Claude Code (claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 01:01:48 -06:00

361 lines
11 KiB
Python

"""File validation utilities for Office documents."""
import os
from pathlib import Path
from typing import Dict, Any, Optional
from urllib.parse import urlparse
import aiohttp
import aiofiles
# Optional magic import for MIME type detection
try:
import magic
HAS_MAGIC = True
except ImportError:
HAS_MAGIC = False
class OfficeFileError(Exception):
    """Error raised by this module when an Office file cannot be handled.

    Used for missing files, unsupported extensions, failed downloads, and
    detected corruption.
    """
# Supported Office formats, one row per extension:
# (extension, display name, category, accepted MIME types).
_OFFICE_FORMAT_ROWS = (
    # Word documents
    (
        ".docx",
        "Word Document (DOCX)",
        "word",
        ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",),
    ),
    (
        ".doc",
        "Word Document (DOC)",
        "word",
        ("application/msword", "application/vnd.ms-office"),
    ),
    (
        ".docm",
        "Word Macro Document",
        "word",
        ("application/vnd.ms-word.document.macroEnabled.12",),
    ),
    (
        ".dotx",
        "Word Template",
        "word",
        ("application/vnd.openxmlformats-officedocument.wordprocessingml.template",),
    ),
    (
        ".dot",
        "Word Template (Legacy)",
        "word",
        ("application/msword",),
    ),
    # Excel spreadsheets (CSV is grouped with them)
    (
        ".xlsx",
        "Excel Spreadsheet (XLSX)",
        "excel",
        ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",),
    ),
    (
        ".xls",
        "Excel Spreadsheet (XLS)",
        "excel",
        ("application/vnd.ms-excel", "application/excel"),
    ),
    (
        ".xlsm",
        "Excel Macro Spreadsheet",
        "excel",
        ("application/vnd.ms-excel.sheet.macroEnabled.12",),
    ),
    (
        ".xltx",
        "Excel Template",
        "excel",
        ("application/vnd.openxmlformats-officedocument.spreadsheetml.template",),
    ),
    (
        ".xlt",
        "Excel Template (Legacy)",
        "excel",
        ("application/vnd.ms-excel",),
    ),
    (
        ".csv",
        "CSV File",
        "excel",
        ("text/csv", "application/csv"),
    ),
    # PowerPoint presentations
    (
        ".pptx",
        "PowerPoint Presentation (PPTX)",
        "powerpoint",
        ("application/vnd.openxmlformats-officedocument.presentationml.presentation",),
    ),
    (
        ".ppt",
        "PowerPoint Presentation (PPT)",
        "powerpoint",
        ("application/vnd.ms-powerpoint",),
    ),
    (
        ".pptm",
        "PowerPoint Macro Presentation",
        "powerpoint",
        ("application/vnd.ms-powerpoint.presentation.macroEnabled.12",),
    ),
    (
        ".potx",
        "PowerPoint Template",
        "powerpoint",
        ("application/vnd.openxmlformats-officedocument.presentationml.template",),
    ),
    (
        ".pot",
        "PowerPoint Template (Legacy)",
        "powerpoint",
        ("application/vnd.ms-powerpoint",),
    ),
)

# Extension -> format metadata, built in the row order above. Each entry has
# the keys "mime_types" (list of str), "format_name" and "category".
OFFICE_FORMATS = {
    extension: {
        "mime_types": list(mime_types),
        "format_name": format_name,
        "category": category,
    }
    for extension, format_name, category, mime_types in _OFFICE_FORMAT_ROWS
}
def get_supported_extensions() -> list[str]:
    """Return the extensions registered in OFFICE_FORMATS, in definition order."""
    return [extension for extension in OFFICE_FORMATS]
def get_format_info(extension: str) -> Optional[Dict[str, Any]]:
    """Look up format metadata for an extension, ignoring case.

    Args:
        extension: File extension as stored in OFFICE_FORMATS (e.g. ".docx").

    Returns:
        The matching OFFICE_FORMATS entry, or None when the lower-cased
        extension is not registered.
    """
    normalized = extension.lower()
    if normalized in OFFICE_FORMATS:
        return OFFICE_FORMATS[normalized]
    return None
def detect_file_format(file_path: str) -> Dict[str, Any]:
    """Identify the Office format of a local file from its extension.

    The extension decides the format. When python-magic is available the
    file's MIME type is also sniffed and compared with the MIME types
    registered for that extension; the comparison result is reported but
    never causes a failure.

    Args:
        file_path: Path to an existing regular file.

    Returns:
        A dict with the absolute path, extension, format name, category,
        detected MIME type (or None), whether that MIME type matched,
        the file size in bytes, and legacy/macro flags.

    Raises:
        OfficeFileError: If the path does not exist, is not a file, or has
            an extension that is not registered.
    """
    target = Path(file_path)
    if not target.exists():
        raise OfficeFileError(f"File not found: {file_path}")
    if not target.is_file():
        raise OfficeFileError(f"Path is not a file: {file_path}")

    suffix = target.suffix.lower()
    info = get_format_info(suffix)
    if not info:
        raise OfficeFileError(f"Unsupported file format: {suffix}")

    # Content sniffing is best-effort: without python-magic, or if it fails,
    # the MIME type simply stays unknown.
    detected_mime = None
    if HAS_MAGIC:
        try:
            detected_mime = magic.from_file(file_path, mime=True)
        except Exception:
            detected_mime = None

    # An unknown MIME type can never count as a match.
    mime_matches = bool(detected_mime) and detected_mime in info["mime_types"]

    legacy_extensions = (".doc", ".xls", ".ppt", ".dot", ".xlt", ".pot")
    macro_extensions = (".docm", ".xlsm", ".pptm")

    result = {"file_path": str(target.absolute())}
    result["extension"] = suffix
    result["format_name"] = info["format_name"]
    result["category"] = info["category"]
    result["mime_type"] = detected_mime
    result["mime_valid"] = mime_matches
    result["file_size"] = target.stat().st_size
    result["is_legacy"] = suffix in legacy_extensions
    result["supports_macros"] = suffix in macro_extensions
    return result
async def validate_office_file(file_path: str) -> Dict[str, Any]:
    """Run format detection plus size, integrity, and encryption checks.

    Args:
        file_path: Path to a local Office document.

    Returns:
        The dict produced by detect_file_format() extended with:
        "is_valid" (bool), "errors" (list of str), "warnings" (list of str),
        "corruption_check" (None when the integrity check passed or did not
        apply, otherwise a message describing the problem), and
        "password_protected" (bool).

    Raises:
        OfficeFileError: Propagated from detect_file_format() when the file
            is missing, not a regular file, or has an unsupported extension.
    """
    # Basic format detection (raises for missing/unsupported files).
    format_info = detect_file_format(file_path)

    validation_results = {
        **format_info,
        "is_valid": True,
        "errors": [],
        "warnings": [],
        "corruption_check": None,
        "password_protected": False,
    }

    # Size checks: empty files are invalid, very large ones only get a warning.
    if format_info["file_size"] == 0:
        validation_results["is_valid"] = False
        validation_results["errors"].append("File is empty")
    elif format_info["file_size"] > 500_000_000:  # 500MB limit
        validation_results["warnings"].append("Large file may cause performance issues")

    # Integrity check. The checker raises OfficeFileError when it positively
    # detects damage; any other exception means the check itself could not
    # be completed, which is only worth a warning.
    try:
        await _check_file_corruption(file_path, format_info)
    except OfficeFileError as e:
        validation_results["is_valid"] = False
        validation_results["corruption_check"] = str(e)
        validation_results["errors"].append(str(e))
    except Exception as e:
        validation_results["corruption_check"] = f"Error during corruption check: {str(e)}"
        validation_results["warnings"].append("Could not verify file integrity")

    # Password protection is informational; a failing check is ignored and
    # the file is reported as not protected.
    try:
        is_encrypted = await _check_encryption(file_path, format_info)
    except Exception:
        is_encrypted = None
    if is_encrypted is not None:
        validation_results["password_protected"] = is_encrypted
        if is_encrypted:
            validation_results["warnings"].append("File is password protected")

    return validation_results
async def _check_file_corruption(file_path: str, format_info: Dict[str, Any]) -> None:
    """Perform a basic container-level integrity check on an Office file.

    ZIP-based formats are opened and every member is CRC-checked; OLE-based
    legacy formats are checked for the compound-document signature in their
    first 8 bytes. Extensions outside both groups (e.g. ".csv") are not
    checked.

    Args:
        file_path: Path to the file to inspect.
        format_info: Format description; only its "extension" key is read.

    Raises:
        OfficeFileError: If the container is positively identified as damaged.
    """
    import zipfile

    extension = format_info["extension"]

    # Modern (OOXML) documents and their templates are ZIP packages.
    zip_based = (
        ".docx", ".xlsx", ".pptx",
        ".docm", ".xlsm", ".pptm",
        ".dotx", ".xltx", ".potx",
    )
    # Legacy documents and their templates are OLE compound documents.
    ole_based = (".doc", ".xls", ".ppt", ".dot", ".xlt", ".pot")

    if extension in zip_based:
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                # testzip() returns the name of the first member whose CRC or
                # header is bad, or None when every member reads cleanly.
                first_bad_member = zip_file.testzip()
        except zipfile.BadZipFile as exc:
            raise OfficeFileError(
                "File appears to be corrupted (invalid ZIP structure)"
            ) from exc
        if first_bad_member is not None:
            raise OfficeFileError(
                f"File appears to be corrupted (damaged ZIP entry: {first_bad_member})"
            )
    elif extension in ole_based:
        async with aiofiles.open(file_path, 'rb') as f:
            header = await f.read(8)
        # OLE Compound Document signature
        if not header.startswith(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'):
            raise OfficeFileError("File appears to be corrupted (invalid OLE signature)")
async def _check_encryption(file_path: str, format_info: Dict[str, Any]) -> bool:
    """Report whether msoffcrypto considers the file encrypted.

    This is a best-effort probe: it returns False when msoffcrypto cannot be
    imported and also when opening or inspecting the file fails for any
    reason. The format_info argument is accepted but not used.
    """
    try:
        import msoffcrypto

        with open(file_path, 'rb') as handle:
            encrypted = msoffcrypto.OfficeFile(handle).is_encrypted()
    except Exception:
        # Covers both a missing msoffcrypto package (ImportError) and any
        # failure while reading the file; either way assume "not encrypted".
        return False
    return encrypted
def is_url(path: str) -> bool:
    """Return True when path parses as a URL with both a scheme and a host part.

    Any value that urlparse() rejects is treated as "not a URL".
    """
    try:
        parts = urlparse(path)
    except Exception:
        return False
    return bool(parts.scheme and parts.netloc)
async def download_office_file(url: str, timeout: int = 30) -> str:
    """Download a file over HTTP(S) into a new temporary file.

    Args:
        url: Source URL; must use the http or https scheme.
        timeout: Total time allowed for the request, in seconds.

    Returns:
        Path of the temporary file (suffix ".office_temp") holding the
        response body. The file is not removed on success.
        NOTE(review): cleanup after a successful download is presumably the
        caller's responsibility - confirm against callers.

    Raises:
        OfficeFileError: If the URL is invalid, uses another scheme, or the
            download fails for any reason (the original error is chained).
    """
    import tempfile

    if not is_url(url):
        raise OfficeFileError(f"Invalid URL: {url}")

    # Only plain web downloads are supported.
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https'):
        raise OfficeFileError(f"Unsupported URL scheme: {parsed.scheme}")

    # Reserve a temp path now; the handle is closed immediately so the file
    # can be reopened asynchronously for writing below.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.office_temp') as temp_file:
        temp_path = temp_file.name

    downloaded = False
    try:
        # aiohttp expects a ClientTimeout object; a bare number is deprecated.
        request_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=request_timeout) as response:
                response.raise_for_status()
                # Stream the body to disk in small chunks rather than
                # holding the whole document in memory.
                async with aiofiles.open(temp_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(8192):
                        await f.write(chunk)
        downloaded = True
        return temp_path
    except Exception as e:
        raise OfficeFileError(f"Failed to download file from URL: {str(e)}") from e
    finally:
        # Runs for ordinary failures and for task cancellation alike
        # (CancelledError is not an Exception subclass), so a partial
        # download never leaves a stray temp file behind.
        if not downloaded:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
def validate_office_path(file_path: str) -> str:
    """Validate and normalize a path or URL pointing at an Office file.

    Args:
        file_path: Local path or URL. Surrounding whitespace is ignored.

    Returns:
        The stripped input unchanged when it is a URL (URLs are not checked
        any further here); otherwise the resolved absolute path as a string.

    Raises:
        OfficeFileError: If the input is empty or whitespace-only, the file
            does not exist, the path is not a regular file, or the extension
            is not registered in OFFICE_FORMATS.
    """
    # Treat None, "" and whitespace-only input the same way. Without the
    # second check a blank string would resolve to the current directory and
    # fail later with a misleading "Path is not a file" error.
    cleaned = str(file_path).strip() if file_path else ""
    if not cleaned:
        raise OfficeFileError("File path cannot be empty")

    if is_url(cleaned):
        return cleaned  # URLs handled separately

    # Resolve and validate local path
    path = Path(cleaned).resolve()
    if not path.exists():
        raise OfficeFileError(f"File not found: {cleaned}")
    if not path.is_file():
        raise OfficeFileError(f"Path is not a file: {cleaned}")

    # Check extension
    extension = path.suffix.lower()
    if extension not in OFFICE_FORMATS:
        supported = ", ".join(sorted(OFFICE_FORMATS))
        raise OfficeFileError(
            f"Unsupported file format '{extension}'. "
            f"Supported formats: {supported}"
        )

    return str(path)