- Comprehensive Microsoft Office document processing server
- Support for Word (.docx, .doc), Excel (.xlsx, .xls), PowerPoint (.pptx, .ppt), CSV
- 6 universal tools: extract_text, extract_images, extract_metadata, detect_office_format, analyze_document_health, get_supported_formats
- Multi-library fallback system for robust processing
- URL support with intelligent caching
- Legacy Office format support (97-2003)
- FastMCP integration with async architecture
- Production ready with comprehensive documentation
🤖 Generated with Claude Code (claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
361 lines
11 KiB
Python
361 lines
11 KiB
Python
"""File validation utilities for Office documents."""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
from urllib.parse import urlparse
|
|
import aiohttp
|
|
import aiofiles
|
|
|
|
# python-magic is an optional dependency used for MIME type detection.
# HAS_MAGIC records whether the import succeeded.
try:
    import magic
except ImportError:
    HAS_MAGIC = False
else:
    HAS_MAGIC = True
|
|
|
|
|
|
class OfficeFileError(Exception):
    """Error raised when an Office file cannot be processed."""
|
|
|
|
|
|
# Registry of supported Office formats, keyed by lower-case file extension
# (leading dot included). Each entry lists the MIME types accepted for the
# extension, a human-readable format name and a category.
# The mapping is built from a compact table; row order is the key order.
OFFICE_FORMATS = {
    extension: {
        "mime_types": list(mime_types),
        "format_name": format_name,
        "category": category,
    }
    for extension, mime_types, format_name, category in (
        # Word Documents
        (
            ".docx",
            ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",),
            "Word Document (DOCX)",
            "word",
        ),
        (
            ".doc",
            ("application/msword", "application/vnd.ms-office"),
            "Word Document (DOC)",
            "word",
        ),
        (
            ".docm",
            ("application/vnd.ms-word.document.macroEnabled.12",),
            "Word Macro Document",
            "word",
        ),
        (
            ".dotx",
            ("application/vnd.openxmlformats-officedocument.wordprocessingml.template",),
            "Word Template",
            "word",
        ),
        (
            ".dot",
            ("application/msword",),
            "Word Template (Legacy)",
            "word",
        ),
        # Excel Spreadsheets
        (
            ".xlsx",
            ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",),
            "Excel Spreadsheet (XLSX)",
            "excel",
        ),
        (
            ".xls",
            ("application/vnd.ms-excel", "application/excel"),
            "Excel Spreadsheet (XLS)",
            "excel",
        ),
        (
            ".xlsm",
            ("application/vnd.ms-excel.sheet.macroEnabled.12",),
            "Excel Macro Spreadsheet",
            "excel",
        ),
        (
            ".xltx",
            ("application/vnd.openxmlformats-officedocument.spreadsheetml.template",),
            "Excel Template",
            "excel",
        ),
        (
            ".xlt",
            ("application/vnd.ms-excel",),
            "Excel Template (Legacy)",
            "excel",
        ),
        (
            ".csv",
            ("text/csv", "application/csv"),
            "CSV File",
            "excel",
        ),
        # PowerPoint Presentations
        (
            ".pptx",
            ("application/vnd.openxmlformats-officedocument.presentationml.presentation",),
            "PowerPoint Presentation (PPTX)",
            "powerpoint",
        ),
        (
            ".ppt",
            ("application/vnd.ms-powerpoint",),
            "PowerPoint Presentation (PPT)",
            "powerpoint",
        ),
        (
            ".pptm",
            ("application/vnd.ms-powerpoint.presentation.macroEnabled.12",),
            "PowerPoint Macro Presentation",
            "powerpoint",
        ),
        (
            ".potx",
            ("application/vnd.openxmlformats-officedocument.presentationml.template",),
            "PowerPoint Template",
            "powerpoint",
        ),
        (
            ".pot",
            ("application/vnd.ms-powerpoint",),
            "PowerPoint Template (Legacy)",
            "powerpoint",
        ),
    )
}
|
|
|
|
|
|
def get_supported_extensions() -> list[str]:
    """Return every extension registered in ``OFFICE_FORMATS`` as a new list."""
    return [*OFFICE_FORMATS]
|
|
|
|
|
|
def get_format_info(extension: str) -> Optional[Dict[str, Any]]:
    """Look up the ``OFFICE_FORMATS`` entry for *extension*.

    The lookup is case-insensitive. ``None`` is returned when the
    lower-cased extension is not a key of ``OFFICE_FORMATS``.
    """
    normalized = extension.lower()
    if normalized in OFFICE_FORMATS:
        return OFFICE_FORMATS[normalized]
    return None
|
|
|
|
|
|
def detect_file_format(file_path: str) -> Dict[str, Any]:
    """Identify the Office format of a local file from its extension.

    When python-magic is available the file's MIME type is also sniffed and
    compared against the MIME types registered for the extension.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A dictionary with the absolute path, extension, format name,
        category, detected MIME type (``None`` if it could not be
        determined), whether that MIME type matches the registry, the file
        size in bytes, and ``is_legacy`` / ``supports_macros`` flags derived
        from the extension.

    Raises:
        OfficeFileError: If the path does not exist, is not a regular file,
            or has an extension that is not registered.
    """
    legacy_extensions = (".doc", ".xls", ".ppt", ".dot", ".xlt", ".pot")
    macro_extensions = (".docm", ".xlsm", ".pptm")

    target = Path(file_path)
    if not target.exists():
        raise OfficeFileError(f"File not found: {file_path}")
    if not target.is_file():
        raise OfficeFileError(f"Path is not a file: {file_path}")

    suffix = target.suffix.lower()
    registry_entry = get_format_info(suffix)
    if not registry_entry:
        raise OfficeFileError(f"Unsupported file format: {suffix}")

    detected_mime = None
    if HAS_MAGIC:
        try:
            detected_mime = magic.from_file(file_path, mime=True)
        except Exception:
            # Sniffing is best-effort; the extension-based result still stands.
            detected_mime = None

    # Without a detected MIME type the match is reported as False.
    mime_matches = bool(detected_mime) and detected_mime in registry_entry["mime_types"]

    return {
        "file_path": str(target.absolute()),
        "extension": suffix,
        "format_name": registry_entry["format_name"],
        "category": registry_entry["category"],
        "mime_type": detected_mime,
        "mime_valid": mime_matches,
        "file_size": target.stat().st_size,
        "is_legacy": suffix in legacy_extensions,
        "supports_macros": suffix in macro_extensions,
    }
|
|
|
|
|
|
async def validate_office_file(file_path: str) -> Dict[str, Any]:
    """Run format detection plus size, integrity and encryption checks.

    Args:
        file_path: Path of a local Office document.

    Returns:
        The dictionary produced by ``detect_file_format`` extended with:

        * ``is_valid`` - False when the file is empty or positively
          identified as corrupted.
        * ``errors`` / ``warnings`` - lists of human-readable messages.
        * ``corruption_check`` - ``None`` when the integrity check passed,
          otherwise a string describing why it failed.
        * ``password_protected`` - result of the encryption probe.

    Raises:
        OfficeFileError: Propagated from ``detect_file_format`` when the
            file is missing, not a regular file, or of an unsupported type.
    """
    format_info = detect_file_format(file_path)

    validation_results: Dict[str, Any] = {
        **format_info,
        "is_valid": True,
        "errors": [],
        "warnings": [],
        "corruption_check": None,
        "password_protected": False,
    }

    # Size sanity checks.
    if format_info["file_size"] == 0:
        validation_results["is_valid"] = False
        validation_results["errors"].append("File is empty")
    elif format_info["file_size"] > 500_000_000:  # 500MB limit
        validation_results["warnings"].append("Large file may cause performance issues")

    # Probe encryption first: its result decides how a structural failure
    # from the integrity check below has to be interpreted.
    try:
        is_encrypted = await _check_encryption(file_path, format_info)
    except Exception:
        is_encrypted = False  # the encryption probe is optional / best-effort
    validation_results["password_protected"] = is_encrypted

    try:
        await _check_file_corruption(file_path, format_info)
    except OfficeFileError as exc:
        validation_results["corruption_check"] = f"Error during corruption check: {exc}"
        if is_encrypted:
            # Password-protected OOXML documents are stored in an OLE
            # container rather than a ZIP package, so a structural failure
            # is inconclusive for them.
            validation_results["warnings"].append("Could not verify file integrity")
        else:
            # The checker raises OfficeFileError only when it has positively
            # identified damage, so the file must not be reported as valid.
            validation_results["is_valid"] = False
            validation_results["errors"].append(str(exc))
    except Exception as exc:
        # Anything else means the check itself could not run to completion.
        validation_results["corruption_check"] = f"Error during corruption check: {exc}"
        validation_results["warnings"].append("Could not verify file integrity")

    if is_encrypted:
        validation_results["warnings"].append("File is password protected")

    return validation_results
|
|
|
|
|
|
async def _check_file_corruption(file_path: str, format_info: Dict[str, Any]) -> None:
    """Perform a basic structural integrity check on an Office file.

    ZIP-based (OOXML) documents, including macro-enabled files and
    templates, are opened as ZIP archives and every member's CRC is
    verified. Legacy binary documents and templates must start with the OLE
    Compound Document signature. Other extensions (for example ``.csv``)
    are not checked.

    Args:
        file_path: Path of the file to check.
        format_info: Format description; only its ``"extension"`` entry is
            read.

    Raises:
        OfficeFileError: If the file is positively identified as damaged.
    """
    import zipfile

    extension = format_info["extension"]

    zip_based = (
        ".docx", ".xlsx", ".pptx",
        ".docm", ".xlsm", ".pptm",
        ".dotx", ".xltx", ".potx",
    )
    ole_based = (".doc", ".xls", ".ppt", ".dot", ".xlt", ".pot")

    if extension in zip_based:
        try:
            with zipfile.ZipFile(file_path, "r") as zip_file:
                # testzip() does not raise on a CRC mismatch: it returns the
                # name of the first bad member, or None if all are intact.
                bad_member = zip_file.testzip()
        except zipfile.BadZipFile as exc:
            raise OfficeFileError(
                "File appears to be corrupted (invalid ZIP structure)"
            ) from exc
        if bad_member is not None:
            raise OfficeFileError(
                f"File appears to be corrupted (damaged ZIP member: {bad_member})"
            )

    elif extension in ole_based:
        async with aiofiles.open(file_path, "rb") as f:
            header = await f.read(8)
        # OLE Compound Document signature
        if not header.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
            raise OfficeFileError("File appears to be corrupted (invalid OLE signature)")
|
|
|
|
|
|
async def _check_encryption(file_path: str, format_info: Dict[str, Any]) -> bool:
    """Report whether ``msoffcrypto`` considers the file encrypted.

    The probe is best-effort: if ``msoffcrypto`` cannot be imported, or the
    file cannot be opened or parsed, the file is reported as not encrypted.
    ``format_info`` is accepted but not used.
    """
    try:
        import msoffcrypto

        with open(file_path, "rb") as handle:
            return msoffcrypto.OfficeFile(handle).is_encrypted()
    except Exception:
        # Covers a missing msoffcrypto-tool (ImportError) as well as any
        # failure while reading or parsing the file.
        return False
|
|
|
|
|
|
def is_url(path: str) -> bool:
    """Return True when *path* parses with both a scheme and a network location."""
    try:
        parts = urlparse(path)
    except Exception:
        # Input that cannot be parsed is treated as "not a URL".
        return False
    return bool(parts.scheme and parts.netloc)
|
|
|
|
|
|
async def download_office_file(url: str, timeout: int = 30) -> str:
    """Download an Office document over HTTP(S) into a temporary file.

    Args:
        url: Absolute ``http`` or ``https`` URL of the document.
        timeout: Total time budget for the request, in seconds.

    Returns:
        Path of the temporary file holding the downloaded bytes. The file is
        created with ``delete=False``, so removing it is left to the caller.

    Raises:
        OfficeFileError: If *url* is not an absolute HTTP(S) URL, or if the
            request or the write to disk fails. The partially written
            temporary file is removed before the error is raised.
    """
    import tempfile

    if not is_url(url):
        raise OfficeFileError(f"Invalid URL: {url}")

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise OfficeFileError(f"Unsupported URL scheme: {parsed.scheme}")

    # Reserve a path now; the handle is closed immediately so the file can be
    # reopened asynchronously for writing below.
    # NOTE(review): the name ends in ".office_temp", not the document's real
    # extension, so suffix-based checks such as detect_file_format() will
    # reject the returned path - confirm callers account for this.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".office_temp") as temp_file:
        temp_path = temp_file.name

    def _discard_partial_download() -> None:
        try:
            os.unlink(temp_path)
        except OSError:
            pass

    try:
        # aiohttp expects a ClientTimeout object; bare numbers are deprecated.
        request_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=request_timeout) as response:
                response.raise_for_status()

                async with aiofiles.open(temp_path, "wb") as f:
                    async for chunk in response.content.iter_chunked(8192):
                        await f.write(chunk)

        return temp_path

    except Exception as exc:
        _discard_partial_download()
        raise OfficeFileError(f"Failed to download file from URL: {exc}") from exc
    except BaseException:
        # Task cancellation (asyncio.CancelledError) is not an Exception;
        # still remove the partial file, then let it propagate unchanged.
        _discard_partial_download()
        raise
|
|
|
|
|
|
def validate_office_path(file_path: str) -> str:
    """Validate and normalize an Office file path or URL.

    Args:
        file_path: Local path or URL. Surrounding whitespace is ignored.

    Returns:
        The stripped input unchanged when it is a URL (URLs are handled
        separately by the caller); otherwise the resolved absolute path of
        the local file.

    Raises:
        OfficeFileError: If the path is empty or whitespace-only, does not
            exist, is not a regular file, or has an extension that is not
            registered in ``OFFICE_FORMATS``.
    """
    if not file_path:
        raise OfficeFileError("File path cannot be empty")

    file_path = str(file_path).strip()
    if not file_path:
        # A whitespace-only path would otherwise resolve to the current
        # directory and produce a misleading "Path is not a file" error.
        raise OfficeFileError("File path cannot be empty")

    if is_url(file_path):
        return file_path  # URLs handled separately

    # Resolve and validate local path
    path = Path(file_path).resolve()

    if not path.exists():
        raise OfficeFileError(f"File not found: {file_path}")

    if not path.is_file():
        raise OfficeFileError(f"Path is not a file: {file_path}")

    # Check extension
    extension = path.suffix.lower()
    if extension not in OFFICE_FORMATS:
        supported = ", ".join(sorted(OFFICE_FORMATS))
        raise OfficeFileError(
            f"Unsupported file format '{extension}'. "
            f"Supported formats: {supported}"
        )

    return str(path)