"""File validation utilities for Office documents."""

import asyncio
import contextlib
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Dict, Any, Optional
from urllib.parse import urlparse

# aiohttp / aiofiles are only needed by download_office_file(); every other
# helper in this module works without them, so a missing package must not
# prevent the module from being imported.
try:
    import aiohttp
except ImportError:
    aiohttp = None

try:
    import aiofiles
except ImportError:
    aiofiles = None

# Optional magic import for MIME type detection
try:
    import magic
    HAS_MAGIC = True
except ImportError:
    HAS_MAGIC = False


class OfficeFileError(Exception):
    """Custom exception for Office file processing errors."""


# Office format MIME types and extensions
OFFICE_FORMATS = {
    # Word Documents
    ".docx": {
        "mime_types": [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ],
        "format_name": "Word Document (DOCX)",
        "category": "word"
    },
    ".doc": {
        "mime_types": [
            "application/msword",
            "application/vnd.ms-office"
        ],
        "format_name": "Word Document (DOC)",
        "category": "word"
    },
    ".docm": {
        "mime_types": [
            "application/vnd.ms-word.document.macroEnabled.12"
        ],
        "format_name": "Word Macro Document",
        "category": "word"
    },
    ".dotx": {
        "mime_types": [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.template"
        ],
        "format_name": "Word Template",
        "category": "word"
    },
    ".dot": {
        "mime_types": [
            "application/msword"
        ],
        "format_name": "Word Template (Legacy)",
        "category": "word"
    },

    # Excel Spreadsheets
    ".xlsx": {
        "mime_types": [
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ],
        "format_name": "Excel Spreadsheet (XLSX)",
        "category": "excel"
    },
    ".xls": {
        "mime_types": [
            "application/vnd.ms-excel",
            "application/excel"
        ],
        "format_name": "Excel Spreadsheet (XLS)",
        "category": "excel"
    },
    ".xlsm": {
        "mime_types": [
            "application/vnd.ms-excel.sheet.macroEnabled.12"
        ],
        "format_name": "Excel Macro Spreadsheet",
        "category": "excel"
    },
    ".xltx": {
        "mime_types": [
            "application/vnd.openxmlformats-officedocument.spreadsheetml.template"
        ],
        "format_name": "Excel Template",
        "category": "excel"
    },
    ".xlt": {
        "mime_types": [
            "application/vnd.ms-excel"
        ],
        "format_name": "Excel Template (Legacy)",
        "category": "excel"
    },
    ".csv": {
        "mime_types": [
            "text/csv",
            "application/csv"
        ],
        "format_name": "CSV File",
        "category": "excel"
    },

    # PowerPoint Presentations
    ".pptx": {
        "mime_types": [
            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        ],
        "format_name": "PowerPoint Presentation (PPTX)",
        "category": "powerpoint"
    },
    ".ppt": {
        "mime_types": [
            "application/vnd.ms-powerpoint"
        ],
        "format_name": "PowerPoint Presentation (PPT)",
        "category": "powerpoint"
    },
    ".pptm": {
        "mime_types": [
            "application/vnd.ms-powerpoint.presentation.macroEnabled.12"
        ],
        "format_name": "PowerPoint Macro Presentation",
        "category": "powerpoint"
    },
    ".potx": {
        "mime_types": [
            "application/vnd.openxmlformats-officedocument.presentationml.template"
        ],
        "format_name": "PowerPoint Template",
        "category": "powerpoint"
    },
    ".pot": {
        "mime_types": [
            "application/vnd.ms-powerpoint"
        ],
        "format_name": "PowerPoint Template (Legacy)",
        "category": "powerpoint"
    }
}

# Extension groups shared by detection and the integrity check, so the two
# cannot drift apart.
_ZIP_BASED_EXTENSIONS = frozenset({
    ".docx", ".docm", ".dotx",
    ".xlsx", ".xlsm", ".xltx",
    ".pptx", ".pptm", ".potx",
})
_LEGACY_EXTENSIONS = frozenset({".doc", ".xls", ".ppt", ".dot", ".xlt", ".pot"})
_MACRO_EXTENSIONS = frozenset({".docm", ".xlsm", ".pptm"})

# First eight bytes of an OLE Compound Document.
_OLE_SIGNATURE = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

# Files above this size only trigger a warning; they are still processed.
_LARGE_FILE_BYTES = 500_000_000

_DOWNLOAD_CHUNK_BYTES = 8192


def get_supported_extensions() -> list[str]:
    """Return every supported file extension (lower case, with leading dot)."""
    return list(OFFICE_FORMATS)


def get_format_info(extension: str) -> Optional[Dict[str, Any]]:
    """Return the ``OFFICE_FORMATS`` entry for *extension*, or ``None``.

    The lookup is case-insensitive; *extension* must include the leading dot.
    """
    return OFFICE_FORMATS.get(extension.lower())


def detect_file_format(file_path: str) -> Dict[str, Any]:
    """Detect the Office document format of a local file.

    Args:
        file_path: Path to an existing regular file.

    Returns:
        A dict with the keys ``file_path`` (absolute), ``extension``,
        ``format_name``, ``category``, ``mime_type`` (``None`` when libmagic
        is unavailable or fails), ``mime_valid``, ``file_size``,
        ``is_legacy`` and ``supports_macros``.

    Raises:
        OfficeFileError: If the path does not exist, is not a file, or has an
            extension that is not listed in ``OFFICE_FORMATS``.
    """
    path = Path(file_path)

    if not path.exists():
        raise OfficeFileError(f"File not found: {file_path}")
    if not path.is_file():
        raise OfficeFileError(f"Path is not a file: {file_path}")

    extension = path.suffix.lower()
    format_info = get_format_info(extension)
    if not format_info:
        raise OfficeFileError(f"Unsupported file format: {extension}")

    # MIME sniffing is best-effort: any libmagic failure leaves mime_type as
    # None and the result falls back to extension-based detection.
    mime_type = None
    if HAS_MAGIC:
        try:
            mime_type = magic.from_file(file_path, mime=True)
        except Exception:
            mime_type = None

    mime_valid = bool(mime_type) and mime_type in format_info["mime_types"]

    return {
        "file_path": str(path.absolute()),
        "extension": extension,
        "format_name": format_info["format_name"],
        "category": format_info["category"],
        "mime_type": mime_type,
        "mime_valid": mime_valid,
        "file_size": path.stat().st_size,
        "is_legacy": extension in _LEGACY_EXTENSIONS,
        "supports_macros": extension in _MACRO_EXTENSIONS,
    }


async def validate_office_file(file_path: str) -> Dict[str, Any]:
    """Run format detection plus size, integrity and encryption checks.

    Args:
        file_path: Path to a local Office document.

    Returns:
        The dict produced by ``detect_file_format`` extended with
        ``is_valid``, ``errors``, ``warnings``, ``corruption_check``
        (``None`` when the integrity check passed or does not apply,
        otherwise a message) and ``password_protected``.

    Raises:
        OfficeFileError: Propagated from ``detect_file_format``.
    """
    format_info = detect_file_format(file_path)

    validation_results = {
        **format_info,
        "is_valid": True,
        "errors": [],
        "warnings": [],
        "corruption_check": None,
        "password_protected": False,
    }

    file_size = format_info["file_size"]
    if file_size == 0:
        validation_results["is_valid"] = False
        validation_results["errors"].append("File is empty")
    elif file_size > _LARGE_FILE_BYTES:
        validation_results["warnings"].append(
            "Large file may cause performance issues"
        )

    # A failed integrity check is reported as a warning and does not flip
    # is_valid; this keeps the result contract callers already rely on.
    try:
        await _check_file_corruption(file_path, format_info)
    except Exception as e:
        validation_results["corruption_check"] = (
            f"Error during corruption check: {str(e)}"
        )
        validation_results["warnings"].append("Could not verify file integrity")

    # The encryption check is optional; any failure leaves the default False.
    try:
        is_encrypted = await _check_encryption(file_path, format_info)
    except Exception:
        is_encrypted = None
    if is_encrypted is not None:
        validation_results["password_protected"] = is_encrypted
        if is_encrypted:
            validation_results["warnings"].append("File is password protected")

    return validation_results


def _verify_zip_container(file_path: str) -> None:
    """Raise OfficeFileError if *file_path* is not an intact ZIP archive."""
    try:
        with zipfile.ZipFile(file_path, "r") as archive:
            # testzip() does not raise on a damaged member; it returns the
            # name of the first bad one, or None when all members are fine.
            bad_member = archive.testzip()
    except zipfile.BadZipFile as exc:
        raise OfficeFileError(
            "File appears to be corrupted (invalid ZIP structure)"
        ) from exc

    if bad_member is not None:
        raise OfficeFileError(
            f"File appears to be corrupted (bad archive member: {bad_member})"
        )


def _verify_ole_header(file_path: str) -> None:
    """Raise OfficeFileError if *file_path* lacks the OLE signature."""
    with open(file_path, "rb") as handle:
        header = handle.read(len(_OLE_SIGNATURE))
    if not header.startswith(_OLE_SIGNATURE):
        raise OfficeFileError(
            "File appears to be corrupted (invalid OLE signature)"
        )


async def _check_file_corruption(file_path: str, format_info: Dict[str, Any]) -> None:
    """Basic corruption check for Office files.

    ZIP-based formats (including templates) get a full archive integrity
    test; legacy binary formats (including templates) get a header signature
    check. Other extensions, such as ``.csv``, are not checked.

    Args:
        file_path: Path to the file to inspect.
        format_info: Mapping that provides at least the ``extension`` key.

    Raises:
        OfficeFileError: If the file fails the check for its format.
    """
    extension = format_info["extension"]

    # Both checks do blocking disk I/O (the ZIP test reads every member), so
    # they run in a worker thread instead of stalling the event loop.
    if extension in _ZIP_BASED_EXTENSIONS:
        await asyncio.to_thread(_verify_zip_container, file_path)
    elif extension in _LEGACY_EXTENSIONS:
        await asyncio.to_thread(_verify_ole_header, file_path)


def _probe_encryption(file_path: str) -> bool:
    """Synchronously ask msoffcrypto whether *file_path* is encrypted."""
    try:
        import msoffcrypto

        with open(file_path, "rb") as handle:
            office_file = msoffcrypto.OfficeFile(handle)
            return office_file.is_encrypted()
    except ImportError:
        # msoffcrypto-tool not available
        return False
    except Exception:
        # Any other error, assume not encrypted
        return False


async def _check_encryption(file_path: str, format_info: Dict[str, Any]) -> bool:
    """Check if Office file is password protected.

    Best-effort: returns ``False`` when ``msoffcrypto`` is not installed or
    when it cannot parse the file.

    Args:
        file_path: Path to the file to inspect.
        format_info: Unused; kept for a signature parallel to
            ``_check_file_corruption``.
    """
    # File parsing is blocking work; keep it off the event loop.
    return await asyncio.to_thread(_probe_encryption, file_path)


def is_url(path: str) -> bool:
    """Return True if *path* parses as a URL with both a scheme and a host."""
    try:
        result = urlparse(path)
    except (ValueError, TypeError, AttributeError):
        # Malformed input (e.g. a broken IPv6 literal) or a non-string value.
        return False
    return bool(result.scheme and result.netloc)


async def download_office_file(url: str, timeout: int = 30) -> str:
    """Download Office file from URL to temporary location.

    Args:
        url: An ``http`` or ``https`` URL.
        timeout: Total time budget for the request, in seconds.

    Returns:
        Path of the temporary file holding the downloaded bytes. The caller
        is responsible for deleting it. The file carries the generic
        ``.office_temp`` suffix rather than the document's extension.

    Raises:
        OfficeFileError: If the URL is invalid, uses an unsupported scheme,
            the required packages are missing, or the download fails.
    """
    if not is_url(url):
        raise OfficeFileError(f"Invalid URL: {url}")

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise OfficeFileError(f"Unsupported URL scheme: {parsed.scheme}")

    if aiohttp is None or aiofiles is None:
        raise OfficeFileError(
            "Downloading files requires the 'aiohttp' and 'aiofiles' packages"
        )

    # delete=False: the file must outlive this handle so it can be reopened
    # for writing below and then handed to the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".office_temp") as temp_file:
        temp_path = temp_file.name

    try:
        # Passing a bare number as the request timeout is deprecated in
        # aiohttp; ClientTimeout(total=...) expresses the same limit.
        request_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=request_timeout) as response:
                response.raise_for_status()
                async with aiofiles.open(temp_path, "wb") as f:
                    async for chunk in response.content.iter_chunked(_DOWNLOAD_CHUNK_BYTES):
                        await f.write(chunk)
        return temp_path
    except BaseException as exc:
        # BaseException so that task cancellation (not an Exception subclass)
        # also removes the partially written file.
        with contextlib.suppress(OSError):
            os.unlink(temp_path)
        if isinstance(exc, Exception):
            raise OfficeFileError(
                f"Failed to download file from URL: {str(exc)}"
            ) from exc
        raise


def validate_office_path(file_path: str) -> str:
    """Validate and normalize Office file path.

    Args:
        file_path: A local path or a URL. Surrounding whitespace is ignored.

    Returns:
        The stripped URL unchanged when *file_path* is a URL, otherwise the
        resolved absolute path as a string.

    Raises:
        OfficeFileError: If the path is empty or whitespace-only, does not
            exist, is not a regular file, or has an unsupported extension.
    """
    if not file_path:
        raise OfficeFileError("File path cannot be empty")

    file_path = str(file_path).strip()
    # A whitespace-only string would otherwise resolve to the current
    # directory and surface as a confusing "Path is not a file" error.
    if not file_path:
        raise OfficeFileError("File path cannot be empty")

    if is_url(file_path):
        return file_path  # URLs handled separately

    path = Path(file_path).resolve()

    if not path.exists():
        raise OfficeFileError(f"File not found: {file_path}")
    if not path.is_file():
        raise OfficeFileError(f"Path is not a file: {file_path}")

    extension = path.suffix.lower()
    if extension not in OFFICE_FORMATS:
        supported = ", ".join(sorted(OFFICE_FORMATS))
        raise OfficeFileError(
            f"Unsupported file format '{extension}'. "
            f"Supported formats: {supported}"
        )

    return str(path)