Ryan Malloy 572379d9aa 🎉 Complete Phase 2: WordPerfect processor implementation
 WordPerfect Production Support:
- Comprehensive WordPerfect processor with 5-layer fallback chain
- Support for WP 4.2, 5.0-5.1, 6.0+ (.wpd, .wp, .wp5, .wp6)
- libwpd integration (wpd2text, wpd2html, wpd2raw)
- Binary strings extraction and emergency parsing
- Password detection and encoding intelligence
- Document structure analysis and integrity checking

🏗️ Infrastructure Enhancements:
- Created comprehensive CLAUDE.md development guide
- Updated implementation status documentation
- Added WordPerfect processor test suite
- Enhanced format detection with WP magic signatures
- Production-ready with graceful dependency handling

📊 Project Status:
- 2/4 core processors complete (dBASE + WordPerfect)
- 25+ legacy format detection engine operational
- Phase 2 complete: Ready for Lotus 1-2-3 implementation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 02:03:44 -06:00

251 lines
7.2 KiB
Python

"""
File and URL validation utilities for legacy document processing.
"""
import os
import re
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
try:
    import structlog

    logger = structlog.get_logger(__name__)
except ImportError:
    import logging

    class _KeywordLoggerAdapter(logging.LoggerAdapter):
        """Let the stdlib fallback accept structlog-style keyword fields.

        Call sites in this module log as ``logger.debug("event", key=value)``.
        A plain ``logging.Logger`` rejects unknown keyword arguments with
        ``TypeError`` as soon as the record is actually emitted, so the
        fallback must translate those fields instead of forwarding them.
        """

        # Keyword arguments the stdlib logging call itself understands.
        _STDLIB_KWARGS = frozenset(
            {"exc_info", "extra", "stack_info", "stacklevel"}
        )

        def process(self, msg, kwargs):
            """Append non-stdlib keyword fields to the message as key=value."""
            passthrough = {}
            fields = []
            for key, value in kwargs.items():
                if key in self._STDLIB_KWARGS:
                    passthrough[key] = value
                else:
                    fields.append(f"{key}={value!r}")
            if fields:
                msg = f"{msg} " + " ".join(fields)
            return msg, passthrough

    logger = _KeywordLoggerAdapter(logging.getLogger(__name__), {})
class ValidationError(Exception):
    """Raised when a file path, URL, or processing method fails validation."""
def validate_file_path(file_path: str) -> None:
    """
    Check that a path names a readable, acceptably sized, non-executable file.

    The checks run in a fixed order and the first one that fails is reported:
    empty value, non-string value, missing path, not a regular file,
    unreadable, larger than 500MB, suspicious extension.

    Args:
        file_path: Path to validate

    Raises:
        ValidationError: If path is invalid or inaccessible
    """
    if not file_path:
        raise ValidationError("File path cannot be empty")
    if not isinstance(file_path, str):
        raise ValidationError("File path must be a string")

    candidate = Path(file_path)

    # Each probe is only evaluated if every earlier one succeeded.
    access_checks = (
        (candidate.exists, f"File does not exist: {file_path}"),
        (candidate.is_file, f"Path is not a file: {file_path}"),
        (lambda: os.access(file_path, os.R_OK), f"File is not readable: {file_path}"),
    )
    for passes, failure_message in access_checks:
        if not passes():
            raise ValidationError(failure_message)

    # Refuse extremely large files before any processing is attempted.
    size_cap = 500 * 1024 * 1024  # 500MB limit
    size_in_bytes = candidate.stat().st_size
    if size_in_bytes > size_cap:
        raise ValidationError(
            f"File too large ({size_in_bytes} bytes). Maximum size: {size_cap} bytes"
        )

    # Executable-style extensions are rejected outright.
    blocked_suffixes = {'.exe', '.com', '.bat', '.cmd', '.scr', '.pif'}
    suffix = candidate.suffix
    if suffix.lower() in blocked_suffixes:
        raise ValidationError(f"Potentially dangerous file extension: {suffix}")

    logger.debug("File validation passed", file_path=file_path, size=size_in_bytes)
def validate_url(url: str) -> None:
    """
    Validate URL for downloading legacy documents.

    Only the URL text is inspected. Hostnames are not resolved, so a DNS name
    that resolves to an internal address is not detected here.

    Args:
        url: URL to validate

    Raises:
        ValidationError: If URL is invalid or unsafe
    """
    # Local import so this function does not depend on the file's import block.
    import ipaddress

    if not url:
        raise ValidationError("URL cannot be empty")
    if not isinstance(url, str):
        raise ValidationError("URL must be a string")

    # Parse URL (urlparse raises ValueError for e.g. malformed IPv6 brackets)
    try:
        parsed = urlparse(url)
    except ValueError as e:
        raise ValidationError(f"Invalid URL format: {str(e)}") from e

    # Only allow HTTPS for security
    if parsed.scheme != 'https':
        raise ValidationError("Only HTTPS URLs are allowed for security")

    # Check the hostname itself: a netloc such as ":443" or "user@" is
    # non-empty but carries no host.
    host = (parsed.hostname or '').rstrip('.').lower()
    if not host:
        raise ValidationError("URL must have a valid hostname")

    # Block localhost names, including the trailing-dot and subdomain forms.
    if host == 'localhost' or host.endswith('.localhost'):
        raise ValidationError("Localhost URLs are not allowed")

    # Classify IP literals with the ipaddress module instead of string
    # prefixes, which wrongly rejected public 172.x addresses and missed
    # link-local, unspecified and IPv6 ranges.
    try:
        # Drop any IPv6 zone id ("fe80::1%eth0") before parsing.
        address = ipaddress.ip_address(host.split('%', 1)[0])
    except ValueError:
        address = None  # not an IP literal: treat as a DNS name

    if address is None:
        # No real TLD is numeric, but some resolvers accept shorthand or
        # hex/decimal IPv4 spellings ("127.1", "0x7f.0.0.1", "2130706433").
        last_label = host.rsplit('.', 1)[-1]
        if last_label.isdigit() or last_label.startswith('0x'):
            raise ValidationError("URL must have a valid hostname")
    else:
        # Judge IPv4-mapped IPv6 ("::ffff:10.0.0.1") by the embedded address.
        mapped = getattr(address, 'ipv4_mapped', None)
        if mapped is not None:
            address = mapped
        if address.is_loopback:
            raise ValidationError("Localhost URLs are not allowed")
        is_non_public = any((
            not address.is_global,
            address.is_link_local,
            address.is_multicast,
            address.is_reserved,
            address.is_unspecified,
        ))
        if is_non_public:
            raise ValidationError("Private IP addresses are not allowed")

    # URL length limit
    if len(url) > 2048:
        raise ValidationError("URL too long (maximum 2048 characters)")

    logger.debug("URL validation passed", url=url)
# Anything outside this whitelist is replaced with an underscore.
_UNSAFE_FILENAME_CHARS = re.compile(r'[^a-zA-Z0-9._-]')
_MAX_FILENAME_LENGTH = 100
_MAX_STEM_LENGTH = 95
_MAX_EXTENSION_LENGTH = 20
_DOTFILE_PREFIX = "file_"


def get_safe_filename(filename: str) -> str:
    """
    Generate safe filename for caching downloaded files.

    Directory components are dropped, every character outside
    ``[a-zA-Z0-9._-]`` becomes ``_``, and names that would be empty or start
    with a dot are prefixed with ``file_``. The result, prefix included, is
    never longer than 100 characters.

    Args:
        filename: Original filename

    Returns:
        str: Safe filename for filesystem storage
    """
    if not filename:
        return "unknown_file"

    # Remove path components, then replace unsafe characters
    safe_filename = _UNSAFE_FILENAME_CHARS.sub('_', os.path.basename(filename))

    # Decide on the prefix first so its length counts against the limit.
    needs_prefix = not safe_filename or safe_filename.startswith('.')
    prefix = _DOTFILE_PREFIX if needs_prefix else ""
    budget = _MAX_FILENAME_LENGTH - len(prefix)

    if len(safe_filename) > budget:
        stem, ext = os.path.splitext(safe_filename)
        # Cap the extension too; an oversized one would otherwise defeat
        # the limit because only the stem used to be truncated.
        ext = ext[:_MAX_EXTENSION_LENGTH]
        stem = stem[:min(_MAX_STEM_LENGTH, budget - len(ext))]
        safe_filename = stem + ext

    return prefix + safe_filename
# Built once at import time; every entry is lower-case and starts with a dot.
_LEGACY_EXTENSIONS = frozenset({
    # PC/DOS Era
    '.dbf', '.db', '.dbt',                    # dBASE
    '.wpd', '.wp', '.wp4', '.wp5', '.wp6',    # WordPerfect
    '.wk1', '.wk3', '.wk4', '.wks',           # Lotus 1-2-3
    '.wb1', '.wb2', '.wb3', '.qpw',           # Quattro Pro
    '.ws', '.wd',                             # WordStar
    '.sam',                                   # AmiPro
    '.wri',                                   # Write
    # Apple/Mac Era
    '.cwk', '.appleworks',                    # AppleWorks
    '.cws',                                   # ClarisWorks
    '.mac', '.mcw',                           # MacWrite
    '.wn',                                    # WriteNow
    '.hc', '.stack',                          # HyperCard
    '.pict', '.pic',                          # PICT
    '.pntg', '.drw',                          # MacPaint/MacDraw
    '.hqx',                                   # BinHex
    '.sit', '.sitx',                          # StuffIt
    '.rsrc',                                  # Resource fork
    '.scrapbook',                             # System 7 Scrapbook
    # Additional legacy formats
    '.vc',                                    # VisiCalc
    '.wrk', '.wr1',                           # Symphony
    # NOTE(review): this entry was an empty string, which matched every
    # extensionless path. Presumably the THINK C ".π" project suffix was
    # lost in transit -- confirm against the original source.
    '.proj', '.π',                            # Think C/Pascal
    '.fp3', '.fp5', '.fp7', '.fmp12',         # FileMaker
    '.px', '.mb',                             # Paradox
    '.fpt', '.cdx',                           # FoxPro
})


def is_legacy_extension(file_path: str) -> bool:
    """
    Check if file extension indicates a legacy format.

    The comparison is case-insensitive and looks only at the final suffix.
    Paths without any extension are never reported as legacy.

    Args:
        file_path: Path to check

    Returns:
        bool: True if extension suggests legacy format
    """
    return Path(file_path).suffix.lower() in _LEGACY_EXTENSIONS
def validate_processing_method(method: str) -> None:
    """
    Reject processing method names that are not recognised.

    Args:
        method: Processing method to validate

    Raises:
        ValidationError: If method is invalid
    """
    generic_methods = {'auto', 'primary', 'fallback'}
    # Format-specific methods, one group per line as in the original table
    dbase_methods = {'dbfread', 'simpledbf', 'pandas_dbf'}
    wordperfect_methods = {'libwpd', 'wpd_python', 'strings_extract'}
    lotus_methods = {'pylotus123', 'gnumeric', 'custom_wk_parser'}
    mac_methods = {'libcwk', 'resource_fork', 'mac_textutil'}
    hypercard_methods = {'hypercard_parser', 'hypertalk_extract'}

    accepted = (
        generic_methods
        | dbase_methods
        | wordperfect_methods
        | lotus_methods
        | mac_methods
        | hypercard_methods
    )

    if method in accepted:
        return
    raise ValidationError(f"Invalid processing method: {method}")
def get_file_info(file_path: str) -> dict:
    """
    Get basic file information for processing.

    This is best-effort: any failure is logged and reported through an
    ``"error"`` key in a placeholder result instead of being raised.

    Args:
        file_path: Path to analyze

    Returns:
        dict: File information including size, dates, extension. On failure
        the same keys carry placeholder values plus an ``"error"`` message.
    """
    try:
        path = Path(file_path)
        stat = path.stat()
        # st_ctime is the last metadata change on Unix, not the creation
        # time. Prefer st_birthtime where the platform exposes it and fall
        # back to st_ctime otherwise.
        created = getattr(stat, "st_birthtime", stat.st_ctime)
        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size": stat.st_size,
            "created": created,
            "modified": stat.st_mtime,
            "is_legacy_format": is_legacy_extension(file_path)
        }
    except Exception as e:  # deliberate catch-all: callers get a dict, never a raise
        logger.error("Failed to get file info", error=str(e), file_path=file_path)
        return {
            "filename": "unknown",
            "extension": "",
            "size": 0,
            "created": 0,
            "modified": 0,
            "is_legacy_format": False,
            "error": str(e)
        }