"""
Core processing engine for legacy document formats.

Orchestrates multi-library fallback chains, AI enhancement, and provides
bulletproof processing for vintage documents.
"""
import asyncio
import os
import tempfile
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field

# Optional imports
try:
    import structlog
    logger = structlog.get_logger(__name__)
except ImportError:
    import logging

    class _StructlogShim:
        """Minimal structlog-compatible adapter over stdlib logging.

        Every call site in this module logs with keyword arguments
        (``logger.info("event", key=value)``); stdlib loggers reject
        arbitrary kwargs with TypeError, so this shim folds them into
        the message text instead.
        """

        def __init__(self, name: str):
            self._logger = logging.getLogger(name)

        @staticmethod
        def _render(event: str, kwargs: Dict[str, Any]) -> str:
            # Render "event key=value ..." in call order.
            if not kwargs:
                return event
            pairs = " ".join(f"{key}={value!r}" for key, value in kwargs.items())
            return f"{event} {pairs}"

        def debug(self, event, **kwargs):
            self._logger.debug(self._render(event, kwargs))

        def info(self, event, **kwargs):
            self._logger.info(self._render(event, kwargs))

        def warning(self, event, **kwargs):
            self._logger.warning(self._render(event, kwargs))

        def error(self, event, **kwargs):
            self._logger.error(self._render(event, kwargs))

    logger = _StructlogShim(__name__)

try:
    from .detection import FormatInfo
except ImportError:
    # Consistent with the other guarded imports below: allow the module to
    # load standalone (e.g. in tests) when the package layout is unavailable.
    @dataclass
    class FormatInfo:  # type: ignore[no-redef]
        format_name: str = "unknown"
        format_family: str = "unknown"

# Import processors dynamically to avoid circular imports
try:
    from ..processors.dbase import DBaseProcessor
    from ..processors.wordperfect import WordPerfectProcessor
    from ..processors.lotus123 import Lotus123Processor
    from ..processors.appleworks import AppleWorksProcessor
    from ..processors.hypercard import HyperCardProcessor
    from ..processors.autocad import AutoCADProcessor
    from ..processors.pagemaker import PageMakerProcessor
    from ..processors.generic_cadd import GenericCADDProcessor
except ImportError as e:
    logger.warning(f"Processor import failed: {e}")

    # Stub factory for processors that failed to import; a None "processor"
    # is later reported as "No processor available" by process_document.
    def _stub_processor():
        return None

    DBaseProcessor = _stub_processor
    WordPerfectProcessor = _stub_processor
    Lotus123Processor = _stub_processor
    AppleWorksProcessor = _stub_processor
    HyperCardProcessor = _stub_processor
    AutoCADProcessor = _stub_processor
    PageMakerProcessor = _stub_processor
    GenericCADDProcessor = _stub_processor

try:
    from ..ai.enhancement import AIEnhancementPipeline
except ImportError:
    class AIEnhancementPipeline:
        """No-op stand-in used when the AI enhancement package is absent."""

        def __init__(self):
            pass

        async def enhance_extraction(self, *args):
            return None

try:
    from ..utils.recovery import CorruptionRecoverySystem
except ImportError:
    class CorruptionRecoverySystem:
        """No-op stand-in used when the recovery package is absent."""

        def __init__(self):
            pass

        async def attempt_recovery(self, *args):
            return None


@dataclass
class ProcessingResult:
    """Comprehensive result from legacy document processing."""

    success: bool
    text_content: Optional[str] = None
    structured_content: Optional[Dict[str, Any]] = None
    method_used: str = "unknown"
    processing_time: float = 0.0
    fallback_attempts: int = 0
    success_rate: float = 0.0

    # Metadata
    creation_date: Optional[str] = None
    last_modified: Optional[str] = None
    format_specific_metadata: Dict[str, Any] = field(default_factory=dict)

    # AI Analysis
    ai_analysis: Optional[Dict[str, Any]] = None

    # Error handling
    error_message: Optional[str] = None
    recovery_suggestions: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Callers may still pass None explicitly; normalise to empty containers.
        if self.format_specific_metadata is None:
            self.format_specific_metadata = {}
        if self.recovery_suggestions is None:
            self.recovery_suggestions = []


@dataclass
class HealthAnalysis:
    """Comprehensive health analysis of vintage files."""

    overall_health: str  # "excellent", "good", "fair", "poor", "critical"
    health_score: float  # 0.0 - 10.0
    header_status: str
    structure_integrity: str
    corruption_level: float

    # Recovery assessment
    is_recoverable: bool
    recovery_confidence: float
    recommended_recovery_methods: List[str]
    expected_success_rate: float

    # Vintage characteristics
    estimated_age: Optional[str]
    creation_software: Optional[str]
    format_evolution: str
    authenticity_score: float

    # Recommendations
    processing_recommendations: List[str]
    preservation_priority: str  # "critical", "high", "medium", "low"

    def __post_init__(self):
        # Fields are required, but tolerate callers passing explicit None.
        if self.recommended_recovery_methods is None:
            self.recommended_recovery_methods = []
        if self.processing_recommendations is None:
            self.processing_recommendations = []


class ProcessingError(Exception):
    """Custom exception for processing errors."""
    pass


class ProcessingEngine:
    """
    Core processing engine that orchestrates legacy document processing
    through specialized processors with multi-library fallback chains.
    """

    def __init__(self):
        self.processors = self._initialize_processors()
        self.ai_pipeline = AIEnhancementPipeline()
        self.recovery_system = CorruptionRecoverySystem()

    def _initialize_processors(self) -> Dict[str, Any]:
        """Initialize all format-specific processors."""
        return {
            "dbase": DBaseProcessor(),
            "wordperfect": WordPerfectProcessor(),
            "lotus123": Lotus123Processor(),
            "appleworks": AppleWorksProcessor(),
            "hypercard": HyperCardProcessor(),
            "autocad": AutoCADProcessor(),
            "pagemaker": PageMakerProcessor(),
            "generic_cadd": GenericCADDProcessor(),
            # Additional processors will be added as implemented
        }

    async def process_document(
        self,
        file_path: str,
        format_info: FormatInfo,
        preserve_formatting: bool = True,
        method: str = "auto",
        enable_ai_enhancement: bool = True
    ) -> ProcessingResult:
        """
        Process legacy document with comprehensive error handling and fallbacks.

        Args:
            file_path: Path to the legacy document
            format_info: Detected format information
            preserve_formatting: Whether to preserve document structure
            method: Processing method ("auto", "primary", "fallback", or specific)
            enable_ai_enhancement: Whether to apply AI enhancement

        Returns:
            ProcessingResult: Comprehensive processing results
        """
        start_time = time.time()
        fallback_attempts = 0

        try:
            logger.info("Starting document processing",
                        format=format_info.format_name,
                        method=method)

            # Get appropriate processor
            processor = self._get_processor(format_info.format_family)
            if not processor:
                return ProcessingResult(
                    success=False,
                    error_message=f"No processor available for format: {format_info.format_family}",
                    processing_time=time.time() - start_time
                )

            # Attempt processing with fallback chain
            result = None
            processing_methods = self._get_processing_methods(processor, method)

            for attempt, process_method in enumerate(processing_methods):
                try:
                    logger.debug("Attempting processing method",
                                 method=process_method,
                                 attempt=attempt + 1)

                    result = await processor.process(
                        file_path=file_path,
                        method=process_method,
                        preserve_formatting=preserve_formatting
                    )

                    if result and result.success:
                        break

                    fallback_attempts += 1

                except Exception as e:
                    logger.warning("Processing method failed",
                                   method=process_method,
                                   error=str(e))
                    fallback_attempts += 1
                    continue

            # If all methods failed, try corruption recovery
            if not result or not result.success:
                logger.info("Attempting corruption recovery", file_path=file_path)
                result = await self._attempt_recovery(file_path, format_info)

            # Apply AI enhancement if enabled and processing succeeded
            if result and result.success and enable_ai_enhancement:
                try:
                    ai_analysis = await self.ai_pipeline.enhance_extraction(
                        result, format_info
                    )
                    result.ai_analysis = ai_analysis
                except Exception as e:
                    logger.warning("AI enhancement failed", error=str(e))

            # Calculate final metrics
            processing_time = time.time() - start_time
            if result is None:
                # Defensive: _attempt_recovery always returns a result, but
                # never dereference None if that contract changes.
                result = ProcessingResult(
                    success=False,
                    error_message="All processing methods failed"
                )
            result.processing_time = processing_time
            result.fallback_attempts = fallback_attempts
            result.success_rate = 1.0 if result.success else 0.0

            logger.info("Document processing completed",
                        success=result.success,
                        processing_time=processing_time,
                        fallback_attempts=fallback_attempts)

            return result

        except Exception as e:
            processing_time = time.time() - start_time
            logger.error("Document processing failed", error=str(e))

            return ProcessingResult(
                success=False,
                error_message=f"Processing failed: {str(e)}",
                processing_time=processing_time,
                fallback_attempts=fallback_attempts,
                recovery_suggestions=[
                    "Check file integrity and format",
                    "Try using method='fallback'",
                    "Verify file is not corrupted",
                    "Contact support if issue persists"
                ]
            )

    def _get_processor(self, format_family: str):
        """Get appropriate processor for format family."""
        return self.processors.get(format_family)

    def _get_processing_methods(self, processor, method: str) -> List[str]:
        """Get ordered list of processing methods to try."""
        chain = processor.get_processing_chain()
        if method == "auto":
            return chain
        elif method == "primary":
            return chain[:1]
        elif method == "fallback":
            return chain[1:]
        else:
            # Specific method requested: try it first, then the rest of the
            # chain without repeating the requested method.
            return [method] + [m for m in chain if m != method]

    async def _attempt_recovery(self, file_path: str, format_info: FormatInfo) -> ProcessingResult:
        """Attempt to recover data from corrupted vintage files."""
        try:
            logger.info("Attempting corruption recovery", file_path=file_path)

            recovery_result = await self.recovery_system.attempt_recovery(
                file_path, format_info
            )

            if recovery_result.success:
                return ProcessingResult(
                    success=True,
                    text_content=recovery_result.recovered_text,
                    method_used="corruption_recovery",
                    format_specific_metadata={"recovery_method": recovery_result.method_used}
                )
            else:
                return ProcessingResult(
                    success=False,
                    error_message="Recovery failed - file may be too damaged",
                    recovery_suggestions=[
                        "File appears to be severely corrupted",
                        "Try using specialized recovery software",
                        "Check if backup copies exist",
                        "Consider manual text extraction"
                    ]
                )

        except Exception as e:
            logger.error("Recovery attempt failed", error=str(e))
            return ProcessingResult(
                success=False,
                error_message=f"Recovery failed: {str(e)}"
            )

    async def analyze_file_health(
        self,
        file_path: str,
        format_info: FormatInfo,
        deep_analysis: bool = True
    ) -> HealthAnalysis:
        """
        Perform comprehensive health analysis of vintage document files.

        Args:
            file_path: Path to the file to analyze
            format_info: Detected format information
            deep_analysis: Whether to perform deep structural analysis

        Returns:
            HealthAnalysis: Comprehensive health assessment
        """
        try:
            logger.info("Starting health analysis", file_path=file_path, deep=deep_analysis)

            # Basic file analysis
            file_size = os.path.getsize(file_path)
            file_stat = os.stat(file_path)
            creation_time = datetime.fromtimestamp(file_stat.st_ctime)

            # Initialize health metrics
            health_score = 10.0
            issues = []

            # Check file accessibility
            if file_size == 0:
                health_score -= 8.0
                issues.append("File is empty")

            # Read file header for analysis
            try:
                with open(file_path, 'rb') as f:
                    header = f.read(min(1024, file_size))

                # Header integrity check
                header_status = await self._analyze_header_integrity(header, format_info)
                if header_status != "excellent":
                    health_score -= 2.0

            except Exception as e:
                health_score -= 5.0
                issues.append(f"Cannot read file header: {str(e)}")
                header_status = "critical"

            # Structure integrity analysis
            if deep_analysis:
                structure_status = await self._analyze_structure_integrity(file_path, format_info)
                if structure_status == "corrupted":
                    health_score -= 4.0
                elif structure_status == "damaged":
                    health_score -= 2.0
            else:
                structure_status = "not_analyzed"

            # Calculate overall health rating
            if health_score >= 9.0:
                overall_health = "excellent"
            elif health_score >= 7.0:
                overall_health = "good"
            elif health_score >= 5.0:
                overall_health = "fair"
            elif health_score >= 3.0:
                overall_health = "poor"
            else:
                overall_health = "critical"

            # Recovery assessment
            is_recoverable = health_score >= 2.0
            recovery_confidence = min(health_score / 10.0, 1.0) if is_recoverable else 0.0
            expected_success_rate = recovery_confidence * 100

            # Vintage characteristics
            estimated_age = self._estimate_file_age(creation_time, format_info)
            creation_software = self._identify_creation_software(format_info)
            authenticity_score = self._calculate_authenticity_score(
                creation_time, format_info, health_score
            )

            # Processing recommendations
            recommendations = self._generate_health_recommendations(
                overall_health, format_info, issues
            )

            # Preservation priority
            preservation_priority = self._assess_preservation_priority(
                authenticity_score, health_score, format_info
            )

            return HealthAnalysis(
                overall_health=overall_health,
                health_score=health_score,
                header_status=header_status,
                structure_integrity=structure_status,
                corruption_level=(10.0 - health_score) / 10.0,
                is_recoverable=is_recoverable,
                recovery_confidence=recovery_confidence,
                recommended_recovery_methods=self._get_recovery_methods(format_info, health_score),
                expected_success_rate=expected_success_rate,
                estimated_age=estimated_age,
                creation_software=creation_software,
                format_evolution=self._analyze_format_evolution(format_info),
                authenticity_score=authenticity_score,
                processing_recommendations=recommendations,
                preservation_priority=preservation_priority
            )

        except Exception as e:
            logger.error("Health analysis failed", error=str(e))
            return HealthAnalysis(
                overall_health="unknown",
                health_score=0.0,
                header_status="unknown",
                structure_integrity="unknown",
                corruption_level=1.0,
                is_recoverable=False,
                recovery_confidence=0.0,
                recommended_recovery_methods=[],
                expected_success_rate=0.0,
                estimated_age="unknown",
                creation_software="unknown",
                format_evolution="unknown",
                authenticity_score=0.0,
                processing_recommendations=["Health analysis failed - manual inspection required"],
                preservation_priority="unknown"
            )

    async def _analyze_header_integrity(self, header: bytes, format_info: FormatInfo) -> str:
        """Analyze file header integrity."""
        if not header:
            return "critical"

        # Format-specific header validation
        if format_info.format_family == "dbase":
            # dBASE files should start with version byte
            if len(header) > 0 and header[0] in [0x03, 0x04, 0x05, 0x30]:
                return "excellent"
            else:
                return "poor"

        elif format_info.format_family == "wordperfect":
            # WordPerfect files have specific magic signatures
            if header.startswith(b'\xFF\x57\x50'):
                return "excellent"
            else:
                return "damaged"

        # Generic analysis for other formats
        null_ratio = header.count(0) / len(header) if header else 1.0

        if null_ratio > 0.8:
            return "critical"
        elif null_ratio > 0.5:
            return "poor"
        else:
            return "good"

    async def _analyze_structure_integrity(self, file_path: str, format_info: FormatInfo) -> str:
        """Analyze file structure integrity."""
        try:
            # Get format-specific processor for deeper analysis
            processor = self._get_processor(format_info.format_family)
            if processor and hasattr(processor, 'analyze_structure'):
                return await processor.analyze_structure(file_path)

            # Generic structure analysis
            file_size = os.path.getsize(file_path)
            if file_size < 100:
                return "corrupted"

            with open(file_path, 'rb') as f:
                # Sample multiple points in file
                samples = []
                for i in range(0, min(file_size, 10000), 1000):
                    f.seek(i)
                    sample = f.read(100)
                    if sample:
                        samples.append(sample)

            # Analyze samples for corruption patterns
            total_null_bytes = sum(sample.count(0) for sample in samples)
            total_bytes = sum(len(sample) for sample in samples)

            if total_bytes == 0:
                return "corrupted"

            null_ratio = total_null_bytes / total_bytes
            if null_ratio > 0.9:
                return "corrupted"
            elif null_ratio > 0.7:
                return "damaged"
            else:
                return "intact"

        except Exception:
            return "unknown"

    def _estimate_file_age(self, creation_time: datetime, format_info: FormatInfo) -> str:
        """Estimate file age based on creation time and format."""
        current_year = datetime.now().year
        creation_year = creation_time.year
        age_years = current_year - creation_year

        if age_years > 40:
            return "1980s or earlier"
        elif age_years > 30:
            return "1990s"
        elif age_years > 20:
            return "2000s"
        elif age_years > 10:
            return "2010s"
        else:
            return "Recent (may not be authentic vintage)"

    def _identify_creation_software(self, format_info: FormatInfo) -> str:
        """Identify likely creation software based on format."""
        software_map = {
            "dbase": "dBASE III/IV/5 or FoxPro",
            "wordperfect": "WordPerfect 4.2-6.1",
            "lotus123": "Lotus 1-2-3 Release 2-4",
            "appleworks": "AppleWorks/ClarisWorks",
            "hypercard": "HyperCard 1.x-2.x"
        }
        return software_map.get(format_info.format_family, "Unknown vintage software")

    def _calculate_authenticity_score(
        self,
        creation_time: datetime,
        format_info: FormatInfo,
        health_score: float
    ) -> float:
        """Calculate vintage authenticity score."""
        base_score = format_info.vintage_score if hasattr(format_info, 'vintage_score') else 5.0

        # Age factor
        age_years = datetime.now().year - creation_time.year
        if age_years > 30:
            age_bonus = 2.0
        elif age_years > 20:
            age_bonus = 1.5
        elif age_years > 10:
            age_bonus = 1.0
        else:
            age_bonus = 0.0

        # Health factor (damaged files are often more authentic)
        if health_score < 7.0:
            health_bonus = 0.5  # Slight bonus for imperfect condition
        else:
            health_bonus = 0.0

        return min(base_score + age_bonus + health_bonus, 10.0)

    def _analyze_format_evolution(self, format_info: FormatInfo) -> str:
        """Analyze format evolution stage."""
        evolution_map = {
            "dbase": "Mature (stable format across versions)",
            "wordperfect": "Evolving (frequent format changes)",
            "lotus123": "Stable (consistent binary structure)",
            "appleworks": "Integrated (multi-format suite)",
            "hypercard": "Revolutionary (unique multimedia format)"
        }
        return evolution_map.get(format_info.format_family, "Unknown evolution pattern")

    def _generate_health_recommendations(
        self,
        overall_health: str,
        format_info: FormatInfo,
        issues: List[str]
    ) -> List[str]:
        """Generate processing recommendations based on health analysis."""
        recommendations = []

        if overall_health == "excellent":
            recommendations.append("File is in excellent condition - use primary processing methods")
        elif overall_health == "good":
            recommendations.append("File is in good condition - standard processing should work")
        elif overall_health == "fair":
            recommendations.extend([
                "File has minor issues - enable fallback processing",
                "Consider backup before processing"
            ])
        elif overall_health == "poor":
            recommendations.extend([
                "File has significant issues - use recovery methods",
                "Enable corruption recovery processing",
                "Backup original before any processing attempts"
            ])
        else:  # critical
            recommendations.extend([
                "File is severely damaged - recovery unlikely",
                "Try specialized recovery tools",
                "Consider professional data recovery services"
            ])

        # Format-specific recommendations
        format_recommendations = {
            "dbase": ["Check for associated memo files (.dbt)", "Verify record structure"],
            "wordperfect": ["Preserve formatting codes", "Check for password protection"],
            "lotus123": ["Verify worksheet structure", "Check for formula corruption"],
            "appleworks": ["Check for resource fork data", "Verify integrated document type"],
            "hypercard": ["Check stack structure", "Verify card navigation"]
        }

        recommendations.extend(format_recommendations.get(format_info.format_family, []))
        return recommendations

    def _assess_preservation_priority(
        self,
        authenticity_score: float,
        health_score: float,
        format_info: FormatInfo
    ) -> str:
        """Assess preservation priority for digital heritage."""
        # High authenticity + good health = high priority
        if authenticity_score >= 8.0 and health_score >= 7.0:
            return "high"
        # High authenticity + poor health = critical (urgent preservation needed)
        elif authenticity_score >= 8.0 and health_score < 5.0:
            return "critical"
        # Medium authenticity = medium priority
        elif authenticity_score >= 6.0:
            return "medium"
        else:
            return "low"

    def _get_recovery_methods(self, format_info: FormatInfo, health_score: float) -> List[str]:
        """Get recommended recovery methods based on format and health."""
        methods = []

        if health_score >= 7.0:
            methods.append("standard_processing")
        elif health_score >= 5.0:
            methods.extend(["fallback_processing", "partial_recovery"])
        elif health_score >= 3.0:
            methods.extend(["corruption_recovery", "binary_analysis", "string_extraction"])
        else:
            methods.extend(["emergency_recovery", "manual_analysis", "specialized_tools"])

        # Format-specific recovery methods
        format_methods = {
            "dbase": ["record_reconstruction", "header_repair"],
            "wordperfect": ["formatting_code_recovery", "text_extraction"],
            "lotus123": ["cell_data_recovery", "formula_reconstruction"],
            "appleworks": ["resource_fork_recovery", "data_fork_extraction"],
            "hypercard": ["stack_repair", "card_recovery"]
        }

        methods.extend(format_methods.get(format_info.format_family, []))
        return methods