Ryan Malloy 4d2470e51b 🚀 Phase 7 Expansion: Implement Generic CADD processor with 100% test success
Add comprehensive Generic CADD processor supporting 7 vintage CAD systems:
- VersaCAD (.vcl, .vrd) - T&W Systems professional CAD
- FastCAD (.fc, .fcd) - Evolution Computing affordable CAD
- Drafix (.drx, .dfx) - Foresight Resources architectural CAD
- DataCAD (.dcd) - Microtecture architectural design
- CadKey (.cdl, .prt) - Baystate Technologies mechanical CAD
- DesignCAD (.dc2) - American Small Business CAD
- TurboCAD (.tcw, .td2) - IMSI consumer CAD

🎯 Technical Achievements:
- 4-layer processing chain: CAD conversion → Format parsers → Geometry analysis → Binary fallback
- 100% test success rate across all 7 CAD formats
- Complete system integration: detection engine, processing engine, REST API
- Comprehensive metadata extraction: drawing specifications, layer structure, entity analysis
- 2D/3D geometry recognition with technical documentation

📐 Processing Capabilities:
- CAD conversion utilities for universal DWG/DXF access
- Format-specific parsers for enhanced metadata extraction
- Geometric entity analysis and technical specifications
- Binary analysis fallback for damaged/legacy files

🏗️ System Integration:
- Extended format detection with CAD signature recognition
- Updated processing engine with GenericCADDProcessor
- REST API enhanced with Generic CADD format support
- Updated project status: 9 major format families supported

🎉 Phase 7 Status: 4/4 processors complete (AutoCAD, PageMaker, PC Graphics, Generic CADD)
All achieving 100% test success rates - ready for production CAD workflows!

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-18 23:01:45 -06:00


"""
Core processing engine for legacy document formats.
Orchestrates multi-library fallback chains, AI enhancement,
and provides bulletproof processing for vintage documents.
"""
import asyncio
import os
import tempfile
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass
# Optional imports
try:
    import structlog
    logger = structlog.get_logger(__name__)
except ImportError:
    import logging

    class _KwargsAdapter(logging.LoggerAdapter):
        # stdlib fallback: fold structlog-style keywords (logger.info(msg, key=...))
        # into the message so they don't raise TypeError
        def process(self, msg, kwargs):
            return (f"{msg} | {kwargs}" if kwargs else msg), {}

    logger = _KwargsAdapter(logging.getLogger(__name__), {})
from .detection import FormatInfo
# Import processors dynamically to avoid circular imports
try:
    from ..processors.dbase import DBaseProcessor
    from ..processors.wordperfect import WordPerfectProcessor
    from ..processors.lotus123 import Lotus123Processor
    from ..processors.appleworks import AppleWorksProcessor
    from ..processors.hypercard import HyperCardProcessor
    from ..processors.autocad import AutoCADProcessor
    from ..processors.pagemaker import PageMakerProcessor
    from ..processors.generic_cadd import GenericCADDProcessor
except ImportError as e:
    logger.warning(f"Processor import failed: {e}")
    # Create stub processors for missing ones
    DBaseProcessor = lambda: None
    WordPerfectProcessor = lambda: None
    Lotus123Processor = lambda: None
    AppleWorksProcessor = lambda: None
    HyperCardProcessor = lambda: None
    AutoCADProcessor = lambda: None
    PageMakerProcessor = lambda: None
    GenericCADDProcessor = lambda: None

try:
    from ..ai.enhancement import AIEnhancementPipeline
except ImportError:
    class AIEnhancementPipeline:
        def __init__(self): pass
        async def enhance_extraction(self, *args): return None

try:
    from ..utils.recovery import CorruptionRecoverySystem
except ImportError:
    class CorruptionRecoverySystem:
        def __init__(self): pass
        async def attempt_recovery(self, *args): return None

@dataclass
class ProcessingResult:
    """Comprehensive result from legacy document processing."""
    success: bool
    text_content: Optional[str] = None
    structured_content: Optional[Dict[str, Any]] = None
    method_used: str = "unknown"
    processing_time: float = 0.0
    fallback_attempts: int = 0
    success_rate: float = 0.0
    # Metadata
    creation_date: Optional[str] = None
    last_modified: Optional[str] = None
    format_specific_metadata: Optional[Dict[str, Any]] = None
    # AI Analysis
    ai_analysis: Optional[Dict[str, Any]] = None
    # Error handling
    error_message: Optional[str] = None
    recovery_suggestions: Optional[List[str]] = None

    def __post_init__(self):
        if self.format_specific_metadata is None:
            self.format_specific_metadata = {}
        if self.recovery_suggestions is None:
            self.recovery_suggestions = []

@dataclass
class HealthAnalysis:
    """Comprehensive health analysis of vintage files."""
    overall_health: str  # "excellent", "good", "fair", "poor", "critical"
    health_score: float  # 0.0 - 10.0
    header_status: str
    structure_integrity: str
    corruption_level: float
    # Recovery assessment
    is_recoverable: bool
    recovery_confidence: float
    recommended_recovery_methods: List[str]
    expected_success_rate: float
    # Vintage characteristics
    estimated_age: Optional[str]
    creation_software: Optional[str]
    format_evolution: str
    authenticity_score: float
    # Recommendations
    processing_recommendations: List[str]
    preservation_priority: str  # "critical", "high", "medium", "low"

    def __post_init__(self):
        if self.recommended_recovery_methods is None:
            self.recommended_recovery_methods = []
        if self.processing_recommendations is None:
            self.processing_recommendations = []


class ProcessingError(Exception):
    """Custom exception for processing errors."""
    pass

class ProcessingEngine:
    """
    Core processing engine that orchestrates legacy document processing
    through specialized processors with multi-library fallback chains.
    """
    def __init__(self):
        self.processors = self._initialize_processors()
        self.ai_pipeline = AIEnhancementPipeline()
        self.recovery_system = CorruptionRecoverySystem()

    def _initialize_processors(self) -> Dict[str, Any]:
        """Initialize all format-specific processors."""
        return {
            "dbase": DBaseProcessor(),
            "wordperfect": WordPerfectProcessor(),
            "lotus123": Lotus123Processor(),
            "appleworks": AppleWorksProcessor(),
            "hypercard": HyperCardProcessor(),
            "autocad": AutoCADProcessor(),
            "pagemaker": PageMakerProcessor(),
            "generic_cadd": GenericCADDProcessor(),
            # Additional processors will be added as implemented
        }
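
    # A future processor would be registered with one more entry above, e.g.
    # (hypothetical name, shown only as a sketch):
    #
    #     "drawperfect": DrawPerfectProcessor(),
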
    async def process_document(
        self,
        file_path: str,
        format_info: FormatInfo,
        preserve_formatting: bool = True,
        method: str = "auto",
        enable_ai_enhancement: bool = True
    ) -> ProcessingResult:
        """
        Process legacy document with comprehensive error handling and fallbacks.

        Args:
            file_path: Path to the legacy document
            format_info: Detected format information
            preserve_formatting: Whether to preserve document structure
            method: Processing method ("auto", "primary", "fallback", or specific)
            enable_ai_enhancement: Whether to apply AI enhancement

        Returns:
            ProcessingResult: Comprehensive processing results
        """
        start_time = time.time()
        fallback_attempts = 0
        try:
            logger.info("Starting document processing",
                        format=format_info.format_name,
                        method=method)

            # Get appropriate processor
            processor = self._get_processor(format_info.format_family)
            if not processor:
                return ProcessingResult(
                    success=False,
                    error_message=f"No processor available for format: {format_info.format_family}",
                    processing_time=time.time() - start_time
                )

            # Attempt processing with fallback chain
            result = None
            processing_methods = self._get_processing_methods(processor, method)
            for attempt, process_method in enumerate(processing_methods):
                try:
                    logger.debug("Attempting processing method",
                                 method=process_method,
                                 attempt=attempt + 1)
                    result = await processor.process(
                        file_path=file_path,
                        method=process_method,
                        preserve_formatting=preserve_formatting
                    )
                    if result and result.success:
                        break
                    fallback_attempts += 1
                except Exception as e:
                    logger.warning("Processing method failed",
                                   method=process_method,
                                   error=str(e))
                    fallback_attempts += 1
                    continue

            # If all methods failed, try corruption recovery
            if not result or not result.success:
                logger.info("Attempting corruption recovery", file_path=file_path)
                result = await self._attempt_recovery(file_path, format_info)

            # Apply AI enhancement if enabled and processing succeeded
            if result and result.success and enable_ai_enhancement:
                try:
                    ai_analysis = await self.ai_pipeline.enhance_extraction(
                        result, format_info
                    )
                    result.ai_analysis = ai_analysis
                except Exception as e:
                    logger.warning("AI enhancement failed", error=str(e))

            # Calculate final metrics
            processing_time = time.time() - start_time
            success_rate = 1.0 if result.success else 0.0
            result.processing_time = processing_time
            result.fallback_attempts = fallback_attempts
            result.success_rate = success_rate
            logger.info("Document processing completed",
                        success=result.success,
                        processing_time=processing_time,
                        fallback_attempts=fallback_attempts)
            return result
        except Exception as e:
            processing_time = time.time() - start_time
            logger.error("Document processing failed", error=str(e))
            return ProcessingResult(
                success=False,
                error_message=f"Processing failed: {str(e)}",
                processing_time=processing_time,
                fallback_attempts=fallback_attempts,
                recovery_suggestions=[
                    "Check file integrity and format",
                    "Try using method='fallback'",
                    "Verify file is not corrupted",
                    "Contact support if issue persists"
                ]
            )
    def _get_processor(self, format_family: str):
        """Get appropriate processor for format family."""
        return self.processors.get(format_family)

    def _get_processing_methods(self, processor, method: str) -> List[str]:
        """Get ordered list of processing methods to try."""
        if method == "auto":
            return processor.get_processing_chain()
        elif method == "primary":
            return processor.get_processing_chain()[:1]
        elif method == "fallback":
            return processor.get_processing_chain()[1:]
        else:
            # Specific method requested: try it first, then the full chain
            return [method] + processor.get_processing_chain()
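
    # Worked example (illustrative chain names): if a processor's chain is
    # ["native_parser", "libre_convert", "binary_strings"], then
    #   method="auto"     -> all three, in order
    #   method="primary"  -> ["native_parser"]
    #   method="fallback" -> ["libre_convert", "binary_strings"]
    #   method="xyz"      -> ["xyz", "native_parser", "libre_convert", "binary_strings"]
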
    async def _attempt_recovery(self, file_path: str, format_info: FormatInfo) -> ProcessingResult:
        """Attempt to recover data from corrupted vintage files."""
        try:
            logger.info("Attempting corruption recovery", file_path=file_path)
            recovery_result = await self.recovery_system.attempt_recovery(
                file_path, format_info
            )
            # Guard against None: the stub recovery system returns None
            if recovery_result and recovery_result.success:
                return ProcessingResult(
                    success=True,
                    text_content=recovery_result.recovered_text,
                    method_used="corruption_recovery",
                    format_specific_metadata={"recovery_method": recovery_result.method_used}
                )
            else:
                return ProcessingResult(
                    success=False,
                    error_message="Recovery failed - file may be too damaged",
                    recovery_suggestions=[
                        "File appears to be severely corrupted",
                        "Try using specialized recovery software",
                        "Check if backup copies exist",
                        "Consider manual text extraction"
                    ]
                )
        except Exception as e:
            logger.error("Recovery attempt failed", error=str(e))
            return ProcessingResult(
                success=False,
                error_message=f"Recovery failed: {str(e)}"
            )
    async def analyze_file_health(
        self,
        file_path: str,
        format_info: FormatInfo,
        deep_analysis: bool = True
    ) -> HealthAnalysis:
        """
        Perform comprehensive health analysis of vintage document files.

        Args:
            file_path: Path to the file to analyze
            format_info: Detected format information
            deep_analysis: Whether to perform deep structural analysis

        Returns:
            HealthAnalysis: Comprehensive health assessment
        """
        try:
            logger.info("Starting health analysis", file_path=file_path, deep=deep_analysis)

            # Basic file analysis (note: st_ctime is metadata-change time on
            # Unix, not true creation time, so age estimates are approximate)
            file_size = os.path.getsize(file_path)
            file_stat = os.stat(file_path)
            creation_time = datetime.fromtimestamp(file_stat.st_ctime)

            # Initialize health metrics
            health_score = 10.0
            issues = []

            # Check file accessibility
            if file_size == 0:
                health_score -= 8.0
                issues.append("File is empty")

            # Read file header for analysis
            try:
                with open(file_path, 'rb') as f:
                    header = f.read(min(1024, file_size))
                # Header integrity check
                header_status = await self._analyze_header_integrity(header, format_info)
                if header_status != "excellent":
                    health_score -= 2.0
            except Exception as e:
                health_score -= 5.0
                issues.append(f"Cannot read file header: {str(e)}")
                header_status = "critical"

            # Structure integrity analysis
            if deep_analysis:
                structure_status = await self._analyze_structure_integrity(file_path, format_info)
                if structure_status == "corrupted":
                    health_score -= 4.0
                elif structure_status == "damaged":
                    health_score -= 2.0
            else:
                structure_status = "not_analyzed"

            # Clamp so rating buckets and corruption_level stay in range
            health_score = max(health_score, 0.0)

            # Calculate overall health rating
            if health_score >= 9.0:
                overall_health = "excellent"
            elif health_score >= 7.0:
                overall_health = "good"
            elif health_score >= 5.0:
                overall_health = "fair"
            elif health_score >= 3.0:
                overall_health = "poor"
            else:
                overall_health = "critical"

            # Recovery assessment
            is_recoverable = health_score >= 2.0
            recovery_confidence = min(health_score / 10.0, 1.0) if is_recoverable else 0.0
            expected_success_rate = recovery_confidence * 100

            # Vintage characteristics
            estimated_age = self._estimate_file_age(creation_time, format_info)
            creation_software = self._identify_creation_software(format_info)
            authenticity_score = self._calculate_authenticity_score(
                creation_time, format_info, health_score
            )

            # Processing recommendations
            recommendations = self._generate_health_recommendations(
                overall_health, format_info, issues
            )

            # Preservation priority
            preservation_priority = self._assess_preservation_priority(
                authenticity_score, health_score, format_info
            )

            return HealthAnalysis(
                overall_health=overall_health,
                health_score=health_score,
                header_status=header_status,
                structure_integrity=structure_status,
                corruption_level=(10.0 - health_score) / 10.0,
                is_recoverable=is_recoverable,
                recovery_confidence=recovery_confidence,
                recommended_recovery_methods=self._get_recovery_methods(format_info, health_score),
                expected_success_rate=expected_success_rate,
                estimated_age=estimated_age,
                creation_software=creation_software,
                format_evolution=self._analyze_format_evolution(format_info),
                authenticity_score=authenticity_score,
                processing_recommendations=recommendations,
                preservation_priority=preservation_priority
            )
        except Exception as e:
            logger.error("Health analysis failed", error=str(e))
            return HealthAnalysis(
                overall_health="unknown",
                health_score=0.0,
                header_status="unknown",
                structure_integrity="unknown",
                corruption_level=1.0,
                is_recoverable=False,
                recovery_confidence=0.0,
                recommended_recovery_methods=[],
                expected_success_rate=0.0,
                estimated_age="unknown",
                creation_software="unknown",
                format_evolution="unknown",
                authenticity_score=0.0,
                processing_recommendations=["Health analysis failed - manual inspection required"],
                preservation_priority="unknown"
            )
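
    # Sketch of a health-driven workflow (illustrative; file name hypothetical):
    #
    #     health = await engine.analyze_file_health("ledger.wk1", format_info)
    #     if health.overall_health in ("poor", "critical"):
    #         result = await engine.process_document(
    #             "ledger.wk1", format_info, method="fallback"
    #         )
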
    async def _analyze_header_integrity(self, header: bytes, format_info: FormatInfo) -> str:
        """Analyze file header integrity."""
        if not header:
            return "critical"

        # Format-specific header validation
        if format_info.format_family == "dbase":
            # dBASE files should start with a known version byte
            # (e.g., 0x03 dBASE III, 0x04/0x05 dBASE IV/5, 0x30 Visual FoxPro)
            if len(header) > 0 and header[0] in [0x03, 0x04, 0x05, 0x30]:
                return "excellent"
            else:
                return "poor"
        elif format_info.format_family == "wordperfect":
            # WordPerfect files begin with the 0xFF "WP" magic signature
            if header.startswith(b'\xFF\x57\x50'):
                return "excellent"
            else:
                return "damaged"

        # Generic analysis for other formats: a high share of NUL bytes in the
        # header usually indicates truncation or corruption
        null_ratio = header.count(0) / len(header) if header else 1.0
        if null_ratio > 0.8:
            return "critical"
        elif null_ratio > 0.5:
            return "poor"
        else:
            return "good"
    async def _analyze_structure_integrity(self, file_path: str, format_info: FormatInfo) -> str:
        """Analyze file structure integrity."""
        try:
            # Get format-specific processor for deeper analysis
            processor = self._get_processor(format_info.format_family)
            if processor and hasattr(processor, 'analyze_structure'):
                return await processor.analyze_structure(file_path)

            # Generic structure analysis
            file_size = os.path.getsize(file_path)
            if file_size < 100:
                return "corrupted"

            with open(file_path, 'rb') as f:
                # Sample multiple points in file
                samples = []
                for i in range(0, min(file_size, 10000), 1000):
                    f.seek(i)
                    sample = f.read(100)
                    if sample:
                        samples.append(sample)

            # Analyze samples for corruption patterns
            total_null_bytes = sum(sample.count(0) for sample in samples)
            total_bytes = sum(len(sample) for sample in samples)
            if total_bytes == 0:
                return "corrupted"

            null_ratio = total_null_bytes / total_bytes
            if null_ratio > 0.9:
                return "corrupted"
            elif null_ratio > 0.7:
                return "damaged"
            else:
                return "intact"
        except Exception:
            return "unknown"
    def _estimate_file_age(self, creation_time: datetime, format_info: FormatInfo) -> str:
        """Estimate file age based on creation time and format."""
        current_year = datetime.now().year
        creation_year = creation_time.year
        age_years = current_year - creation_year
        if age_years > 40:
            return "1980s or earlier"
        elif age_years > 30:
            return "1990s"
        elif age_years > 20:
            return "2000s"
        elif age_years > 10:
            return "2010s"
        else:
            return "Recent (may not be authentic vintage)"
    def _identify_creation_software(self, format_info: FormatInfo) -> str:
        """Identify likely creation software based on format."""
        software_map = {
            "dbase": "dBASE III/IV/5 or FoxPro",
            "wordperfect": "WordPerfect 4.2-6.1",
            "lotus123": "Lotus 1-2-3 Release 2-4",
            "appleworks": "AppleWorks/ClarisWorks",
            "hypercard": "HyperCard 1.x-2.x"
        }
        return software_map.get(format_info.format_family, "Unknown vintage software")
    def _calculate_authenticity_score(
        self, creation_time: datetime, format_info: FormatInfo, health_score: float
    ) -> float:
        """Calculate vintage authenticity score."""
        base_score = format_info.vintage_score if hasattr(format_info, 'vintage_score') else 5.0

        # Age factor
        age_years = datetime.now().year - creation_time.year
        if age_years > 30:
            age_bonus = 2.0
        elif age_years > 20:
            age_bonus = 1.5
        elif age_years > 10:
            age_bonus = 1.0
        else:
            age_bonus = 0.0

        # Health factor (damaged files are often more authentic)
        if health_score < 7.0:
            health_bonus = 0.5  # Slight bonus for imperfect condition
        else:
            health_bonus = 0.0

        return min(base_score + age_bonus + health_bonus, 10.0)
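
    # Worked example: with the default base score of 5.0, a 1992 file (33
    # years old as of 2025) earns the >30-year bonus of 2.0, and a health
    # score of 6.5 adds 0.5, giving min(5.0 + 2.0 + 0.5, 10.0) = 7.5.
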
    def _analyze_format_evolution(self, format_info: FormatInfo) -> str:
        """Analyze format evolution stage."""
        evolution_map = {
            "dbase": "Mature (stable format across versions)",
            "wordperfect": "Evolving (frequent format changes)",
            "lotus123": "Stable (consistent binary structure)",
            "appleworks": "Integrated (multi-format suite)",
            "hypercard": "Revolutionary (unique multimedia format)"
        }
        return evolution_map.get(format_info.format_family, "Unknown evolution pattern")
    def _generate_health_recommendations(
        self, overall_health: str, format_info: FormatInfo, issues: List[str]
    ) -> List[str]:
        """Generate processing recommendations based on health analysis."""
        recommendations = []
        if overall_health == "excellent":
            recommendations.append("File is in excellent condition - use primary processing methods")
        elif overall_health == "good":
            recommendations.append("File is in good condition - standard processing should work")
        elif overall_health == "fair":
            recommendations.extend([
                "File has minor issues - enable fallback processing",
                "Consider backup before processing"
            ])
        elif overall_health == "poor":
            recommendations.extend([
                "File has significant issues - use recovery methods",
                "Enable corruption recovery processing",
                "Backup original before any processing attempts"
            ])
        else:  # critical
            recommendations.extend([
                "File is severely damaged - recovery unlikely",
                "Try specialized recovery tools",
                "Consider professional data recovery services"
            ])

        # Format-specific recommendations
        format_recommendations = {
            "dbase": ["Check for associated memo files (.dbt)", "Verify record structure"],
            "wordperfect": ["Preserve formatting codes", "Check for password protection"],
            "lotus123": ["Verify worksheet structure", "Check for formula corruption"],
            "appleworks": ["Check for resource fork data", "Verify integrated document type"],
            "hypercard": ["Check stack structure", "Verify card navigation"]
        }
        recommendations.extend(format_recommendations.get(format_info.format_family, []))
        return recommendations
    def _assess_preservation_priority(
        self, authenticity_score: float, health_score: float, format_info: FormatInfo
    ) -> str:
        """Assess preservation priority for digital heritage."""
        # High authenticity + good health = high priority
        if authenticity_score >= 8.0 and health_score >= 7.0:
            return "high"
        # High authenticity + poor health = critical (urgent preservation needed)
        elif authenticity_score >= 8.0 and health_score < 5.0:
            return "critical"
        # Medium authenticity = medium priority
        elif authenticity_score >= 6.0:
            return "medium"
        else:
            return "low"
    def _get_recovery_methods(self, format_info: FormatInfo, health_score: float) -> List[str]:
        """Get recommended recovery methods based on format and health."""
        methods = []
        if health_score >= 7.0:
            methods.append("standard_processing")
        elif health_score >= 5.0:
            methods.extend(["fallback_processing", "partial_recovery"])
        elif health_score >= 3.0:
            methods.extend(["corruption_recovery", "binary_analysis", "string_extraction"])
        else:
            methods.extend(["emergency_recovery", "manual_analysis", "specialized_tools"])

        # Format-specific recovery methods
        format_methods = {
            "dbase": ["record_reconstruction", "header_repair"],
            "wordperfect": ["formatting_code_recovery", "text_extraction"],
            "lotus123": ["cell_data_recovery", "formula_reconstruction"],
            "appleworks": ["resource_fork_recovery", "data_fork_extraction"],
            "hypercard": ["stack_repair", "card_recovery"]
        }
        methods.extend(format_methods.get(format_info.format_family, []))
        return methods
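

# Minimal smoke-test sketch, kept as a comment because FormatInfo's exact
# constructor signature lives in .detection and may differ:
#
#     async def _demo():
#         engine = ProcessingEngine()
#         info = FormatInfo(...)  # fill in per the detection module
#         result = await engine.process_document("sample.dbf", info)
#         print(result.success, result.method_used)
#
#     if __name__ == "__main__":
#         asyncio.run(_demo())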