mcp-legacy-files/src/mcp_legacy_files/api.py

"""
Production-ready REST API for MCP Legacy Files.

Provides HTTP endpoints for vintage document processing alongside the MCP server.
Designed for enterprise integration and web service consumption.
"""

import asyncio
import os
import tempfile
import time
from datetime import datetime
from typing import Dict, List, Optional, Union
from pathlib import Path

from fastapi import FastAPI, HTTPException, UploadFile, File, BackgroundTasks, Depends
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from pydantic import BaseModel, Field
import uvicorn

# Optional imports
try:
    import structlog
    logger = structlog.get_logger(__name__)
except ImportError:
    import logging
    logger = logging.getLogger(__name__)

try:
    from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
    METRICS_AVAILABLE = True

    # Metrics
    REQUESTS_TOTAL = Counter('mcp_legacy_files_requests_total', 'Total requests', ['method', 'endpoint'])
    PROCESSING_TIME = Histogram('mcp_legacy_files_processing_seconds', 'Processing time')
    PROCESSING_SUCCESS = Counter('mcp_legacy_files_processing_success_total', 'Successful processing', ['format'])
    PROCESSING_ERRORS = Counter('mcp_legacy_files_processing_errors_total', 'Processing errors', ['format', 'error_type'])

except ImportError:
    METRICS_AVAILABLE = False

# Import our processors
from .processors.dbase import DBaseProcessor
from .processors.wordperfect import WordPerfectProcessor
from .processors.lotus123 import Lotus123Processor
from .processors.appleworks import AppleWorksProcessor
from .processors.hypercard import HyperCardProcessor
from .processors.autocad import AutoCADProcessor
from .processors.pagemaker import PageMakerProcessor
from .processors.generic_cadd import GenericCADDProcessor
from .core.detection import LegacyFormatDetector

# API Models
class ProcessingOptions(BaseModel):
    """Configuration options for document processing."""
    preserve_formatting: bool = Field(True, description="Preserve original document formatting")
    extract_metadata: bool = Field(True, description="Extract format-specific metadata")
    ai_enhancement: bool = Field(False, description="Apply AI-powered content analysis")
    method: str = Field("auto", description="Processing method (auto, primary, fallback)")
    timeout: int = Field(300, description="Processing timeout in seconds", ge=1, le=3600)

class ProcessingResult(BaseModel):
    """Result from document processing operation."""
    success: bool = Field(description="Whether processing succeeded")
    document_id: str = Field(description="Unique identifier for this processing operation")
    format_detected: str = Field(description="Detected vintage document format")
    confidence: float = Field(description="Detection confidence score (0-1)")
    method_used: str = Field(description="Processing method that succeeded")
    text_content: Optional[str] = Field(None, description="Extracted text content")
    structured_data: Optional[Dict] = Field(None, description="Structured data (for databases/spreadsheets)")
    metadata: Dict = Field(description="Format-specific metadata and processing information")
    processing_time: float = Field(description="Processing time in seconds")
    error_message: Optional[str] = Field(None, description="Error message if processing failed")
    warnings: List[str] = Field(default_factory=list, description="Processing warnings")

class BatchProcessingRequest(BaseModel):
    """Request for batch processing multiple documents."""
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
    webhook_url: Optional[str] = Field(None, description="Webhook URL for completion notification")
    batch_name: Optional[str] = Field(None, description="Name for this batch operation")

class BatchProcessingResponse(BaseModel):
    """Response for batch processing request."""
    batch_id: str = Field(description="Unique identifier for this batch")
    total_files: int = Field(description="Total number of files in batch")
    status: str = Field(description="Batch processing status")
    created_at: datetime = Field(description="Batch creation timestamp")
    estimated_completion: Optional[datetime] = Field(None, description="Estimated completion time")

class SupportedFormat(BaseModel):
    """Information about a supported vintage format."""
    format_name: str = Field(description="Human-readable format name")
    format_family: str = Field(description="Format family (dbase, wordperfect, etc.)")
    extensions: List[str] = Field(description="Supported file extensions")
    description: str = Field(description="Format description and historical context")
    confidence_level: str = Field(description="Processing confidence level")
    processing_methods: List[str] = Field(description="Available processing methods")
    typical_use_cases: List[str] = Field(description="Common use cases for this format")

class SystemHealth(BaseModel):
    """System health and status information."""
    status: str = Field(description="Overall system status")
    version: str = Field(description="MCP Legacy Files version")
    uptime_seconds: float = Field(description="System uptime in seconds")
    processors_available: Dict[str, bool] = Field(description="Processor availability status")
    system_resources: Dict[str, Union[str, float]] = Field(description="System resource usage")
    cache_stats: Optional[Dict] = Field(None, description="Cache performance statistics")

# Initialize FastAPI app
app = FastAPI(
    title="MCP Legacy Files API",
    description="Production-ready REST API for vintage document processing. Process documents from the 1980s-1990s business computing era.",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure appropriately for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.add_middleware(GZipMiddleware, minimum_size=1000)

# Global state
startup_time = time.time()
processors = {}
detector = None

@app.on_event("startup")
async def startup_event():
    """Initialize processors and system components."""
    global processors, detector

    logger.info("Starting MCP Legacy Files API server")

    try:
        # Initialize format detector
        detector = LegacyFormatDetector()

        # Initialize processors
        processors = {
            "dbase": DBaseProcessor(),
            "wordperfect": WordPerfectProcessor(),
            "lotus123": Lotus123Processor(),
            "appleworks": AppleWorksProcessor(),
            "hypercard": HyperCardProcessor(),
            "autocad": AutoCADProcessor(),
            "pagemaker": PageMakerProcessor(),
            "generic_cadd": GenericCADDProcessor()
        }

        logger.info("All processors initialized successfully",
                   processor_count=len(processors))

    except Exception as e:
        logger.error("Failed to initialize processors", error=str(e))
        raise

@app.on_event("shutdown")
async def shutdown_event():
    """Cleanup on server shutdown."""
    logger.info("Shutting down MCP Legacy Files API server")

# Health check endpoint
@app.get("/health", response_model=SystemHealth, tags=["System"])
async def health_check():
    """System health check and status information."""
    if METRICS_AVAILABLE:
        REQUESTS_TOTAL.labels(method="GET", endpoint="/health").inc()

    uptime = time.time() - startup_time

    # Check processor availability
    processor_status = {}
    for name, processor in processors.items():
        try:
            # Quick availability check
            processor_status[name] = hasattr(processor, 'process') and callable(processor.process)
        except:
            processor_status[name] = False

    # Basic resource info
    try:
        import psutil
        system_resources = {
            "cpu_percent": psutil.cpu_percent(interval=1),
            "memory_percent": psutil.virtual_memory().percent,
            "disk_usage_percent": psutil.disk_usage('/').percent
        }
    except ImportError:
        system_resources = {"note": "psutil not available for resource monitoring"}

    return SystemHealth(
        status="healthy" if all(processor_status.values()) else "degraded",
        version="1.0.0",
        uptime_seconds=uptime,
        processors_available=processor_status,
        system_resources=system_resources
    )

# Metrics endpoint (if Prometheus available)
if METRICS_AVAILABLE:
    @app.get("/metrics", tags=["System"])
    async def metrics():
        """Prometheus metrics endpoint."""
        return generate_latest()

# Format information endpoints
@app.get("/formats", response_model=List[SupportedFormat], tags=["Formats"])
async def get_supported_formats():
    """List all supported vintage document formats."""
    if METRICS_AVAILABLE:
        REQUESTS_TOTAL.labels(method="GET", endpoint="/formats").inc()

    formats = [
        SupportedFormat(
            format_name="dBASE Database",
            format_family="dbase",
            extensions=[".dbf", ".db", ".dbt"],
            description="dBASE III/IV business databases from 1980s PC era",
            confidence_level="High (99%)",
            processing_methods=["dbfread", "simpledbf", "pandas", "custom_parser"],
            typical_use_cases=["Customer databases", "Inventory systems", "Business records"]
        ),
        SupportedFormat(
            format_name="WordPerfect Document",
            format_family="wordperfect",
            extensions=[".wpd", ".wp", ".wp5", ".wp6"],
            description="WordPerfect 4.2-6.0 business documents and letters",
            confidence_level="High (95%)",
            processing_methods=["wpd2text", "wpd2html", "wpd2raw", "strings_extract"],
            typical_use_cases=["Business correspondence", "Legal documents", "Reports"]
        ),
        SupportedFormat(
            format_name="Lotus 1-2-3 Spreadsheet",
            format_family="lotus123",
            extensions=[".wk1", ".wk3", ".wk4", ".wks"],
            description="Lotus 1-2-3 financial spreadsheets and business models",
            confidence_level="High (90%)",
            processing_methods=["gnumeric_ssconvert", "libreoffice", "strings_extract"],
            typical_use_cases=["Financial models", "Budget forecasts", "Business analytics"]
        ),
        SupportedFormat(
            format_name="AppleWorks/ClarisWorks",
            format_family="appleworks",
            extensions=[".cwk", ".appleworks", ".cws"],
            description="Mac integrated productivity documents and presentations",
            confidence_level="High (95%)",
            processing_methods=["libreoffice", "textutil", "strings_extract"],
            typical_use_cases=["Presentations", "Project databases", "Mac business documents"]
        ),
        SupportedFormat(
            format_name="HyperCard Stack",
            format_family="hypercard",
            extensions=[".hc", ".stack"],
            description="Interactive multimedia stacks with HyperTalk scripting",
            confidence_level="High (90%)",
            processing_methods=["hypercard_parser", "strings_extract"],
            typical_use_cases=["Training systems", "Interactive presentations", "Educational content"]
        ),
        SupportedFormat(
            format_name="AutoCAD Drawing",
            format_family="autocad",
            extensions=[".dwg", ".dxf", ".dwt"],
            description="Technical drawings and CAD files from AutoCAD R10-R14",
            confidence_level="High (90%)",
            processing_methods=["teigha_converter", "librecad_extract", "dxf_conversion", "binary_analysis"],
            typical_use_cases=["Technical drawings", "Architectural plans", "Engineering schematics"]
        ),
        SupportedFormat(
            format_name="PageMaker Publication",
            format_family="pagemaker",
            extensions=[".pm1", ".pm2", ".pm3", ".pm4", ".pm5", ".pm6", ".pmd", ".pt4", ".pt5", ".pt6"],
            description="Desktop publishing documents from the DTP revolution (1985-1995)",
            confidence_level="High (90%)",
            processing_methods=["adobe_sdk_extract", "scribus_import", "text_extraction", "binary_analysis"],
            typical_use_cases=["Newsletters", "Brochures", "Annual reports", "Marketing materials"]
        ),
        SupportedFormat(
            format_name="Generic CADD Drawing",
            format_family="generic_cadd",
            extensions=[".vcl", ".vrd", ".fc", ".fcd", ".drx", ".dfx", ".cdl", ".prt", ".dc2", ".tcw", ".td2"],
            description="Vintage CAD formats from the CAD revolution era (VersaCAD, FastCAD, Drafix, CadKey, etc.)",
            confidence_level="High (90%)",
            processing_methods=["cad_conversion", "format_parser", "geometry_analysis", "binary_analysis"],
            typical_use_cases=["Technical drawings", "Architectural plans", "Engineering schematics", "Circuit layouts"]
        )
    ]

    return formats

@app.get("/formats/{format_family}", response_model=SupportedFormat, tags=["Formats"])
async def get_format_info(format_family: str):
    """Get detailed information about a specific format family."""
    if METRICS_AVAILABLE:
        REQUESTS_TOTAL.labels(method="GET", endpoint="/formats/{format_family}").inc()

    formats = await get_supported_formats()
    for format_info in formats:
        if format_info.format_family == format_family:
            return format_info

    raise HTTPException(status_code=404, detail=f"Format family '{format_family}' not supported")

# Document processing endpoints
@app.post("/process", response_model=ProcessingResult, tags=["Processing"])
async def process_document(
    file: UploadFile = File(...),
    options: ProcessingOptions = Depends()
):
    """Process a single vintage document."""
    if METRICS_AVAILABLE:
        REQUESTS_TOTAL.labels(method="POST", endpoint="/process").inc()
        start_time = time.time()

    document_id = f"doc_{int(time.time() * 1000000)}"

    try:
        # Save uploaded file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp_file:
            content = await file.read()
            tmp_file.write(content)
            tmp_file_path = tmp_file.name

        # Detect format
        format_info = await detector.detect_format(tmp_file_path)
        if not format_info:
            raise HTTPException(status_code=400, detail="Unable to detect vintage document format")

        # Get appropriate processor
        processor = processors.get(format_info.format_family)
        if not processor:
            raise HTTPException(status_code=400, detail=f"No processor available for format: {format_info.format_family}")

        # Process document
        result = await processor.process(
            tmp_file_path,
            method=options.method,
            preserve_formatting=options.preserve_formatting
        )

        if not result:
            raise HTTPException(status_code=500, detail="Processing failed - no result returned")

        # Build response
        processing_result = ProcessingResult(
            success=result.success,
            document_id=document_id,
            format_detected=format_info.format_family,
            confidence=format_info.confidence,
            method_used=result.method_used,
            text_content=result.text_content,
            structured_data=result.structured_content,
            metadata={
                "filename": file.filename,
                "file_size": len(content),
                "format_info": {
                    "format_family": format_info.format_family,
                    "format_name": format_info.format_name,
                    "confidence": format_info.confidence
                },
                "processing_metadata": result.format_specific_metadata or {}
            },
            processing_time=result.processing_time or 0,
            error_message=result.error_message,
            warnings=result.recovery_suggestions or []
        )

        # Update metrics
        if METRICS_AVAILABLE:
            processing_duration = time.time() - start_time
            PROCESSING_TIME.observe(processing_duration)

            if result.success:
                PROCESSING_SUCCESS.labels(format=format_info.format_family).inc()
            else:
                PROCESSING_ERRORS.labels(format=format_info.format_family, error_type="processing_failed").inc()

        return processing_result

    except HTTPException:
        raise
    except Exception as e:
        logger.error("Document processing failed", error=str(e), document_id=document_id)

        if METRICS_AVAILABLE:
            PROCESSING_ERRORS.labels(format="unknown", error_type="system_error").inc()

        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")

    finally:
        # Clean up temporary file
        try:
            if 'tmp_file_path' in locals():
                os.unlink(tmp_file_path)
        except:
            pass

@app.post("/process/batch", response_model=BatchProcessingResponse, tags=["Processing"])
async def process_batch(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    request: BatchProcessingRequest = Depends()
):
    """Process multiple documents in batch mode."""
    if METRICS_AVAILABLE:
        REQUESTS_TOTAL.labels(method="POST", endpoint="/process/batch").inc()

    batch_id = f"batch_{int(time.time() * 1000000)}"

    # For now, return basic batch info - full implementation would use background processing
    batch_response = BatchProcessingResponse(
        batch_id=batch_id,
        total_files=len(files),
        status="queued",
        created_at=datetime.now()
    )

    # Add background task for processing (simplified implementation)
    background_tasks.add_task(process_batch_background, batch_id, files, request)

    return batch_response

async def process_batch_background(batch_id: str, files: List[UploadFile], request: BatchProcessingRequest):
    """Background task for batch processing."""
    logger.info("Starting batch processing", batch_id=batch_id, file_count=len(files))

    # Implementation would process files and send webhook notification when complete
    # This is a simplified version for the demo

    await asyncio.sleep(1)  # Simulate processing
    logger.info("Batch processing completed", batch_id=batch_id)

if __name__ == "__main__":
    uvicorn.run(
        "mcp_legacy_files.api:app",
        host="0.0.0.0",
        port=8000,
        log_level="info",
        access_log=True
    )