""" Production-ready REST API for MCP Legacy Files. Provides HTTP endpoints for vintage document processing alongside the MCP server. Designed for enterprise integration and web service consumption. """ import asyncio import os import tempfile import time from datetime import datetime from typing import Dict, List, Optional, Union from pathlib import Path from fastapi import FastAPI, HTTPException, UploadFile, File, BackgroundTasks, Depends from fastapi.responses import JSONResponse from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.gzip import GZipMiddleware from pydantic import BaseModel, Field import uvicorn # Optional imports try: import structlog logger = structlog.get_logger(__name__) except ImportError: import logging logger = logging.getLogger(__name__) try: from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST METRICS_AVAILABLE = True # Metrics REQUESTS_TOTAL = Counter('mcp_legacy_files_requests_total', 'Total requests', ['method', 'endpoint']) PROCESSING_TIME = Histogram('mcp_legacy_files_processing_seconds', 'Processing time') PROCESSING_SUCCESS = Counter('mcp_legacy_files_processing_success_total', 'Successful processing', ['format']) PROCESSING_ERRORS = Counter('mcp_legacy_files_processing_errors_total', 'Processing errors', ['format', 'error_type']) except ImportError: METRICS_AVAILABLE = False # Import our processors from .processors.dbase import DBaseProcessor from .processors.wordperfect import WordPerfectProcessor from .processors.lotus123 import Lotus123Processor from .processors.appleworks import AppleWorksProcessor from .processors.hypercard import HyperCardProcessor from .processors.autocad import AutoCADProcessor from .processors.pagemaker import PageMakerProcessor from .processors.generic_cadd import GenericCADDProcessor from .core.detection import LegacyFormatDetector # API Models class ProcessingOptions(BaseModel): """Configuration options for document processing.""" preserve_formatting: bool = Field(True, description="Preserve original document formatting") extract_metadata: bool = Field(True, description="Extract format-specific metadata") ai_enhancement: bool = Field(False, description="Apply AI-powered content analysis") method: str = Field("auto", description="Processing method (auto, primary, fallback)") timeout: int = Field(300, description="Processing timeout in seconds", ge=1, le=3600) class ProcessingResult(BaseModel): """Result from document processing operation.""" success: bool = Field(description="Whether processing succeeded") document_id: str = Field(description="Unique identifier for this processing operation") format_detected: str = Field(description="Detected vintage document format") confidence: float = Field(description="Detection confidence score (0-1)") method_used: str = Field(description="Processing method that succeeded") text_content: Optional[str] = Field(None, description="Extracted text content") structured_data: Optional[Dict] = Field(None, description="Structured data (for databases/spreadsheets)") metadata: Dict = Field(description="Format-specific metadata and processing information") processing_time: float = Field(description="Processing time in seconds") error_message: Optional[str] = Field(None, description="Error message if processing failed") warnings: List[str] = Field(default_factory=list, description="Processing warnings") class BatchProcessingRequest(BaseModel): """Request for batch processing multiple documents.""" options: ProcessingOptions = Field(default_factory=ProcessingOptions) webhook_url: Optional[str] = Field(None, description="Webhook URL for completion notification") batch_name: Optional[str] = Field(None, description="Name for this batch operation") class BatchProcessingResponse(BaseModel): """Response for batch processing request.""" batch_id: str = Field(description="Unique identifier for this batch") total_files: int = Field(description="Total number of files in batch") status: str = Field(description="Batch processing status") created_at: datetime = Field(description="Batch creation timestamp") estimated_completion: Optional[datetime] = Field(None, description="Estimated completion time") class SupportedFormat(BaseModel): """Information about a supported vintage format.""" format_name: str = Field(description="Human-readable format name") format_family: str = Field(description="Format family (dbase, wordperfect, etc.)") extensions: List[str] = Field(description="Supported file extensions") description: str = Field(description="Format description and historical context") confidence_level: str = Field(description="Processing confidence level") processing_methods: List[str] = Field(description="Available processing methods") typical_use_cases: List[str] = Field(description="Common use cases for this format") class SystemHealth(BaseModel): """System health and status information.""" status: str = Field(description="Overall system status") version: str = Field(description="MCP Legacy Files version") uptime_seconds: float = Field(description="System uptime in seconds") processors_available: Dict[str, bool] = Field(description="Processor availability status") system_resources: Dict[str, Union[str, float]] = Field(description="System resource usage") cache_stats: Optional[Dict] = Field(None, description="Cache performance statistics") # Initialize FastAPI app app = FastAPI( title="MCP Legacy Files API", description="Production-ready REST API for vintage document processing. Process documents from the 1980s-1990s business computing era.", version="1.0.0", docs_url="/docs", redoc_url="/redoc" ) # Middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], # Configure appropriately for production allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) app.add_middleware(GZipMiddleware, minimum_size=1000) # Global state startup_time = time.time() processors = {} detector = None @app.on_event("startup") async def startup_event(): """Initialize processors and system components.""" global processors, detector logger.info("Starting MCP Legacy Files API server") try: # Initialize format detector detector = LegacyFormatDetector() # Initialize processors processors = { "dbase": DBaseProcessor(), "wordperfect": WordPerfectProcessor(), "lotus123": Lotus123Processor(), "appleworks": AppleWorksProcessor(), "hypercard": HyperCardProcessor(), "autocad": AutoCADProcessor(), "pagemaker": PageMakerProcessor(), "generic_cadd": GenericCADDProcessor() } logger.info("All processors initialized successfully", processor_count=len(processors)) except Exception as e: logger.error("Failed to initialize processors", error=str(e)) raise @app.on_event("shutdown") async def shutdown_event(): """Cleanup on server shutdown.""" logger.info("Shutting down MCP Legacy Files API server") # Health check endpoint @app.get("/health", response_model=SystemHealth, tags=["System"]) async def health_check(): """System health check and status information.""" if METRICS_AVAILABLE: REQUESTS_TOTAL.labels(method="GET", endpoint="/health").inc() uptime = time.time() - startup_time # Check processor availability processor_status = {} for name, processor in processors.items(): try: # Quick availability check processor_status[name] = hasattr(processor, 'process') and callable(processor.process) except: processor_status[name] = False # Basic resource info try: import psutil system_resources = { "cpu_percent": psutil.cpu_percent(interval=1), "memory_percent": psutil.virtual_memory().percent, "disk_usage_percent": psutil.disk_usage('/').percent } except ImportError: system_resources = {"note": "psutil not available for resource monitoring"} return SystemHealth( status="healthy" if all(processor_status.values()) else "degraded", version="1.0.0", uptime_seconds=uptime, processors_available=processor_status, system_resources=system_resources ) # Metrics endpoint (if Prometheus available) if METRICS_AVAILABLE: @app.get("/metrics", tags=["System"]) async def metrics(): """Prometheus metrics endpoint.""" return generate_latest() # Format information endpoints @app.get("/formats", response_model=List[SupportedFormat], tags=["Formats"]) async def get_supported_formats(): """List all supported vintage document formats.""" if METRICS_AVAILABLE: REQUESTS_TOTAL.labels(method="GET", endpoint="/formats").inc() formats = [ SupportedFormat( format_name="dBASE Database", format_family="dbase", extensions=[".dbf", ".db", ".dbt"], description="dBASE III/IV business databases from 1980s PC era", confidence_level="High (99%)", processing_methods=["dbfread", "simpledbf", "pandas", "custom_parser"], typical_use_cases=["Customer databases", "Inventory systems", "Business records"] ), SupportedFormat( format_name="WordPerfect Document", format_family="wordperfect", extensions=[".wpd", ".wp", ".wp5", ".wp6"], description="WordPerfect 4.2-6.0 business documents and letters", confidence_level="High (95%)", processing_methods=["wpd2text", "wpd2html", "wpd2raw", "strings_extract"], typical_use_cases=["Business correspondence", "Legal documents", "Reports"] ), SupportedFormat( format_name="Lotus 1-2-3 Spreadsheet", format_family="lotus123", extensions=[".wk1", ".wk3", ".wk4", ".wks"], description="Lotus 1-2-3 financial spreadsheets and business models", confidence_level="High (90%)", processing_methods=["gnumeric_ssconvert", "libreoffice", "strings_extract"], typical_use_cases=["Financial models", "Budget forecasts", "Business analytics"] ), SupportedFormat( format_name="AppleWorks/ClarisWorks", format_family="appleworks", extensions=[".cwk", ".appleworks", ".cws"], description="Mac integrated productivity documents and presentations", confidence_level="High (95%)", processing_methods=["libreoffice", "textutil", "strings_extract"], typical_use_cases=["Presentations", "Project databases", "Mac business documents"] ), SupportedFormat( format_name="HyperCard Stack", format_family="hypercard", extensions=[".hc", ".stack"], description="Interactive multimedia stacks with HyperTalk scripting", confidence_level="High (90%)", processing_methods=["hypercard_parser", "strings_extract"], typical_use_cases=["Training systems", "Interactive presentations", "Educational content"] ), SupportedFormat( format_name="AutoCAD Drawing", format_family="autocad", extensions=[".dwg", ".dxf", ".dwt"], description="Technical drawings and CAD files from AutoCAD R10-R14", confidence_level="High (90%)", processing_methods=["teigha_converter", "librecad_extract", "dxf_conversion", "binary_analysis"], typical_use_cases=["Technical drawings", "Architectural plans", "Engineering schematics"] ), SupportedFormat( format_name="PageMaker Publication", format_family="pagemaker", extensions=[".pm1", ".pm2", ".pm3", ".pm4", ".pm5", ".pm6", ".pmd", ".pt4", ".pt5", ".pt6"], description="Desktop publishing documents from the DTP revolution (1985-1995)", confidence_level="High (90%)", processing_methods=["adobe_sdk_extract", "scribus_import", "text_extraction", "binary_analysis"], typical_use_cases=["Newsletters", "Brochures", "Annual reports", "Marketing materials"] ), SupportedFormat( format_name="Generic CADD Drawing", format_family="generic_cadd", extensions=[".vcl", ".vrd", ".fc", ".fcd", ".drx", ".dfx", ".cdl", ".prt", ".dc2", ".tcw", ".td2"], description="Vintage CAD formats from the CAD revolution era (VersaCAD, FastCAD, Drafix, CadKey, etc.)", confidence_level="High (90%)", processing_methods=["cad_conversion", "format_parser", "geometry_analysis", "binary_analysis"], typical_use_cases=["Technical drawings", "Architectural plans", "Engineering schematics", "Circuit layouts"] ) ] return formats @app.get("/formats/{format_family}", response_model=SupportedFormat, tags=["Formats"]) async def get_format_info(format_family: str): """Get detailed information about a specific format family.""" if METRICS_AVAILABLE: REQUESTS_TOTAL.labels(method="GET", endpoint="/formats/{format_family}").inc() formats = await get_supported_formats() for format_info in formats: if format_info.format_family == format_family: return format_info raise HTTPException(status_code=404, detail=f"Format family '{format_family}' not supported") # Document processing endpoints @app.post("/process", response_model=ProcessingResult, tags=["Processing"]) async def process_document( file: UploadFile = File(...), options: ProcessingOptions = Depends() ): """Process a single vintage document.""" if METRICS_AVAILABLE: REQUESTS_TOTAL.labels(method="POST", endpoint="/process").inc() start_time = time.time() document_id = f"doc_{int(time.time() * 1000000)}" try: # Save uploaded file temporarily with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp_file: content = await file.read() tmp_file.write(content) tmp_file_path = tmp_file.name # Detect format format_info = await detector.detect_format(tmp_file_path) if not format_info: raise HTTPException(status_code=400, detail="Unable to detect vintage document format") # Get appropriate processor processor = processors.get(format_info.format_family) if not processor: raise HTTPException(status_code=400, detail=f"No processor available for format: {format_info.format_family}") # Process document result = await processor.process( tmp_file_path, method=options.method, preserve_formatting=options.preserve_formatting ) if not result: raise HTTPException(status_code=500, detail="Processing failed - no result returned") # Build response processing_result = ProcessingResult( success=result.success, document_id=document_id, format_detected=format_info.format_family, confidence=format_info.confidence, method_used=result.method_used, text_content=result.text_content, structured_data=result.structured_content, metadata={ "filename": file.filename, "file_size": len(content), "format_info": { "format_family": format_info.format_family, "format_name": format_info.format_name, "confidence": format_info.confidence }, "processing_metadata": result.format_specific_metadata or {} }, processing_time=result.processing_time or 0, error_message=result.error_message, warnings=result.recovery_suggestions or [] ) # Update metrics if METRICS_AVAILABLE: processing_duration = time.time() - start_time PROCESSING_TIME.observe(processing_duration) if result.success: PROCESSING_SUCCESS.labels(format=format_info.format_family).inc() else: PROCESSING_ERRORS.labels(format=format_info.format_family, error_type="processing_failed").inc() return processing_result except HTTPException: raise except Exception as e: logger.error("Document processing failed", error=str(e), document_id=document_id) if METRICS_AVAILABLE: PROCESSING_ERRORS.labels(format="unknown", error_type="system_error").inc() raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}") finally: # Clean up temporary file try: if 'tmp_file_path' in locals(): os.unlink(tmp_file_path) except: pass @app.post("/process/batch", response_model=BatchProcessingResponse, tags=["Processing"]) async def process_batch( background_tasks: BackgroundTasks, files: List[UploadFile] = File(...), request: BatchProcessingRequest = Depends() ): """Process multiple documents in batch mode.""" if METRICS_AVAILABLE: REQUESTS_TOTAL.labels(method="POST", endpoint="/process/batch").inc() batch_id = f"batch_{int(time.time() * 1000000)}" # For now, return basic batch info - full implementation would use background processing batch_response = BatchProcessingResponse( batch_id=batch_id, total_files=len(files), status="queued", created_at=datetime.now() ) # Add background task for processing (simplified implementation) background_tasks.add_task(process_batch_background, batch_id, files, request) return batch_response async def process_batch_background(batch_id: str, files: List[UploadFile], request: BatchProcessingRequest): """Background task for batch processing.""" logger.info("Starting batch processing", batch_id=batch_id, file_count=len(files)) # Implementation would process files and send webhook notification when complete # This is a simplified version for the demo await asyncio.sleep(1) # Simulate processing logger.info("Batch processing completed", batch_id=batch_id) if __name__ == "__main__": uvicorn.run( "mcp_legacy_files.api:app", host="0.0.0.0", port=8000, log_level="info", access_log=True )