# 🏗️ MCP Legacy Files - Technical Architecture
## 🎯 Core Architecture Principles

### 🧠 Intelligence-First Design
- **Smart Format Detection** - Multi-layer analysis beyond file extensions
- **Adaptive Processing** - Learn from failures to improve extraction
- **Content-Aware Recovery** - Reconstruct data from partial corruption
- **AI Enhancement Pipeline** - Transform raw extracts into structured intelligence
### ⚡ Performance-Optimized
- **Async-First Processing** - Non-blocking I/O for high throughput
- **Intelligent Caching** - Smart memoization of expensive operations
- **Parallel Processing** - Multi-document batch processing
- **Resource Management** - Memory-efficient handling of large archives
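A minimal sketch of how the async-first and bounded-parallelism principles combine in code (the `extract_text` helper is a hypothetical stand-in for any real format processor):

```python
import asyncio

async def extract_text(path: str) -> str:
    # Hypothetical processor: run blocking file I/O in a worker thread
    # so the event loop stays responsive.
    return await asyncio.to_thread(
        lambda: open(path, "rb").read().decode("latin-1", "ignore")
    )

async def process_batch(paths: list[str], max_concurrent: int = 10) -> list[str]:
    semaphore = asyncio.Semaphore(max_concurrent)  # bound parallelism

    async def bounded(path: str) -> str:
        async with semaphore:
            return await extract_text(path)

    return await asyncio.gather(*(bounded(p) for p in paths))
```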
## 📊 System Overview

```mermaid
graph TD
    A[Legacy Document Input] --> B{Format Detection Engine}
    B --> C[Binary Analysis]
    B --> D[Extension Mapping]
    B --> E[Magic Byte Detection]
    C --> F[Processing Chain Selection]
    D --> F
    E --> F
    F --> G{Primary Extraction}
    G -->|Success| H[AI Enhancement Pipeline]
    G -->|Failure| I[Fallback Chain]
    I --> J[Secondary Method]
    J -->|Success| H
    J -->|Failure| K[Tertiary Method]
    K -->|Success| H
    K -->|Failure| L[Emergency Binary Analysis]
    L --> H
    H --> M[Structured Output]
    M --> N[Claude Desktop/MCP Client]
```
## 🔧 Core Components

### 1. Format Detection Engine
```python
# src/mcp_legacy_files/detection/format_detector.py

class LegacyFormatDetector:
    """
    Multi-layer format detection combining magic bytes, extension
    mappings, structural heuristics, and ML classification.
    """

    def __init__(self):
        self.magic_signatures = load_magic_database()
        self.extension_mappings = load_extension_database()
        self.heuristic_analyzers = load_content_analyzers()

    async def detect_format(self, file_path: str) -> FormatInfo:
        """
        Comprehensive format detection pipeline.
        """
        # Layer 1: Magic byte analysis (highest confidence)
        magic_result = await self.analyze_magic_bytes(file_path)

        # Layer 2: Extension analysis with version detection
        extension_result = await self.analyze_extension(file_path)

        # Layer 3: Content structure heuristics
        structure_result = await self.analyze_structure(file_path)

        # Layer 4: ML-based format classification
        ml_result = await self.ml_classify_format(file_path)

        # Confidence-weighted decision
        return self.weighted_format_decision(
            magic_result, extension_result,
            structure_result, ml_result
        )


# Format signature database
LEGACY_SIGNATURES = {
    # WordPerfect signatures across versions
    "wordperfect": {
        "wp6": b"\xFF\x57\x50\x43",   # WP 6.0+
        "wp5": b"\xFF\x57\x50\x44",   # WP 5.0-5.1
        "wp4": b"\xFF\x57\x50\x42",   # WP 4.2
    },
    # Lotus 1-2-3 signatures
    "lotus123": {
        "wk1": b"\x00\x00\x02\x00\x06\x04\x06\x00",
        "wk3": b"\x00\x00\x1A\x00\x02\x04\x04\x00",
        "wks": b"\xFF\x00\x02\x00\x04\x04\x05\x00",
    },
    # dBASE family signatures (single-byte version markers, so the
    # extension and structure layers must corroborate the match)
    "dbase": {
        "dbf3": b"\x03",    # dBASE III
        "dbf4": b"\x04",    # dBASE IV
        "dbf5": b"\x05",    # dBASE 5
        "foxpro": b"\x30",  # FoxPro
    },
    # Apple formats
    "appleworks": {
        "cwk": b"BOBO\x00\x00",  # AppleWorks/ClarisWorks
        "appleworks": b"AWDB",   # AppleWorks Database
    }
}
```
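To make the signature table concrete, a minimal matcher over it might look like the following sketch (the `aiofiles` dependency and the 16-byte read are assumptions; note that the single-byte dBASE markers match too easily on their own, which is exactly why the detector corroborates with the extension and structure layers):

```python
import aiofiles  # async file I/O; assumed available as a dependency

async def match_magic_bytes(file_path: str) -> tuple[str, str] | None:
    """Return (format_family, variant) for the first matching signature."""
    async with aiofiles.open(file_path, "rb") as f:
        header = await f.read(16)  # longest signature above is 8 bytes

    for family, variants in LEGACY_SIGNATURES.items():
        for variant, signature in variants.items():
            if header.startswith(signature):
                return family, variant
    return None
```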
### 2. Processing Chain Manager
```python
# src/mcp_legacy_files/processing/chain_manager.py
from typing import List

class ProcessingChainManager:
    """
    Manages fallback chains for robust extraction.
    """

    def __init__(self):
        self.chains = self.build_processing_chains()
        self.success_rates = load_success_statistics()

    def get_processing_chain(self, format_info: FormatInfo) -> List[ProcessingMethod]:
        """
        Return an optimized processing chain based on format and success rates.
        """
        # Copy so reordering never mutates the shared chain definition
        base_chain = list(self.chains[format_info.format_family])

        # Reorder based on success rates for this specific format variant
        if format_info.variant in self.success_rates:
            stats = self.success_rates[format_info.variant]
            base_chain.sort(key=lambda method: stats.get(method.name, 0), reverse=True)

        return base_chain


# Processing chain definitions
PROCESSING_CHAINS = {
    "wordperfect": [
        ProcessingMethod("libwpd", priority=1, confidence=0.95),
        ProcessingMethod("wpd_python", priority=2, confidence=0.80),
        ProcessingMethod("strings_extract", priority=3, confidence=0.60),
        ProcessingMethod("binary_analysis", priority=4, confidence=0.30),
    ],
    "lotus123": [
        ProcessingMethod("pylotus123", priority=1, confidence=0.90),
        ProcessingMethod("gnumeric_ssconvert", priority=2, confidence=0.85),
        ProcessingMethod("custom_wk1_parser", priority=3, confidence=0.70),
        ProcessingMethod("binary_cell_extract", priority=4, confidence=0.40),
    ],
    "dbase": [
        ProcessingMethod("dbfread", priority=1, confidence=0.98),
        ProcessingMethod("simpledbf", priority=2, confidence=0.95),
        ProcessingMethod("pandas_dbf", priority=3, confidence=0.90),
        ProcessingMethod("xbase_parser", priority=4, confidence=0.75),
    ],
    "appleworks": [
        ProcessingMethod("libcwk", priority=1, confidence=0.85),
        ProcessingMethod("resource_fork_parser", priority=2, confidence=0.70),
        ProcessingMethod("mac_textutil", priority=3, confidence=0.60),
        ProcessingMethod("binary_strings", priority=4, confidence=0.40),
    ]
}
```
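At runtime the chain is simply walked in order until a method succeeds. A sketch, where `METHOD_IMPLS` (a name-to-callable registry) and `ExtractionFailedError` are hypothetical names:

```python
async def run_chain(file_path: str, chain: list[ProcessingMethod]) -> ProcessingResult:
    last_error = None
    for method in chain:
        try:
            result = await METHOD_IMPLS[method.name](file_path)  # hypothetical registry
            if result.success:
                result.successful_method = method.name
                return result
        except Exception as exc:
            last_error = exc  # fall through to the next method in the chain
    raise ExtractionFailedError(file_path) from last_error  # hypothetical error type
```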
### 3. AI Enhancement Pipeline
```python
# src/mcp_legacy_files/enhancement/ai_pipeline.py

class AIEnhancementPipeline:
    """
    Transform raw legacy extracts into AI-ready structured data.
    """

    def __init__(self):
        self.content_classifier = load_content_classifier()
        self.structure_analyzer = load_structure_analyzer()
        self.quality_assessor = load_quality_assessor()

    async def enhance_extraction(self, raw_extract: RawExtract) -> EnhancedDocument:
        """
        Multi-stage AI enhancement of legacy document extracts.
        """
        # Stage 1: Content Classification
        classification = await self.classify_content(raw_extract)

        # Stage 2: Structure Recovery
        structure = await self.recover_structure(raw_extract, classification)

        # Stage 3: Data Quality Assessment
        quality = await self.assess_quality(raw_extract, structure)

        # Stage 4: Content Enhancement
        enhanced_content = await self.enhance_content(
            raw_extract, structure, quality
        )

        # Stage 5: Metadata Enrichment
        metadata = await self.enrich_metadata(
            raw_extract, classification, quality
        )

        return EnhancedDocument(
            original=raw_extract,
            classification=classification,
            structure=structure,
            quality=quality,
            enhanced_content=enhanced_content,
            metadata=metadata
        )


# AI models for content processing
AI_MODELS = {
    "content_classifier": {
        "model": "distilbert-base-uncased-finetuned-legacy-docs",
        "labels": ["business_letter", "financial_report", "database_record",
                   "research_paper", "technical_manual", "presentation"]
    },
    "structure_analyzer": {
        "model": "layoutlm-base-uncased",
        "tasks": ["paragraph_detection", "table_recovery", "heading_hierarchy"]
    },
    "quality_assessor": {
        "model": "roberta-base-finetuned-corruption-detection",
        "metrics": ["extraction_completeness", "text_coherence", "formatting_integrity"]
    }
}
```
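Putting the three components together, end-to-end processing of a single document reads roughly like this (a sketch; the instance names are assumed):

```python
async def process_document(file_path: str) -> EnhancedDocument:
    format_info = await detector.detect_format(file_path)       # LegacyFormatDetector
    chain = chain_manager.get_processing_chain(format_info)     # ProcessingChainManager
    raw_extract = await run_chain(file_path, chain)             # fallback walk, sketched above
    return await pipeline.enhance_extraction(raw_extract)      # AIEnhancementPipeline
```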
## 📚 Format-Specific Processing Modules

### 🖥️ PC/DOS Legacy Processors

#### WordPerfect Processor
```python
# src/mcp_legacy_files/processors/wordperfect.py

class WordPerfectProcessor:
    """
    Comprehensive WordPerfect document processing.
    """

    async def process_wpd(self, file_path: str, version: str) -> ProcessingResult:
        """
        Process WordPerfect documents with version-specific handling.
        """
        if version.startswith("wp6"):
            return await self._process_wp6_plus(file_path)
        elif version.startswith("wp5"):
            return await self._process_wp5(file_path)
        elif version.startswith("wp4"):
            return await self._process_wp4(file_path)
        else:
            return await self._process_generic(file_path)

    async def _process_wp6_plus(self, file_path: str) -> ProcessingResult:
        """WP 6.0+ processing with full formatting support."""
        try:
            # Primary: libwpd via Python bindings
            return await self._libwpd_extract(file_path)
        except Exception:
            # Fallback: custom WP parser
            return await self._custom_wp_parser(file_path)
```
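Where Python bindings for libwpd are unavailable, shelling out to its command-line tools is a pragmatic fallback. A sketch of the CLI variant of `_libwpd_extract`, assuming `wpd2text` from libwpd-tools is on the PATH:

```python
import asyncio

async def wpd2text_extract(file_path: str) -> str:
    """Extract text via the libwpd `wpd2text` CLI (assumed installed)."""
    proc = await asyncio.create_subprocess_exec(
        "wpd2text", file_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"wpd2text failed: {stderr.decode(errors='replace')}")
    return stdout.decode("utf-8", errors="replace")
```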
#### Lotus 1-2-3 Processor
```python
# src/mcp_legacy_files/processors/lotus123.py

class Lotus123Processor:
    """
    Lotus 1-2-3 spreadsheet processing with formula support.
    """

    async def process_lotus(self, file_path: str, format_type: str) -> ProcessingResult:
        """
        Process Lotus files with format-specific optimizations.
        """
        # Load Lotus-specific cell format definitions
        cell_formats = self.load_lotus_formats(format_type)

        if format_type == "wk1":
            return await self._process_wk1(file_path, cell_formats)
        elif format_type == "wk3":
            return await self._process_wk3(file_path, cell_formats)
        elif format_type == "wks":
            return await self._process_wks(file_path, cell_formats)
        else:
            raise ValueError(f"Unsupported Lotus format: {format_type}")

    async def _process_wk1(self, file_path: str, formats: dict) -> ProcessingResult:
        """WK1 format processing with formula reconstruction."""
        # Parse binary WK1 structure
        workbook = await self.parse_wk1_binary(file_path)

        # Reconstruct formulas from binary representation
        formulas = await self.reconstruct_formulas(workbook.formula_cells)

        # Extract cell data with formatting
        cell_data = await self.extract_formatted_cells(workbook, formats)

        return ProcessingResult(
            text_content=self.render_as_text(cell_data),
            structured_data=cell_data,
            formulas=formulas,
            metadata=workbook.metadata
        )
```
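For reference, `parse_wk1_binary` ultimately reduces to walking WK1's `[opcode][length][payload]` record stream. A simplified sketch; the opcodes and payload layouts follow the commonly published WK1 format notes and are worth verifying against real files:

```python
import struct

WK1_EOF = 0x0001
WK1_NUMBER = 0x000E   # payload: [format:1][col:2][row:2][IEEE double:8]
WK1_LABEL = 0x000F    # payload: [format:1][col:2][row:2][prefix:1][text...\x00]

def iter_wk1_records(data: bytes):
    """Walk the [opcode:2][length:2][payload] record stream (little-endian)."""
    offset = 0
    while offset + 4 <= len(data):
        opcode, length = struct.unpack_from("<HH", data, offset)
        yield opcode, data[offset + 4 : offset + 4 + length]
        if opcode == WK1_EOF:
            break
        offset += 4 + length

def extract_wk1_cells(data: bytes) -> dict:
    cells = {}
    for opcode, payload in iter_wk1_records(data):
        if opcode == WK1_NUMBER:
            _fmt, col, row = struct.unpack_from("<BHH", payload)
            cells[(row, col)] = struct.unpack_from("<d", payload, 5)[0]
        elif opcode == WK1_LABEL:
            _fmt, col, row = struct.unpack_from("<BHH", payload)
            # payload[5] is the alignment prefix (' ^ "); NUL-terminated text follows
            cells[(row, col)] = payload[6:].split(b"\x00")[0].decode("cp437", "replace")
    return cells
```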
### 🍎 Apple/Mac Legacy Processors

#### AppleWorks Processor
```python
# src/mcp_legacy_files/processors/appleworks.py
import os
from typing import Optional

class AppleWorksProcessor:
    """
    AppleWorks/ClarisWorks document processing with resource fork support.
    """

    async def process_appleworks(self, file_path: str) -> ProcessingResult:
        """
        Process AppleWorks documents with Mac-specific handling.
        """
        # Check for HFS+ resource fork
        resource_fork = await self.extract_resource_fork(file_path)

        if resource_fork:
            # Process with full Mac metadata
            return await self._process_with_resources(file_path, resource_fork)
        else:
            # Process data fork only (cross-platform file)
            return await self._process_data_fork(file_path)

    async def extract_resource_fork(self, file_path: str) -> Optional[ResourceFork]:
        """Extract Mac resource fork if present."""
        # Check for an AppleDouble sidecar ("._" prefixed onto the same basename)
        directory, basename = os.path.split(file_path)
        appledouble_path = os.path.join(directory, f"._{basename}")
        if os.path.exists(appledouble_path):
            return await self.parse_appledouble(appledouble_path)

        # Check for resource fork in extended attributes (macOS)
        if hasattr(os, 'getxattr'):
            try:
                return await self.parse_xattr_resource(file_path)
            except OSError:
                pass

        return None
```
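`parse_appledouble` boils down to checking the AppleDouble magic number and scanning the entry table for the resource-fork entry (entry ID 2, per RFC 1740). A minimal sketch:

```python
import struct

APPLEDOUBLE_MAGIC = 0x00051607   # per RFC 1740
RESOURCE_FORK_ENTRY_ID = 2

def read_appledouble_resource_fork(path: str) -> bytes | None:
    """Return the resource-fork bytes from a '._' AppleDouble sidecar, if present."""
    with open(path, "rb") as f:
        header = f.read(26)  # magic(4) + version(4) + filler(16) + entry count(2)
        if len(header) < 26:
            return None
        magic, _version = struct.unpack_from(">II", header)
        if magic != APPLEDOUBLE_MAGIC:
            return None
        (entry_count,) = struct.unpack_from(">H", header, 24)
        for _ in range(entry_count):
            entry_id, offset, length = struct.unpack(">III", f.read(12))
            if entry_id == RESOURCE_FORK_ENTRY_ID:
                f.seek(offset)
                return f.read(length)
    return None
```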
#### HyperCard Processor
```python
# src/mcp_legacy_files/processors/hypercard.py

class HyperCardProcessor:
    """
    HyperCard stack processing with HyperTalk script extraction.
    """

    async def process_hypercard(self, file_path: str) -> ProcessingResult:
        """
        Process HyperCard stacks with multimedia content extraction.
        """
        # Parse HyperCard stack structure
        stack = await self.parse_hypercard_stack(file_path)

        # Extract cards and backgrounds
        cards = await self.extract_cards(stack)
        backgrounds = await self.extract_backgrounds(stack)

        # Extract HyperTalk scripts
        scripts = await self.extract_hypertalk_scripts(stack)

        # Extract multimedia elements
        sounds = await self.extract_sounds(stack)
        graphics = await self.extract_graphics(stack)

        return ProcessingResult(
            text_content=self.render_stack_as_text(cards, scripts),
            structured_data={
                "cards": cards,
                "backgrounds": backgrounds,
                "scripts": scripts,
                "sounds": sounds,
                "graphics": graphics
            },
            multimedia={"sounds": sounds, "graphics": graphics},
            metadata=stack.metadata
        )
```
## 🔄 Caching & Performance Layer

### Smart Caching System
```python
# src/mcp_legacy_files/caching/smart_cache.py
from typing import Any, Callable

import diskcache

class SmartCache:
    """
    Intelligent caching for expensive legacy processing operations.
    """

    def __init__(self):
        self.memory_cache = {}
        self.disk_cache = diskcache.Cache('/tmp/mcp_legacy_cache')
        self.cache_stats = CacheStatistics()

    async def get_or_process(self, file_path: str, processor_func: Callable) -> Any:
        """
        Intelligent cache retrieval with invalidation logic.
        """
        # Generate cache key from file content hash + processor version
        cache_key = await self.generate_cache_key(file_path, processor_func)

        # Check memory cache first (fastest)
        if cache_key in self.memory_cache:
            self.cache_stats.record_hit('memory')
            return self.memory_cache[cache_key]

        # Check disk cache
        if cache_key in self.disk_cache:
            result = self.disk_cache[cache_key]
            # Promote to memory cache
            self.memory_cache[cache_key] = result
            self.cache_stats.record_hit('disk')
            return result

        # Cache miss - process and store
        result = await processor_func(file_path)

        # Store in both caches with appropriate TTL
        await self.store_result(cache_key, result, file_path)
        self.cache_stats.record_miss()

        return result
```
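The cache key deliberately covers file content plus processor identity, so both an edited file and an upgraded processor miss the cache. A sketch of the core of `generate_cache_key` (the versioning scheme is an assumption):

```python
import hashlib

PROCESSOR_VERSION = "1.0.0"  # assumed scheme; bump to invalidate stale results

def make_cache_key(file_path: str, processor_name: str) -> str:
    """Key on content hash + processor identity, so edits and upgrades both invalidate."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        for block in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB blocks
            digest.update(block)
    digest.update(f"{processor_name}:{PROCESSOR_VERSION}".encode())
    return digest.hexdigest()
```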
### Batch Processing Engine
```python
# src/mcp_legacy_files/batch/batch_processor.py
import asyncio
import time
from typing import List

class BatchProcessor:
    """
    High-performance batch processing for enterprise archives.
    """

    def __init__(self, max_concurrent=10):
        self.max_concurrent = max_concurrent
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.progress_tracker = ProgressTracker()

    async def process_archive(self, archive_path: str) -> BatchResult:
        """
        Process an entire archive of legacy documents.
        """
        start_time = time.time()  # fixed: was referenced below without being set

        # Discover all processable files
        file_list = await self.discover_legacy_files(archive_path)

        # Group by format for optimized processing
        grouped_files = self.group_by_format(file_list)

        # Process each format group with specialized handlers
        results = []
        for format_type, files in grouped_files.items():
            format_results = await self.process_format_batch(format_type, files)
            results.extend(format_results)

        return BatchResult(
            total_files=len(file_list),
            processed_files=len(results),
            success_rate=len([r for r in results if r.success]) / max(len(results), 1),
            results=results,
            processing_time=time.time() - start_time
        )

    async def process_format_batch(self, format_type: str, files: List[str]) -> List[ProcessingResult]:
        """
        Process a batch of same-format files using an optimized pipeline.
        """
        # Create format-specific processor
        processor = ProcessorFactory.create(format_type)

        # Process files concurrently with rate limiting
        async def process_single(file_path):
            async with self.semaphore:
                return await processor.process(file_path)

        tasks = [process_single(file_path) for file_path in files]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        return [r for r in results if not isinstance(r, Exception)]
```
## 🛡️ Error Recovery & Resilience

### Corruption Recovery System
```python
# src/mcp_legacy_files/recovery/corruption_recovery.py

class CorruptionRecoverySystem:
    """
    Advanced system for recovering data from corrupted legacy files.
    """

    async def attempt_recovery(self, file_path: str, error_info: ErrorInfo) -> RecoveryResult:
        """
        Multi-stage corruption recovery pipeline.
        """
        # Stage 1: Partial read recovery
        partial_result = await self.partial_read_recovery(file_path)
        if partial_result.success_rate > 0.7:
            return partial_result

        # Stage 2: Header reconstruction
        header_result = await self.reconstruct_header(file_path, error_info.format)
        if header_result.success:
            return await self.reprocess_with_fixed_header(file_path, header_result.fixed_header)

        # Stage 3: Content extraction via binary analysis
        binary_result = await self.binary_content_extraction(file_path)
        if binary_result.content_found:
            return await self.enhance_binary_extraction(binary_result)

        # Stage 4: ML-based content reconstruction
        ml_result = await self.ml_content_reconstruction(file_path, error_info)
        return ml_result


class AdvancedErrorHandling:
    """
    Comprehensive error handling with learning capabilities.
    """

    def __init__(self):
        self.error_patterns = load_error_patterns()
        self.recovery_strategies = load_recovery_strategies()

    async def handle_processing_error(self, error: Exception, context: ProcessingContext) -> ErrorRecovery:
        """
        Intelligent error handling with pattern matching.
        """
        # Classify error type
        error_type = self.classify_error(error, context)

        # Look up known recovery strategies
        strategies = self.recovery_strategies.get(error_type, [])

        # Attempt recovery strategies in order of success probability
        for strategy in strategies:
            try:
                recovery_result = await strategy.attempt_recovery(context)
                if recovery_result.success:
                    # Learn from successful recovery
                    self.update_success_pattern(error_type, strategy)
                    return recovery_result
            except Exception:
                continue

        # All strategies failed - record for future learning
        self.record_unrecoverable_error(error, context)
        return ErrorRecovery(success=False, error=error, context=context)
```
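Stage 1 of the recovery pipeline can be as simple as reading until the first I/O error and scoring how much survived. A sketch, assuming `RecoveryResult` carries `content` and `success_rate` fields:

```python
import os

async def partial_read_recovery(file_path: str, chunk_size: int = 64 * 1024) -> RecoveryResult:
    """Read as far as the filesystem allows; keep whatever came back cleanly."""
    recovered = bytearray()
    try:
        with open(file_path, "rb") as f:
            while chunk := f.read(chunk_size):
                recovered.extend(chunk)
    except OSError:
        pass  # stop at the first unreadable region, keep what we already have
    total = os.path.getsize(file_path) or 1
    return RecoveryResult(
        content=recovered.decode("latin-1", errors="replace"),
        success_rate=len(recovered) / total,
    )
```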
## 📊 Monitoring & Analytics

### Processing Analytics
```python
# src/mcp_legacy_files/analytics/processing_analytics.py
import os
from typing import List

class ProcessingAnalytics:
    """
    Comprehensive analytics for legacy document processing.
    """

    def __init__(self):
        self.metrics_collector = MetricsCollector()
        self.performance_tracker = PerformanceTracker()
        self.quality_analyzer = QualityAnalyzer()

    async def track_processing(self, file_path: str, format_info: FormatInfo,
                               processing_chain: List[str], result: ProcessingResult):
        """
        Track comprehensive processing metrics.
        """
        # Performance metrics
        await self.performance_tracker.record({
            'file_size': os.path.getsize(file_path),
            'format': format_info.format_family,
            'version': format_info.version,
            'processing_time': result.processing_time,
            'successful_method': result.successful_method,
            'fallback_attempts': len(processing_chain) - 1
        })

        # Quality metrics
        await self.quality_analyzer.analyze({
            'extraction_completeness': result.completeness_score,
            'text_coherence': result.coherence_score,
            'structure_preservation': result.structure_score,
            'error_rate': result.error_count / max(result.total_elements, 1)
        })

        # Success patterns
        await self.metrics_collector.record_success_pattern({
            'format': format_info.format_family,
            'file_characteristics': await self.analyze_file_characteristics(file_path),
            'successful_processing_chain': result.processing_chain_used,
            'success_factors': result.success_factors
        })


# Real-time dashboard data
ANALYTICS_DASHBOARD = {
    "processing_stats": {
        "total_documents_processed": 0,
        "success_rate_by_format": {},
        "average_processing_time": {},
        "most_reliable_processors": {}
    },
    "quality_metrics": {
        "average_completeness": 0.0,
        "text_coherence_score": 0.0,
        "structure_preservation": 0.0
    },
    "error_analysis": {
        "common_failure_patterns": [],
        "recovery_success_rates": {},
        "unprocessable_formats": []
    }
}
```
## 🔧 Configuration & Extensibility

### Plugin Architecture
```python
# src/mcp_legacy_files/plugins/plugin_manager.py
from typing import Callable

class PluginManager:
    """
    Extensible plugin system for custom format processors.
    """

    def __init__(self):
        self.registered_processors = {}
        self.format_handlers = {}
        self.enhancement_plugins = {}

    def register_processor(self, format_family: str, processor_class: type):
        """Register a custom processor for a specific format family."""
        self.registered_processors[format_family] = processor_class

    def register_format_handler(self, extension: str, handler_func: Callable):
        """Register a handler for a specific file extension."""
        self.format_handlers[extension] = handler_func

    def register_enhancement_plugin(self, plugin_name: str, plugin_class: type):
        """Register an AI enhancement plugin."""
        self.enhancement_plugins[plugin_name] = plugin_class


# Example: registering a custom processor for a proprietary database format
class CustomDatabaseProcessor(BaseProcessor):
    """Example custom processor for a proprietary database format."""

    async def can_process(self, file_path: str) -> bool:
        return file_path.endswith('.customdb')

    async def process(self, file_path: str) -> ProcessingResult:
        # Custom processing logic here
        ...

plugin_manager = PluginManager()
plugin_manager.register_processor("custom_database", CustomDatabaseProcessor)
```
## 🎯 Performance Specifications

### Target Performance Metrics

| Metric | Target | Measurement |
|---|---|---|
| Processing Speed | < 5 seconds/document | Average across all formats |
| Memory Usage | < 512 MB peak | Per-document processing |
| Batch Throughput | 1,000+ docs/hour | Enterprise archive processing |
| Cache Hit Rate | > 80% | Repeat-processing scenarios |
| Success Rate | > 95% | Non-corrupted files |
| Recovery Rate | > 60% | Corrupted/damaged files |
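A throwaway harness for checking the throughput and success-rate targets against a sample corpus might look like this (a sketch; `processor` is any object exposing an async `process` method):

```python
import asyncio
import time

async def measure_throughput(paths: list, processor) -> dict:
    started = time.monotonic()
    results = await asyncio.gather(
        *(processor.process(p) for p in paths), return_exceptions=True
    )
    ok = [r for r in results if not isinstance(r, Exception)]
    elapsed = time.monotonic() - started
    return {
        "docs_per_hour": len(ok) / elapsed * 3600,
        "success_rate": len(ok) / max(len(results), 1),
        "avg_seconds_per_doc": elapsed / max(len(ok), 1),
    }
```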
### Scalability Architecture

```python
# Horizontal scaling support
SCALING_CONFIG = {
    "processing_nodes": {
        "min_nodes": 1,
        "max_nodes": 100,
        "auto_scale_threshold": 0.8,   # CPU utilization
        "scale_up_delay": 60,          # seconds
        "scale_down_delay": 300        # seconds
    },
    "load_balancing": {
        "strategy": "least_connections",
        "health_check_interval": 30,
        "unhealthy_threshold": 3
    },
    "resource_limits": {
        "max_file_size": "1GB",
        "max_concurrent_processes": 50,
        "memory_limit_per_process": "512MB"
    }
}
```
This technical architecture provides the foundation for a comprehensive legacy document processing system, capable of handling the full spectrum of vintage computing formats with modern AI-enhanced intelligence.

*Next: implementation begins with core format detection and the highest-value dBASE processor* 🚀