"""Resource store for embedded Office document content.

Provides caching and retrieval of extracted resources (images, charts, media,
embeds) and structural content (chapters, pages, sheets) with stable document
IDs.

Resource URI Schemes:

    Binary content:
        image://{doc_id}/{id}     - Embedded images
        chart://{doc_id}/{id}     - Charts (as PNG or data)
        media://{doc_id}/{id}     - Audio/video files
        embed://{doc_id}/{id}     - OLE embedded objects

    Text/structural content:
        chapter://{doc_id}/{num}  - Word chapter as markdown
        section://{doc_id}/{id}   - Document section
        page://{doc_id}/{num}     - Page content
        sheet://{doc_id}/{name}   - Excel sheet as markdown/CSV
        slide://{doc_id}/{num}    - PowerPoint slide content
"""

import hashlib
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union
from pathlib import Path

# Resource type categories
BINARY_TYPES = {"image", "chart", "media", "embed"}
TEXT_TYPES = {"chapter", "section", "page", "sheet", "slide"}
ALL_RESOURCE_TYPES = BINARY_TYPES | TEXT_TYPES


@dataclass
class EmbeddedResource:
    """Represents an embedded resource from an Office document.

    Can hold either binary data (images, media) or text content
    (chapters, sheets).
    """

    resource_id: str
    # One of: image, chart, media, embed, chapter, section, page, sheet, slide
    resource_type: str
    mime_type: str
    data: Union[bytes, str]  # bytes for binary, str for text content
    name: Optional[str] = None  # Original filename or title
    metadata: Dict = field(default_factory=dict)  # Dimensions, word count, etc.

    @property
    def uri(self) -> str:
        """Generate the MCP resource URI for this resource.

        The document part comes from ``metadata["doc_id"]`` (written by
        ``ResourceStore.store``) and is ``"unknown"`` when that key is absent.
        """
        doc_id = self.metadata.get('doc_id', 'unknown')
        return f"{self.resource_type}://{doc_id}/{self.resource_id}"

    @property
    def is_binary(self) -> bool:
        """Check if this resource's type is one of ``BINARY_TYPES``."""
        return self.resource_type in BINARY_TYPES

    @property
    def is_text(self) -> bool:
        """Check if this resource's type is one of ``TEXT_TYPES``."""
        return self.resource_type in TEXT_TYPES

    @property
    def size(self) -> int:
        """Get size in bytes.

        Text is measured by its UTF-8 encoding; anything else is measured
        with ``len()``, which is the byte count for bytes-like data.
        """
        if isinstance(self.data, str):
            return len(self.data.encode('utf-8'))
        return len(self.data)


class ResourceStore:
    """Manages extracted resources from Office documents.

    Resources are cached in memory and accessible via MCP resource URIs.
    Document IDs are generated from content hashes for stability.
    """

    def __init__(self):
        # Structure: {doc_id: {resource_type: [EmbeddedResource, ...]}}
        self._documents: Dict[str, Dict[str, List[EmbeddedResource]]] = {}
        # Track doc_id to file path mapping
        self._doc_paths: Dict[str, str] = {}

    @staticmethod
    def get_doc_id(file_path: str) -> str:
        """Generate stable document ID from file content hash.

        Uses first 12 characters of SHA256 hash - enough uniqueness for
        practical purposes while keeping URIs readable.

        Args:
            file_path: Path of the document to identify

        Returns:
            12-character hex string. If nothing exists at ``file_path`` the
            hash is taken over the path string instead of the content.
        """
        path = Path(file_path)
        if not path.exists():
            # Fallback to path hash if file doesn't exist
            return hashlib.sha256(str(path).encode()).hexdigest()[:12]

        # Hash in chunks so large documents are never held fully in memory;
        # the resulting digest is identical to hashing the whole content.
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(65536), b""):
                digest.update(chunk)
        return digest.hexdigest()[:12]

    def store(self, doc_id: str, resource: EmbeddedResource,
              file_path: Optional[str] = None):
        """Store an extracted resource.

        Note that ``resource.metadata`` is modified in place: its
        ``"doc_id"`` key is set to ``doc_id``.

        Args:
            doc_id: Document identifier (from get_doc_id)
            resource: The embedded resource to store
            file_path: Optional original file path for reference
        """
        by_type = self._documents.setdefault(doc_id, {})

        # Add doc_id to metadata for URI generation
        resource.metadata["doc_id"] = doc_id
        by_type.setdefault(resource.resource_type, []).append(resource)

        if file_path:
            self._doc_paths[doc_id] = file_path

    def get(self, doc_id: str, resource_type: str,
            resource_id: str) -> Optional[EmbeddedResource]:
        """Retrieve a specific resource.

        Lookup order: positional index (when ``resource_id`` is a decimal
        number within range), then exact ``resource_id`` match, then exact
        ``name`` match.

        Args:
            doc_id: Document identifier
            resource_type: Type of resource (any of ALL_RESOURCE_TYPES)
            resource_id: Resource identifier (index or name)

        Returns:
            EmbeddedResource if found, None otherwise
        """
        if doc_id not in self._documents:
            return None

        resources = self._documents[doc_id].get(resource_type, [])

        # Try by index first (most common).
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        # NOTE(review): the index wins over an exact resource_id match, so
        # numeric IDs that are not zero-based positions resolve to a different
        # resource - confirm this is what callers expect.
        if resource_id.isdecimal():
            idx = int(resource_id)
            if 0 <= idx < len(resources):
                return resources[idx]

        # Try by resource_id match
        for r in resources:
            if r.resource_id == resource_id:
                return r

        # Try by name match
        for r in resources:
            if r.name and r.name == resource_id:
                return r

        return None

    def list_resources(self, doc_id: str,
                       resource_type: Optional[str] = None) -> Dict[str, List[dict]]:
        """List all resources for a document.

        Each entry holds ``id``, ``name``, ``mime_type``, ``uri`` and
        ``size_bytes``, followed by the resource's metadata (minus
        ``doc_id``). Metadata keys with the same name override those fields.

        Args:
            doc_id: Document identifier
            resource_type: Optional filter by type

        Returns:
            Dict mapping resource types to lists of resource info; empty if
            the document is not cached
        """
        if doc_id not in self._documents:
            return {}

        result = {}
        for rtype, resources in self._documents[doc_id].items():
            if resource_type and rtype != resource_type:
                continue

            result[rtype] = [
                {
                    "id": r.resource_id,
                    "name": r.name,
                    "mime_type": r.mime_type,
                    "uri": f"{rtype}://{doc_id}/{r.resource_id}",
                    # r.size, not len(r.data): len() of text counts
                    # characters, which under-reports non-ASCII content.
                    "size_bytes": r.size,
                    **{k: v for k, v in r.metadata.items() if k != "doc_id"}
                }
                for r in resources
            ]

        return result

    def get_doc_info(self, doc_id: str) -> Optional[dict]:
        """Get information about a cached document.

        Returns:
            Dict with ``doc_id``, ``file_path`` (None if never recorded),
            ``resource_counts`` per type and ``total_resources``; None if the
            document is not cached
        """
        if doc_id not in self._documents:
            return None

        resource_counts = {
            rtype: len(resources)
            for rtype, resources in self._documents[doc_id].items()
        }

        return {
            "doc_id": doc_id,
            "file_path": self._doc_paths.get(doc_id),
            "resource_counts": resource_counts,
            "total_resources": sum(resource_counts.values())
        }

    def clear_document(self, doc_id: str):
        """Remove all cached resources for a document (no-op if unknown)."""
        self._documents.pop(doc_id, None)
        self._doc_paths.pop(doc_id, None)

    def clear_all(self):
        """Clear all cached resources."""
        self._documents.clear()
        self._doc_paths.clear()

    @property
    def cached_documents(self) -> List[str]:
        """List all cached document IDs."""
        return list(self._documents.keys())

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with ``documents_cached``, ``total_resources``,
            ``total_bytes`` and ``by_type`` (resource count per type)
        """
        total_resources = 0
        total_bytes = 0
        type_counts = {}

        for types in self._documents.values():
            for rtype, resources in types.items():
                count = len(resources)
                total_resources += count
                type_counts[rtype] = type_counts.get(rtype, 0) + count
                # Byte sizes (UTF-8 for text), consistent with
                # EmbeddedResource.size.
                total_bytes += sum(r.size for r in resources)

        return {
            "documents_cached": len(self._documents),
            "total_resources": total_resources,
            "total_bytes": total_bytes,
            "by_type": type_counts
        }


# Global singleton instance
resource_store = ResourceStore()