mcwaddams/src/mcp_office_tools/resources.py

"""Resource store for embedded Office document content.

Provides caching and retrieval of extracted resources (images, charts, media, embeds)
and structural content (chapters, pages, sheets) with stable document IDs.

Resource URI Schemes:
    Binary content:
        image://{doc_id}/{id}     - Embedded images
        chart://{doc_id}/{id}     - Charts (as PNG or data)
        media://{doc_id}/{id}     - Audio/video files
        embed://{doc_id}/{id}     - OLE embedded objects

    Text/structural content:
        chapter://{doc_id}/{num}  - Word chapter as markdown
        section://{doc_id}/{id}   - Document section
        page://{doc_id}/{num}     - Page content
        sheet://{doc_id}/{name}   - Excel sheet as markdown/CSV
        slide://{doc_id}/{num}    - PowerPoint slide content
"""

import hashlib
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union
from pathlib import Path


# Resource type categories.
# Binary kinds carry raw bytes; text kinds carry rendered/structural text.
BINARY_TYPES = {
    "image",
    "chart",
    "media",
    "embed",
}
TEXT_TYPES = {
    "chapter",
    "section",
    "page",
    "sheet",
    "slide",
}
# Every resource type the store knows about (union of the two sets above).
ALL_RESOURCE_TYPES = BINARY_TYPES.union(TEXT_TYPES)


@dataclass
class EmbeddedResource:
    """A single piece of content extracted from an Office document.

    The payload in ``data`` is either raw bytes (images, media, ...) or a
    string (chapters, sheets, ...).
    """

    # Identifier of this resource within its document and type.
    resource_id: str
    # One of: image, chart, media, embed, chapter, section, page, sheet, slide.
    resource_type: str
    mime_type: str
    # bytes for binary payloads, str for text content.
    data: Union[bytes, str]
    # Original filename or title, when one is known.
    name: Optional[str] = None
    # Free-form extras (dimensions, word count, ...); "doc_id" is read by `uri`.
    metadata: Dict = field(default_factory=dict)

    @property
    def uri(self) -> str:
        """MCP resource URI: ``{resource_type}://{doc_id}/{resource_id}``.

        The document part is taken from ``metadata["doc_id"]`` and falls back
        to ``"unknown"`` when that key is absent.
        """
        owner = self.metadata.get("doc_id", "unknown")
        return "{}://{}/{}".format(self.resource_type, owner, self.resource_id)

    @property
    def is_binary(self) -> bool:
        """True when ``resource_type`` is one of the binary categories."""
        kind = self.resource_type
        return kind in BINARY_TYPES

    @property
    def is_text(self) -> bool:
        """True when ``resource_type`` is one of the text categories."""
        kind = self.resource_type
        return kind in TEXT_TYPES

    @property
    def size(self) -> int:
        """Payload size in bytes (text is measured as UTF-8)."""
        payload = self.data
        if not isinstance(payload, bytes):
            payload = payload.encode("utf-8")
        return len(payload)


class ResourceStore:
    """Manages extracted resources from Office documents.

    Resources are cached in memory, grouped per document and per resource
    type, and addressed by MCP resource URIs of the form
    ``{resource_type}://{doc_id}/{resource_id}``. Document IDs are derived
    from content hashes (see :meth:`get_doc_id`) so they stay stable.
    """

    def __init__(self):
        # Structure: {doc_id: {resource_type: [EmbeddedResource, ...]}}
        self._documents: Dict[str, Dict[str, List["EmbeddedResource"]]] = {}
        # Track doc_id to file path mapping
        self._doc_paths: Dict[str, str] = {}

    @staticmethod
    def get_doc_id(file_path: str) -> str:
        """Generate a stable document ID from the file's content hash.

        Uses the first 12 hex characters of the SHA256 digest - enough
        uniqueness for practical purposes while keeping URIs readable.

        Args:
            file_path: Path of the document to identify.

        Returns:
            12-character hex string. When ``file_path`` is not a regular
            file (missing, a directory, ...) the hash of the path string is
            used instead of the content hash.
        """
        path = Path(file_path)
        if not path.is_file():
            # Fallback to path hash when there is no file content to read
            return hashlib.sha256(str(path).encode()).hexdigest()[:12]

        # Hash in fixed-size chunks so large documents are never held in
        # memory in full; the digest is identical to hashing all at once.
        chunk_size = 1024 * 1024
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()[:12]

    def store(self, doc_id: str, resource: "EmbeddedResource", file_path: Optional[str] = None):
        """Store an extracted resource.

        The resource is appended to the list for its type; storing the same
        resource twice keeps both entries, so call :meth:`clear_document`
        before re-extracting a document.

        Args:
            doc_id: Document identifier (from get_doc_id)
            resource: The embedded resource to store. Its ``metadata`` is
                updated in place with the ``doc_id`` key.
            file_path: Optional original file path for reference
        """
        # Add doc_id to metadata for URI generation
        resource.metadata["doc_id"] = doc_id

        by_type = self._documents.setdefault(doc_id, {})
        by_type.setdefault(resource.resource_type, []).append(resource)

        if file_path:
            self._doc_paths[doc_id] = file_path

    def get(self, doc_id: str, resource_type: str, resource_id: str) -> Optional["EmbeddedResource"]:
        """Retrieve a specific resource.

        Lookup order:
            1. exact ``resource_id`` match, so URIs produced by
               :meth:`list_resources` always resolve to the resource they
               were generated for (even when ids are numeric but 1-based),
            2. zero-based position in the stored list when ``resource_id``
               is a decimal number,
            3. exact ``name`` match.

        Args:
            doc_id: Document identifier
            resource_type: Type of resource (image, chart, media, embed,
                chapter, section, page, sheet, slide)
            resource_id: Resource identifier, list index, or name

        Returns:
            EmbeddedResource if found, None otherwise
        """
        by_type = self._documents.get(doc_id)
        if by_type is None:
            return None

        resources = by_type.get(resource_type, [])

        # Try by resource_id match first: this is what emitted URIs carry
        for r in resources:
            if r.resource_id == resource_id:
                return r

        # Then by positional index. isdecimal() (unlike isdigit()) only
        # accepts characters that int() can parse; the ValueError guard
        # covers over-long digit strings rejected by newer interpreters.
        if resource_id.isdecimal():
            try:
                idx = int(resource_id)
            except ValueError:
                idx = -1
            if 0 <= idx < len(resources):
                return resources[idx]

        # Finally by name match
        for r in resources:
            if r.name and r.name == resource_id:
                return r

        return None

    def list_resources(self, doc_id: str, resource_type: Optional[str] = None) -> Dict[str, List[dict]]:
        """List all resources for a document.

        Args:
            doc_id: Document identifier
            resource_type: Optional filter by type

        Returns:
            Dict mapping resource types to lists of resource info. Each
            entry has ``id``, ``name``, ``mime_type``, ``uri`` and
            ``size_bytes`` plus the resource's metadata (without ``doc_id``).
            Empty dict when the document is not cached.
        """
        if doc_id not in self._documents:
            return {}

        result: Dict[str, List[dict]] = {}
        for rtype, resources in self._documents[doc_id].items():
            if resource_type and rtype != resource_type:
                continue

            result[rtype] = [
                {
                    "id": r.resource_id,
                    "name": r.name,
                    "mime_type": r.mime_type,
                    "uri": f"{rtype}://{doc_id}/{r.resource_id}",
                    # r.size counts UTF-8 bytes for text; len(r.data) would
                    # count characters and under-report non-ASCII content.
                    "size_bytes": r.size,
                    **{k: v for k, v in r.metadata.items() if k != "doc_id"}
                }
                for r in resources
            ]

        return result

    def get_doc_info(self, doc_id: str) -> Optional[dict]:
        """Get information about a cached document.

        Returns:
            Dict with ``doc_id``, ``file_path`` (None if never recorded),
            ``resource_counts`` per type and ``total_resources``; None when
            the document is not cached.
        """
        if doc_id not in self._documents:
            return None

        resource_counts = {
            rtype: len(resources)
            for rtype, resources in self._documents[doc_id].items()
        }

        return {
            "doc_id": doc_id,
            "file_path": self._doc_paths.get(doc_id),
            "resource_counts": resource_counts,
            "total_resources": sum(resource_counts.values())
        }

    def clear_document(self, doc_id: str):
        """Remove all cached resources for a document (no-op if unknown)."""
        self._documents.pop(doc_id, None)
        self._doc_paths.pop(doc_id, None)

    def clear_all(self):
        """Clear all cached resources."""
        self._documents.clear()
        self._doc_paths.clear()

    @property
    def cached_documents(self) -> List[str]:
        """List all cached document IDs."""
        return list(self._documents)

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with ``documents_cached``, ``total_resources``,
            ``total_bytes`` (payload bytes, text measured as UTF-8) and
            ``by_type`` (resource count per type across all documents).
        """
        total_resources = 0
        total_bytes = 0
        type_counts: Dict[str, int] = {}

        for types in self._documents.values():
            for rtype, resources in types.items():
                count = len(resources)
                total_resources += count
                type_counts[rtype] = type_counts.get(rtype, 0) + count
                # Use the resource's own byte size so text is not counted
                # in characters
                total_bytes += sum(r.size for r in resources)

        return {
            "documents_cached": len(self._documents),
            "total_resources": total_resources,
            "total_bytes": total_bytes,
            "by_type": type_counts
        }


# Module-level shared ResourceStore instance, created once when this module
# is imported. Nothing here prevents constructing further ResourceStore
# objects; each one keeps its own independent in-memory cache.
resource_store = ResourceStore()