Ryan Malloy d569034fa3
Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
Add MCP resource system for embedded document content
Implements URI-based access to document content with:

- ResourceStore for caching extracted images, chapters, sheets, slides
- Content-based document IDs (SHA256 hash) for stable URIs across sessions
- 11 resource templates with flexible URI patterns:
  - Binary: image://, chart://, media://, embed://
  - Text: chapter://, section://, sheet://, slide://
  - Ranges: chapters://doc/1-5, slides://doc/1,3,5
  - Hierarchical: paragraph://doc/3/5

- Format suffixes for output control:
  - chapter://doc/3.md (default markdown)
  - chapter://doc/3.txt (plain text)
  - chapter://doc/3.html (basic HTML)

- index_document tool scans and populates resources:
  - Word: chapters as markdown, embedded images
  - Excel: sheets as markdown tables
  - PowerPoint: slides as markdown

Tool responses return URIs instead of blobs - clients fetch only what they need.
2026-01-11 09:04:29 -07:00

244 lines
7.9 KiB
Python

"""Resource store for embedded Office document content.

Provides caching and retrieval of extracted resources (images, charts, media, embeds)
and structural content (chapters, sections, pages, sheets, slides) with stable
document IDs.

Resource URI Schemes:
    Binary content:
        image://{doc_id}/{id}      - Embedded images
        chart://{doc_id}/{id}      - Charts (as PNG or data)
        media://{doc_id}/{id}      - Audio/video files
        embed://{doc_id}/{id}      - OLE embedded objects

    Text/structural content:
        chapter://{doc_id}/{num}   - Word chapter as markdown
        section://{doc_id}/{id}    - Document section
        page://{doc_id}/{num}      - Page content
        sheet://{doc_id}/{name}    - Excel sheet as markdown/CSV
        slide://{doc_id}/{num}     - PowerPoint slide content
"""
import hashlib
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union
from pathlib import Path
# Resource type categories. Each name doubles as the URI scheme of the
# resources it labels (e.g. "image" -> image://{doc_id}/{id}).
BINARY_TYPES = {
    "image",
    "chart",
    "media",
    "embed",
}
TEXT_TYPES = {
    "chapter",
    "section",
    "page",
    "sheet",
    "slide",
}
# Every recognised resource type, binary or text.
ALL_RESOURCE_TYPES = BINARY_TYPES.union(TEXT_TYPES)
@dataclass
class EmbeddedResource:
    """A single piece of content extracted from an Office document.

    The payload in ``data`` is either raw bytes (images, media, ...) or a
    string (chapters, sheets, ...). ``resource_type`` names the category and
    is also used as the scheme of the resource's URI.
    """

    resource_id: str
    resource_type: str  # image, chart, media, embed, chapter, section, page, sheet, slide
    mime_type: str
    data: Union[bytes, str]  # bytes for binary content, str for text content
    name: Optional[str] = None  # Original filename or title
    metadata: Dict = field(default_factory=dict)  # Dimensions, word count, etc.

    @property
    def uri(self) -> str:
        """Return the MCP resource URI, ``{type}://{doc_id}/{resource_id}``.

        The document ID is read from ``metadata["doc_id"]``; when that key is
        absent the literal ``unknown`` is used in its place.
        """
        owner = self.metadata.get('doc_id', 'unknown')
        return f"{self.resource_type}://{owner}/{self.resource_id}"

    @property
    def is_binary(self) -> bool:
        """Return True when ``resource_type`` is one of ``BINARY_TYPES``."""
        kind = self.resource_type
        return kind in BINARY_TYPES

    @property
    def is_text(self) -> bool:
        """Return True when ``resource_type`` is one of ``TEXT_TYPES``."""
        kind = self.resource_type
        return kind in TEXT_TYPES

    @property
    def size(self) -> int:
        """Return the payload size in bytes (text is measured as UTF-8)."""
        payload = self.data
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')
        return len(payload)
class ResourceStore:
    """In-memory cache of resources extracted from Office documents.

    Resources are grouped per document and, within a document, per resource
    type, in the order they were stored. They are addressed by MCP resource
    URIs of the form ``{resource_type}://{doc_id}/{resource_id}``.
    Document IDs are derived from a hash of the file content (see
    :meth:`get_doc_id`), so the same file yields the same ID.
    """

    def __init__(self):
        # Structure: {doc_id: {resource_type: [EmbeddedResource, ...]}}
        self._documents: Dict[str, Dict[str, List[EmbeddedResource]]] = {}
        # doc_id -> path of the file the resources were extracted from
        self._doc_paths: Dict[str, str] = {}

    @staticmethod
    def get_doc_id(file_path: str) -> str:
        """Generate a stable document ID from the file's content hash.

        Uses the first 12 hex characters of the SHA256 digest - enough
        uniqueness for practical purposes while keeping URIs readable.

        Args:
            file_path: Path of the document to identify.

        Returns:
            12-character hex ID. If the path does not exist, the ID is
            derived from the path string instead of file content.
        """
        path = Path(file_path)
        if not path.exists():
            # Fallback to path hash if file doesn't exist
            return hashlib.sha256(str(path).encode()).hexdigest()[:12]

        # Hash incrementally so large documents are never held in memory
        # whole; the digest is identical to hashing the full content at once.
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                digest.update(chunk)
        return digest.hexdigest()[:12]

    def store(self, doc_id: str, resource: "EmbeddedResource", file_path: Optional[str] = None):
        """Store an extracted resource.

        The resource is appended to the list for its ``resource_type`` and
        its ``metadata["doc_id"]`` is set to ``doc_id`` so that it can build
        its own URI.

        Args:
            doc_id: Document identifier (from get_doc_id)
            resource: The embedded resource to store
            file_path: Optional original file path for reference
        """
        by_type = self._documents.setdefault(doc_id, {})
        bucket = by_type.setdefault(resource.resource_type, [])

        # Add doc_id to metadata for URI generation
        resource.metadata["doc_id"] = doc_id
        bucket.append(resource)

        if file_path:
            self._doc_paths[doc_id] = file_path

    def get(self, doc_id: str, resource_type: str, resource_id: str) -> Optional["EmbeddedResource"]:
        """Retrieve a specific resource.

        Lookup order:
            1. exact ``resource_id`` match,
            2. zero-based position in storage order (decimal strings only),
            3. exact ``name`` match.

        Args:
            doc_id: Document identifier
            resource_type: Type of resource (image, chart, chapter, sheet, ...)
            resource_id: Resource identifier, positional index, or name

        Returns:
            EmbeddedResource if found, None otherwise
        """
        resources = self._documents.get(doc_id, {}).get(resource_type, [])

        # Exact ID first: URIs are built from resource_id, so a URI must
        # resolve back to the resource that produced it even when IDs are
        # numeric but do not coincide with list positions (e.g. 1-based).
        for r in resources:
            if r.resource_id == resource_id:
                return r

        # Positional fallback. isdecimal() (unlike isdigit()) only accepts
        # characters that int() can parse, so e.g. a superscript digit
        # cannot raise ValueError here.
        if resource_id.isdecimal():
            idx = int(resource_id)
            if idx < len(resources):
                return resources[idx]

        # Try by name match
        for r in resources:
            if r.name and r.name == resource_id:
                return r

        return None

    def list_resources(self, doc_id: str, resource_type: Optional[str] = None) -> Dict[str, List[dict]]:
        """List all resources for a document.

        Each entry holds ``id``, ``name``, ``mime_type``, ``uri`` and
        ``size_bytes``, plus every metadata key except the internal
        ``doc_id``.

        Args:
            doc_id: Document identifier
            resource_type: Optional filter by type

        Returns:
            Dict mapping resource types to lists of resource info; empty if
            the document is not cached.
        """
        by_type = self._documents.get(doc_id)
        if by_type is None:
            return {}

        result = {}
        for rtype, resources in by_type.items():
            if resource_type and rtype != resource_type:
                continue
            result[rtype] = [
                {
                    "id": r.resource_id,
                    "name": r.name,
                    "mime_type": r.mime_type,
                    "uri": f"{rtype}://{doc_id}/{r.resource_id}",
                    # r.size counts UTF-8 bytes for text payloads;
                    # len(r.data) would count characters instead.
                    "size_bytes": r.size,
                    **{k: v for k, v in r.metadata.items() if k != "doc_id"},
                }
                for r in resources
            ]
        return result

    def get_doc_info(self, doc_id: str) -> Optional[dict]:
        """Get information about a cached document.

        Returns:
            Dict with ``doc_id``, ``file_path`` (None if never recorded),
            ``resource_counts`` per type and ``total_resources``; None if
            the document is not cached.
        """
        if doc_id not in self._documents:
            return None

        resource_counts = {
            rtype: len(resources)
            for rtype, resources in self._documents[doc_id].items()
        }
        return {
            "doc_id": doc_id,
            "file_path": self._doc_paths.get(doc_id),
            "resource_counts": resource_counts,
            "total_resources": sum(resource_counts.values()),
        }

    def clear_document(self, doc_id: str):
        """Remove all cached resources for a document (no-op if unknown)."""
        self._documents.pop(doc_id, None)
        self._doc_paths.pop(doc_id, None)

    def clear_all(self):
        """Clear all cached resources."""
        self._documents.clear()
        self._doc_paths.clear()

    @property
    def cached_documents(self) -> List[str]:
        """List all cached document IDs."""
        return list(self._documents)

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with ``documents_cached``, ``total_resources``,
            ``total_bytes`` (sum of each resource's size in bytes) and
            ``by_type`` (resource count per type across all documents).
        """
        total_resources = 0
        total_bytes = 0
        type_counts: Dict[str, int] = {}

        for by_type in self._documents.values():
            for rtype, resources in by_type.items():
                count = len(resources)
                total_resources += count
                type_counts[rtype] = type_counts.get(rtype, 0) + count
                # Use the resource's own byte size so text payloads are
                # measured in UTF-8 bytes, consistent with size_bytes.
                total_bytes += sum(r.size for r in resources)

        return {
            "documents_cached": len(self._documents),
            "total_resources": total_resources,
            "total_bytes": total_bytes,
            "by_type": type_counts,
        }
# Global singleton instance: one module-level ResourceStore, created when this
# module is imported, so that importers can share a single in-memory cache.
resource_store = ResourceStore()