Some checks are pending
Test Dashboard / test-and-dashboard (push) Waiting to run
Implements URI-based access to document content with: - ResourceStore for caching extracted images, chapters, sheets, slides - Content-based document IDs (SHA256 hash) for stable URIs across sessions - 11 resource templates with flexible URI patterns: - Binary: image://, chart://, media://, embed:// - Text: chapter://, section://, sheet://, slide:// - Ranges: chapters://doc/1-5, slides://doc/1,3,5 - Hierarchical: paragraph://doc/3/5 - Format suffixes for output control: - chapter://doc/3.md (default markdown) - chapter://doc/3.txt (plain text) - chapter://doc/3.html (basic HTML) - index_document tool scans and populates resources: - Word: chapters as markdown, embedded images - Excel: sheets as markdown tables - PowerPoint: slides as markdown Tool responses return URIs instead of blobs - clients fetch only what they need.
244 lines
7.9 KiB
Python
244 lines
7.9 KiB
Python
"""Resource store for embedded Office document content.
|
|
|
|
Provides caching and retrieval of extracted resources (images, charts, media, embeds)
|
|
and structural content (chapters, pages, sheets) with stable document IDs.
|
|
|
|
Resource URI Schemes:
|
|
Binary content:
|
|
image://{doc_id}/{id} - Embedded images
|
|
chart://{doc_id}/{id} - Charts (as PNG or data)
|
|
media://{doc_id}/{id} - Audio/video files
|
|
embed://{doc_id}/{id} - OLE embedded objects
|
|
|
|
Text/structural content:
|
|
chapter://{doc_id}/{num} - Word chapter as markdown
|
|
section://{doc_id}/{id} - Document section
|
|
page://{doc_id}/{num} - Page content
|
|
sheet://{doc_id}/{name} - Excel sheet as markdown/CSV
|
|
slide://{doc_id}/{num} - PowerPoint slide content
|
|
"""
|
|
|
|
import hashlib
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List, Optional, Union
|
|
from pathlib import Path
|
|
|
|
|
|
# Resource type categories.
# Binary types carry raw bytes; text types carry rendered text/structure.
BINARY_TYPES = {
    "image",
    "chart",
    "media",
    "embed",
}
TEXT_TYPES = {
    "chapter",
    "section",
    "page",
    "sheet",
    "slide",
}
# Every resource type the store knows about.
ALL_RESOURCE_TYPES = BINARY_TYPES.union(TEXT_TYPES)
|
|
|
|
|
|
@dataclass
class EmbeddedResource:
    """A single piece of content extracted from an Office document.

    The payload in ``data`` is either raw bytes (images, media, ...) or a
    string (chapters, sheets, ...).
    """

    # Identifier used as the last segment of the resource URI.
    resource_id: str
    # One of: image, chart, media, embed, chapter, section, page, sheet, slide.
    resource_type: str
    mime_type: str
    # bytes for binary payloads, str for text payloads.
    data: Union[bytes, str]
    # Original filename or title, when known.
    name: Optional[str] = None
    # Free-form extras (dimensions, word count, ...). The ``doc_id`` entry,
    # when present, is what ``uri`` uses for the document segment.
    metadata: Dict = field(default_factory=dict)

    @property
    def uri(self) -> str:
        """Return the MCP resource URI ``{type}://{doc_id}/{resource_id}``.

        The document segment is ``metadata["doc_id"]``; when that key is
        absent the literal ``unknown`` is used instead.
        """
        owner = self.metadata.get('doc_id', 'unknown')
        return f"{self.resource_type}://{owner}/{self.resource_id}"

    @property
    def is_binary(self) -> bool:
        """Return True when ``resource_type`` is one of ``BINARY_TYPES``."""
        kind = self.resource_type
        return kind in BINARY_TYPES

    @property
    def is_text(self) -> bool:
        """Return True when ``resource_type`` is one of ``TEXT_TYPES``."""
        kind = self.resource_type
        return kind in TEXT_TYPES

    @property
    def size(self) -> int:
        """Return the payload size in bytes.

        ``bytes`` payloads are measured directly; anything else is encoded
        as UTF-8 first so the result is a byte count, not a character count.
        """
        payload = self.data
        raw = payload if isinstance(payload, bytes) else payload.encode('utf-8')
        return len(raw)
|
|
|
|
|
|
class ResourceStore:
    """Manages extracted resources from Office documents.

    Resources are cached in memory and accessible via MCP resource URIs.
    Document IDs are generated from content hashes for stability.

    Internally resources are grouped per document and per resource type,
    in insertion order:
    ``{doc_id: {resource_type: [EmbeddedResource, ...]}}``.
    """

    def __init__(self):
        # Structure: {doc_id: {resource_type: [EmbeddedResource, ...]}}
        self._documents: Dict[str, Dict[str, List["EmbeddedResource"]]] = {}
        # Track doc_id to file path mapping
        self._doc_paths: Dict[str, str] = {}

    @staticmethod
    def get_doc_id(file_path: str) -> str:
        """Generate a stable document ID from the file's content hash.

        Uses the first 12 hex characters of the SHA256 digest - enough
        uniqueness for practical purposes while keeping URIs readable.

        Args:
            file_path: Path to the document.

        Returns:
            12-character hex string. When the path does not exist, the hash
            of the path string itself is used instead of the content hash.
        """
        path = Path(file_path)
        if not path.exists():
            # Fallback to path hash if file doesn't exist
            return hashlib.sha256(str(path).encode()).hexdigest()[:12]

        # Hash incrementally so large documents are never held in memory
        # in one piece; the digest is identical to hashing the whole file.
        chunk_size = 1024 * 1024
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()[:12]

    def store(self, doc_id: str, resource: "EmbeddedResource", file_path: Optional[str] = None) -> None:
        """Store an extracted resource.

        The resource is appended to the list for its type; no
        de-duplication is performed, so storing the same resource twice
        yields two entries.

        Args:
            doc_id: Document identifier (from get_doc_id)
            resource: The embedded resource to store. Its ``metadata`` is
                updated in place with the ``doc_id``.
            file_path: Optional original file path for reference
        """
        bucket = self._documents.setdefault(doc_id, {}).setdefault(resource.resource_type, [])

        # Add doc_id to metadata for URI generation
        resource.metadata["doc_id"] = doc_id
        bucket.append(resource)

        if file_path:
            self._doc_paths[doc_id] = file_path

    def get(self, doc_id: str, resource_type: str, resource_id: str) -> Optional["EmbeddedResource"]:
        """Retrieve a specific resource.

        Lookup order:
            1. exact ``resource_id`` match, so that a URI built from a
               resource's own id always resolves back to that resource;
            2. 0-based position in the list for that type, when
               ``resource_id`` is a decimal number;
            3. exact ``name`` match.

        Args:
            doc_id: Document identifier
            resource_type: Type of resource (image, chart, media, embed,
                chapter, section, page, sheet, slide)
            resource_id: Resource identifier (id, index or name)

        Returns:
            EmbeddedResource if found, None otherwise
        """
        resources = self._documents.get(doc_id, {}).get(resource_type, [])
        if not resources:
            return None

        # Exact id first: positions and ids may differ (e.g. 1-based ids).
        for r in resources:
            if r.resource_id == resource_id:
                return r

        # Positional fallback. isdecimal() (unlike isdigit()) only accepts
        # characters that int() can parse, so odd input such as a
        # superscript digit cannot raise ValueError here.
        if resource_id.isdecimal():
            idx = int(resource_id)
            if idx < len(resources):
                return resources[idx]

        # Finally try the original filename / title.
        for r in resources:
            if r.name and r.name == resource_id:
                return r

        return None

    def list_resources(self, doc_id: str, resource_type: Optional[str] = None) -> Dict[str, List[dict]]:
        """List all resources for a document.

        Args:
            doc_id: Document identifier
            resource_type: Optional filter by type

        Returns:
            Dict mapping resource types to lists of resource info. Each
            info dict has ``id``, ``name``, ``mime_type``, ``uri`` and
            ``size_bytes``, followed by the resource's metadata entries
            (except ``doc_id``); a metadata key with the same name as one
            of the base keys overrides it. Empty dict for unknown documents.
        """
        if doc_id not in self._documents:
            return {}

        result = {}
        for rtype, resources in self._documents[doc_id].items():
            if resource_type and rtype != resource_type:
                continue

            result[rtype] = [
                {
                    "id": r.resource_id,
                    "name": r.name,
                    "mime_type": r.mime_type,
                    "uri": f"{rtype}://{doc_id}/{r.resource_id}",
                    # r.size is a true byte count; len(r.data) would count
                    # characters for text resources.
                    "size_bytes": r.size,
                    **{k: v for k, v in r.metadata.items() if k != "doc_id"},
                }
                for r in resources
            ]

        return result

    def get_doc_info(self, doc_id: str) -> Optional[dict]:
        """Get information about a cached document.

        Returns:
            Dict with ``doc_id``, ``file_path`` (None when no path was
            recorded), ``resource_counts`` per type and ``total_resources``;
            None when the document is not cached.
        """
        if doc_id not in self._documents:
            return None

        resource_counts = {
            rtype: len(resources)
            for rtype, resources in self._documents[doc_id].items()
        }

        return {
            "doc_id": doc_id,
            "file_path": self._doc_paths.get(doc_id),
            "resource_counts": resource_counts,
            "total_resources": sum(resource_counts.values()),
        }

    def clear_document(self, doc_id: str) -> None:
        """Remove all cached resources for a document (no-op if unknown)."""
        self._documents.pop(doc_id, None)
        self._doc_paths.pop(doc_id, None)

    def clear_all(self) -> None:
        """Clear all cached resources."""
        self._documents.clear()
        self._doc_paths.clear()

    @property
    def cached_documents(self) -> List[str]:
        """List all cached document IDs."""
        return list(self._documents)

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with ``documents_cached``, ``total_resources``,
            ``total_bytes`` (sum of each resource's byte size) and
            ``by_type`` (resource count per type across all documents).
        """
        total_resources = 0
        total_bytes = 0
        type_counts = {}

        for types in self._documents.values():
            for rtype, resources in types.items():
                count = len(resources)
                total_resources += count
                type_counts[rtype] = type_counts.get(rtype, 0) + count
                # Use the resource's own byte size so text payloads are
                # counted in bytes, not characters.
                total_bytes += sum(r.size for r in resources)

        return {
            "documents_cached": len(self._documents),
            "total_resources": total_resources,
            "total_bytes": total_bytes,
            "by_type": type_counts,
        }
|
|
|
|
|
|
# Global singleton instance
|
|
# Created once when this module is imported; code that imports
# `resource_store` shares this single in-memory cache.
resource_store = ResourceStore()
|