From 6fb76d87601253d70f6e8829b3535d6a02f13345 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Sun, 11 Jan 2026 10:23:47 -0700 Subject: [PATCH] Add MCP resources documentation and fix section format suffix - Document MCP resource system in README with URI patterns, format suffixes, range syntax, and section detection strategies - Add index_document to Universal Tools table - Update architecture section to include resources.py - Fix section:// resource to support .md/.txt/.html format suffixes (matching chapter:// behavior) --- README.md | 98 +++++++++++++++++++++++++++++++++- src/mcp_office_tools/server.py | 9 ++-- 2 files changed, 101 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6a7f512..fb3ce60 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ claude mcp add office-tools "uvx mcp-office-tools" | `detect_office_format` | Identify format, version, encryption status | | `analyze_document_health` | Check integrity, corruption, password protection | | `get_supported_formats` | List all supported file extensions | +| `index_document` | Scan document and create resource URIs for on-demand fetching | ### Word Tools @@ -124,6 +125,62 @@ Here's what works and what's "good enough" — legacy formats from Office 97-200 --- +## 🔗 MCP Resources + +Instead of returning entire documents in tool responses, you can index a document once and fetch content on-demand via URI-based resources. This keeps context windows manageable when working with large files. + +### How It Works + +1. **Index the document** — `index_document` scans the file and returns URIs +2. **Fetch what you need** — Request specific chapters, sheets, slides, or images by URI +3. 
**Format on demand** — Append `.txt` or `.html` to get different output formats + +### Resource URI Patterns + +| URI Pattern | Description | Example | +|-------------|-------------|---------| +| `chapter://{doc_id}/{n}` | Single chapter/section | `chapter://abc123/3` | +| `chapters://{doc_id}/{range}` | Multiple chapters | `chapters://abc123/1-5` | +| `section://{doc_id}/{n}` | Section by heading style | `section://abc123/2` | +| `paragraph://{doc_id}/{ch}/{p}` | Specific paragraph | `paragraph://abc123/3/7` | +| `sheet://{doc_id}/{name}` | Excel sheet as markdown table | `sheet://abc123/Revenue` | +| `slide://{doc_id}/{n}` | PowerPoint slide | `slide://abc123/5` | +| `slides://{doc_id}/{range}` | Multiple slides | `slides://abc123/1,3,5` | +| `image://{doc_id}/{n}` | Embedded image | `image://abc123/0` | + +### Format Suffixes + +Append a format suffix to convert on the fly: + +| Suffix | Output | +|--------|--------| +| `.md` (default) | Markdown | +| `.txt` | Plain text (no formatting) | +| `.html` | Basic HTML | + +Examples: +- `chapter://abc123/3` → Markdown (default) +- `chapter://abc123/3.txt` → Plain text +- `chapter://abc123/3.html` → HTML + +### Range Syntax + +Fetch multiple items at once: +- `1-5` → Items 1 through 5 +- `1,3,5` → Specific items +- `1-3,7,9-10` → Mixed ranges + +### Section Detection + +The indexer detects document structure automatically: + +1. **Heading 1 styles** (primary) — Business docs, manuals, technical documents +2. **"Chapter X" text patterns** (fallback) — Books, manuscripts, narratives + +Use `text_patterns_only=True` to skip heading style detection for documents with messy formatting. + +--- + ## 🎯 MCP Prompts Pre-built workflows that chain multiple tools together. 
Use these as starting points: @@ -283,6 +340,42 @@ result = await extract_text("https://example.com/report.docx") # Cache expires after 1 hour by default ``` +### Index Document for On-Demand Resource Fetching + +```python +# Index the document - returns URIs for all content +result = await index_document("novel.docx") + +# Returns: +# { +# "doc_id": "56036b0f171a", +# "resources": { +# "chapter": [ +# {"id": "1", "title": "Chapter 1: The Beginning", "uri": "chapter://56036b0f171a/1"}, +# {"id": "2", "title": "Chapter 2: Rising Action", "uri": "chapter://56036b0f171a/2"}, +# ... +# ], +# "image": [ +# {"id": "0", "uri": "image://56036b0f171a/0"}, +# ... +# ] +# } +# } + +# Now fetch specific content via MCP resources: +# - chapter://56036b0f171a/1 → Chapter 1 as markdown +# - chapter://56036b0f171a/1.txt → Chapter 1 as plain text +# - chapters://56036b0f171a/1-3 → Chapters 1-3 combined +# - image://56036b0f171a/0 → First embedded image + +# Works with Excel and PowerPoint too: +await index_document("data.xlsx") +# → sheet://abc123/Revenue, sheet://abc123/Expenses, ... + +await index_document("presentation.pptx") +# → slide://def456/1, slide://def456/2, ... +``` + --- ## 🧪 Testing @@ -311,9 +404,10 @@ The mixin pattern keeps things modular — universal tools work on everything, f ``` mcp-office-tools/ ├── src/mcp_office_tools/ -│ ├── server.py # FastMCP server entry point +│ ├── server.py # FastMCP server + resource templates +│ ├── resources.py # Resource store for on-demand content │ ├── mixins/ -│ │ ├── universal.py # Format-agnostic tools +│ │ ├── universal.py # Format-agnostic tools (incl. 
index_document) │ │ ├── word.py # Word-specific tools │ │ ├── excel.py # Excel-specific tools │ │ └── powerpoint.py # PowerPoint tools (WIP) diff --git a/src/mcp_office_tools/server.py b/src/mcp_office_tools/server.py index 7b554dc..4586623 100644 --- a/src/mcp_office_tools/server.py +++ b/src/mcp_office_tools/server.py @@ -196,14 +196,15 @@ def get_chapter_resource(doc_id: str, resource_id: str) -> str: "section://{doc_id}/{resource_id}", mime_type="text/markdown", name="document_section", - description="Section from a document as Markdown" + description="Section from a document. Supports format suffixes: section://doc/2.md, section://doc/2.txt, section://doc/2.html" ) def get_section_resource(doc_id: str, resource_id: str) -> str: - """Retrieve a section as markdown.""" - resource = resource_store.get(doc_id, "section", resource_id) + """Retrieve a section with optional format conversion.""" + section_id, fmt = _parse_format_suffix(resource_id) + resource = resource_store.get(doc_id, "section", section_id) if resource is None: raise ValueError(f"Section not found: section://{doc_id}/{resource_id}") - return resource.data + return _convert_markdown_to_format(resource.data, fmt) @app.resource(