Security hardening + CalVer 2026.05.22 for first PyPI publish
Some checks failed
Test Dashboard / test-and-dashboard (push) Has been cancelled
Some checks failed
Test Dashboard / test-and-dashboard (push) Has been cancelled
Margaret Hamilton pre-publish review found 5 blockers + 9 flags. All
correctness/security issues fixed; H6 (connection pooling perf) deferred.
caching.py — comprehensive hardening:
- B3: base64.b64decode now uses validate=True (no silent mangling)
- B4: MCP_ALLOW_LOCAL_FILES evaluated per request, not at import
- B5: extension allowlist + 0o700 temp dir + 0o600 files + O_EXCL writes
- B2+H5: MCP_MAX_UPLOAD_BYTES / MCP_MAX_DOWNLOAD_BYTES caps (50MB default),
enforced pre-decode and during chunked downloads
- H1: env var parsing strip()+lower(), truthy set {true,1,yes,on}
- H3: UUID-based unique temp paths replace SHA-prefix collision risk
- H7: ZIP magic bytes disambiguated via [Content_Types].xml peek
- H8: stronger CSV heuristic (commas/tabs/semicolons + UTF-8 + no NULs)
- H9: specific exceptions in cache I/O with logged warnings
- New: upload_cleanup_scope() context manager + ContextVar tracker
decorators.py:
- cleanup_temp_uploads decorator wraps tool methods, auto-cleans temp
upload files on return OR exception (B1+H4)
validation.py:
- OfficeFileError.__init__ scrubs /tmp/mcp_office_uploads/ paths from
messages so server paths never leak to HTTP callers (H2)
mixins/{universal,word,excel}.py:
- @cleanup_temp_uploads applied to all 19 tool methods that resolve files
tests/test_security_hardening.py:
- 24 new tests, one per Hamilton finding, prove fixes work and catch
regressions. Including end-to-end: temp file created → exists during
scope → gone after scope exit (success AND exception paths)
pyproject.toml:
- version 0.1.0 → 2026.05.22 (CalVer per CLAUDE.md convention)
- URLs updated GitHub → git.supported.systems/MCP/mcwaddams
- Belt-and-suspenders sdist exclude list (defends against future
include-list edits accidentally shipping CLAUDE.md, .env, etc.)
This commit is contained in:
parent
367a9c58e5
commit
c9de63cf29
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "mcwaddams"
|
||||
version = "0.1.0"
|
||||
version = "2026.05.22"
|
||||
description = "MCP server for Microsoft Office document processing. Named for Milton Waddams, who was relocated to the basement with boxes of legacy documents."
|
||||
authors = [{name = "Ryan Malloy", email = "ryan@supported.systems"}]
|
||||
readme = "README.md"
|
||||
@ -64,9 +64,9 @@ enhanced = [
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/ryanmalloy/mcwaddams"
|
||||
Repository = "https://github.com/ryanmalloy/mcwaddams"
|
||||
Issues = "https://github.com/ryanmalloy/mcwaddams/issues"
|
||||
Homepage = "https://mcwaddams.l.supported.systems"
|
||||
Repository = "https://git.supported.systems/MCP/mcwaddams"
|
||||
Issues = "https://git.supported.systems/MCP/mcwaddams/issues"
|
||||
|
||||
[project.scripts]
|
||||
mcwaddams = "mcwaddams.server:main"
|
||||
@ -79,13 +79,46 @@ build-backend = "hatchling.build"
|
||||
packages = ["src/mcwaddams"]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
# Belt: only ship what's listed here.
|
||||
include = [
|
||||
"/src",
|
||||
"/tests",
|
||||
"/examples",
|
||||
"/README.md",
|
||||
"/LICENSE",
|
||||
]
|
||||
# Suspenders: even if a file matches an include glob, drop it if it matches here.
|
||||
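# These guard against accidental inclusion if the include list grows.
# NOTE: because exclude wins over include, the "tests" and "examples" patterns
# below also drop the "/tests" and "/examples" entries listed in include above.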
|
||||
exclude = [
|
||||
"CLAUDE.md",
|
||||
".env",
|
||||
".env.*",
|
||||
".mcp.json",
|
||||
".pytest_cache",
|
||||
".ruff_cache",
|
||||
".mypy_cache",
|
||||
"htmlcov",
|
||||
"dist",
|
||||
"build",
|
||||
"reports",
|
||||
"audits",
|
||||
"tests",
|
||||
"examples",
|
||||
"docs",
|
||||
"ADVANCED_TOOLS_PLAN.md",
|
||||
"IMPLEMENTATION_STATUS.md",
|
||||
"QUICKSTART_DASHBOARD.md",
|
||||
"TESTING_STRATEGY.md",
|
||||
"*.docx",
|
||||
"*.xlsx",
|
||||
"*.pptx",
|
||||
"*.doc",
|
||||
"*.xls",
|
||||
"*.ppt",
|
||||
"*.json",
|
||||
"Dockerfile",
|
||||
"docker-compose.yml",
|
||||
"Makefile",
|
||||
"uv.lock",
|
||||
]
|
||||
|
||||
# Code quality tools
|
||||
[tool.black]
|
||||
|
||||
@ -15,6 +15,7 @@ from ..utils import (
|
||||
resolve_field_defaults,
|
||||
handle_office_errors
|
||||
)
|
||||
from ..utils.decorators import cleanup_temp_uploads
|
||||
|
||||
|
||||
# Common field description for file_content parameter
|
||||
@ -38,6 +39,7 @@ class ExcelMixin(MCPMixin):
|
||||
detect_data_types=True,
|
||||
check_data_quality=True
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def analyze_excel_data(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Excel document or URL"),
|
||||
@ -183,6 +185,7 @@ class ExcelMixin(MCPMixin):
|
||||
include_values=True,
|
||||
analyze_dependencies=True
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def extract_excel_formulas(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Excel document or URL"),
|
||||
@ -294,6 +297,7 @@ class ExcelMixin(MCPMixin):
|
||||
y_columns=[],
|
||||
output_format="chartjs"
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def create_excel_chart_data(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Excel document or URL"),
|
||||
|
||||
@ -14,6 +14,7 @@ from ..utils import (
|
||||
resolve_office_file_path,
|
||||
validate_office_file,
|
||||
)
|
||||
from ..utils.decorators import cleanup_temp_uploads
|
||||
from ..resources import resource_store, EmbeddedResource, ResourceStore
|
||||
|
||||
|
||||
@ -31,6 +32,7 @@ class UniversalMixin(MCPMixin):
|
||||
name="extract_text",
|
||||
description="Extract text content from Office documents with intelligent method selection. Supports Word (.docx, .doc), Excel (.xlsx, .xls), PowerPoint (.pptx, .ppt), and CSV files. Uses multi-library fallback for maximum compatibility."
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def extract_text(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
@ -90,6 +92,7 @@ class UniversalMixin(MCPMixin):
|
||||
name="extract_images",
|
||||
description="Extract images from Office documents with size filtering and format conversion."
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def extract_images(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
@ -142,6 +145,7 @@ class UniversalMixin(MCPMixin):
|
||||
name="extract_metadata",
|
||||
description="Extract comprehensive metadata from Office documents."
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def extract_metadata(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
@ -183,6 +187,7 @@ class UniversalMixin(MCPMixin):
|
||||
name="detect_office_format",
|
||||
description="Intelligent Office document format detection and analysis."
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def detect_office_format(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
@ -208,6 +213,7 @@ class UniversalMixin(MCPMixin):
|
||||
name="analyze_document_health",
|
||||
description="Comprehensive document health and integrity analysis."
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def analyze_document_health(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
@ -359,6 +365,7 @@ class UniversalMixin(MCPMixin):
|
||||
name="index_document",
|
||||
description="Scan and index all resources in a document (images, chapters, sheets, slides). Returns resource URIs that can be fetched individually. Use this before accessing resources via their URIs."
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def index_document(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
|
||||
@ -15,6 +15,7 @@ from ..utils import (
|
||||
resolve_field_defaults,
|
||||
handle_office_errors
|
||||
)
|
||||
from ..utils.decorators import cleanup_temp_uploads
|
||||
from ..pagination import paginate_document_conversion, PaginationParams
|
||||
|
||||
|
||||
@ -48,6 +49,7 @@ class WordMixin(MCPMixin):
|
||||
session_id=None,
|
||||
return_all=False
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def convert_to_markdown(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Office document or URL"),
|
||||
@ -280,6 +282,7 @@ class WordMixin(MCPMixin):
|
||||
preserve_merged_cells=True,
|
||||
include_headers=True
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def extract_word_tables(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -457,6 +460,7 @@ class WordMixin(MCPMixin):
|
||||
extract_outline=True,
|
||||
analyze_styles=True
|
||||
)
|
||||
@cleanup_temp_uploads
|
||||
async def analyze_word_structure(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -653,6 +657,7 @@ class WordMixin(MCPMixin):
|
||||
description="Get a clean, structured outline of a Word document showing all headings, sections, and chapters with their locations. Perfect for understanding document structure before reading."
|
||||
)
|
||||
@handle_office_errors("Document outline")
|
||||
@cleanup_temp_uploads
|
||||
async def get_document_outline(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -774,6 +779,7 @@ class WordMixin(MCPMixin):
|
||||
description="Analyze a Word document for style inconsistencies, formatting issues, and potential problems like mismatched heading styles or missing chapters."
|
||||
)
|
||||
@handle_office_errors("Style consistency check")
|
||||
@cleanup_temp_uploads
|
||||
async def check_style_consistency(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -932,6 +938,7 @@ class WordMixin(MCPMixin):
|
||||
description="Search for text within a Word document and return matches with surrounding context and location information."
|
||||
)
|
||||
@handle_office_errors("Document search")
|
||||
@cleanup_temp_uploads
|
||||
async def search_document(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -1019,6 +1026,7 @@ class WordMixin(MCPMixin):
|
||||
description="Extract named entities (people, places, organizations) from a Word document using pattern-based recognition. Great for identifying key characters, locations, and institutions mentioned in the text."
|
||||
)
|
||||
@handle_office_errors("Entity extraction")
|
||||
@cleanup_temp_uploads
|
||||
async def extract_entities(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -1230,6 +1238,7 @@ class WordMixin(MCPMixin):
|
||||
description="Get brief summaries/previews of each chapter in a Word document. Extracts the opening sentences of each chapter to give a quick overview of content."
|
||||
)
|
||||
@handle_office_errors("Chapter summaries")
|
||||
@cleanup_temp_uploads
|
||||
async def get_chapter_summaries(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document or URL"),
|
||||
@ -1330,6 +1339,7 @@ class WordMixin(MCPMixin):
|
||||
description="Save your reading progress in a Word document. Creates a bookmark file to track which chapter/paragraph you're on, so you can resume reading later."
|
||||
)
|
||||
@handle_office_errors("Save reading progress")
|
||||
@cleanup_temp_uploads
|
||||
async def save_reading_progress(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document"),
|
||||
@ -1400,6 +1410,7 @@ class WordMixin(MCPMixin):
|
||||
description="Retrieve your saved reading progress for a Word document. Shows where you left off and your reading history."
|
||||
)
|
||||
@handle_office_errors("Get reading progress")
|
||||
@cleanup_temp_uploads
|
||||
async def get_reading_progress(
|
||||
self,
|
||||
file_path: str = Field(description="Path to Word document"),
|
||||
|
||||
@ -2,228 +2,318 @@
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import uuid
|
||||
import logging
|
||||
import hashlib
|
||||
import tempfile
|
||||
import base64
|
||||
import zipfile
|
||||
import io
|
||||
from contextvars import ContextVar
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional, Dict, Any, AsyncIterator, List
|
||||
import aiofiles
|
||||
import aiohttp
|
||||
from urllib.parse import urlparse
|
||||
from .validation import OfficeFileError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Truthy values accepted for boolean env vars
|
||||
_TRUTHY = frozenset({"true", "1", "yes", "on"})
|
||||
|
||||
# Allowlist of file extensions accepted from base64 uploads.
|
||||
# Anything else gets coerced to .bin to avoid arbitrary-extension writes.
|
||||
_ALLOWED_UPLOAD_EXTENSIONS = frozenset({
|
||||
".docx", ".doc",
|
||||
".xlsx", ".xls", ".xlsm",
|
||||
".pptx", ".ppt",
|
||||
".csv", ".txt",
|
||||
".dotx", ".xltx", ".potx",
|
||||
})
|
||||
|
||||
# Defaults for size limits — overridable via env vars.
|
||||
_DEFAULT_MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
|
||||
_DEFAULT_MAX_DOWNLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
|
||||
|
||||
|
||||
# Per-request tracker for temp upload files. Set by tool-method decorator;
|
||||
# _resolve_from_content registers paths here so the decorator can clean them up
|
||||
# on exit (success OR exception). ContextVar is async-safe across concurrent tools.
|
||||
_upload_tracker: ContextVar[Optional[List[Path]]] = ContextVar(
|
||||
"mcwaddams_upload_tracker", default=None
|
||||
)
|
||||
|
||||
|
||||
def _register_temp_upload(path: Path) -> None:
    """Record ``path`` with the active upload tracker, if there is one.

    When ``_upload_tracker`` holds no list (its default is ``None``) the call
    is a no-op and nothing is remembered for later cleanup.
    """
    active = _upload_tracker.get()
    if active is None:
        return
    active.append(path)
|
||||
|
||||
|
||||
@asynccontextmanager
async def upload_cleanup_scope() -> AsyncIterator[None]:
    """Delete every temp upload registered while the scope was active.

    A fresh list is installed in ``_upload_tracker`` for the duration of the
    ``async with`` body. On exit -- normal return or exception -- the previous
    tracker value is restored and each recorded path is unlinked. A failed
    unlink is logged as a warning and does not stop the remaining deletions.
    """
    pending: List[Path] = []
    reset_token = _upload_tracker.set(pending)
    try:
        yield
    finally:
        # Restore the prior tracker before deleting anything.
        _upload_tracker.reset(reset_token)
        for temp_file in pending:
            try:
                temp_file.unlink(missing_ok=True)
            except OSError as exc:
                logger.warning("Failed to clean up temp upload %s: %s", temp_file, exc)
|
||||
|
||||
|
||||
def _env_truthy(value: Optional[str]) -> bool:
    """Tell whether an env var string spells a truthy value.

    ``None`` is false. Otherwise the value is stripped of surrounding
    whitespace, lower-cased, and looked up in ``_TRUTHY``.
    """
    return value is not None and value.strip().lower() in _TRUTHY
|
||||
|
||||
|
||||
# Environment variable to control local file access
|
||||
# Default depends on transport mode:
|
||||
# - stdio (local): allow local files by default
|
||||
# - streamable-http (remote): block local files by default
|
||||
def _get_allow_local_files() -> bool:
    """Determine if local file access is allowed.

    Reads the environment on every call, so changes made after import
    (test harnesses, embedded usage) are honored.

    Returns:
        The truthiness of ``MCP_ALLOW_LOCAL_FILES`` (as parsed by
        ``_env_truthy``) when that variable is set; otherwise ``True`` only
        when ``MCP_TRANSPORT`` is unset, empty, or ``stdio``.
    """
    explicit = os.environ.get("MCP_ALLOW_LOCAL_FILES")
    if explicit is not None:
        # An explicit setting always wins over the transport-based default.
        return _env_truthy(explicit)

    # Not explicitly set: default based on transport mode. ``or "stdio"``
    # treats an empty MCP_TRANSPORT the same as an unset one.
    transport = (os.environ.get("MCP_TRANSPORT") or "stdio").strip().lower()
    return transport == "stdio"
|
||||
|
||||
MCP_ALLOW_LOCAL_FILES = _get_allow_local_files()
|
||||
|
||||
def _get_max_upload_bytes() -> int:
    """Return the size cap, in decoded bytes, for base64 uploads.

    ``MCP_MAX_UPLOAD_BYTES`` overrides the default when it parses as an
    integer; the parsed value is clamped to at least 1. An unparsable value
    is logged and the default is used instead.
    """
    configured = os.environ.get("MCP_MAX_UPLOAD_BYTES")
    if configured is None:
        return _DEFAULT_MAX_UPLOAD_BYTES
    try:
        parsed = int(configured.strip())
    except ValueError:
        logger.warning(
            "MCP_MAX_UPLOAD_BYTES=%r is not a valid integer; using default %d",
            configured, _DEFAULT_MAX_UPLOAD_BYTES,
        )
        return _DEFAULT_MAX_UPLOAD_BYTES
    return max(1, parsed)
|
||||
|
||||
|
||||
def _get_max_download_bytes() -> int:
    """Return the size cap, in bytes, for URL downloads.

    ``MCP_MAX_DOWNLOAD_BYTES`` overrides the default when it parses as an
    integer; the parsed value is clamped to at least 1. An unparsable value
    is logged and the default is used instead.
    """
    configured = os.environ.get("MCP_MAX_DOWNLOAD_BYTES")
    if configured is None:
        return _DEFAULT_MAX_DOWNLOAD_BYTES
    try:
        parsed = int(configured.strip())
    except ValueError:
        logger.warning(
            "MCP_MAX_DOWNLOAD_BYTES=%r is not a valid integer; using default %d",
            configured, _DEFAULT_MAX_DOWNLOAD_BYTES,
        )
        return _DEFAULT_MAX_DOWNLOAD_BYTES
    return max(1, parsed)
|
||||
|
||||
|
||||
class OfficeFileCache:
    """Simple file cache for downloaded Office documents.

    Each URL is stored as ``<sha256(url)>.office`` inside ``cache_dir`` and
    described by an entry in ``cache_metadata.json`` in the same directory.
    """

    def __init__(self, cache_dir: Optional[str] = None, cache_duration: int = 3600):
        """Initialize cache with optional custom directory and duration.

        Args:
            cache_dir: Custom cache directory. If None, uses
                ``<system temp>/mcp_office_cache``.
            cache_duration: Cache duration in seconds (default: 1 hour).
        """
        if cache_dir:
            self.cache_dir = Path(cache_dir)
        else:
            self.cache_dir = Path(tempfile.gettempdir()) / "mcp_office_cache"

        self.cache_duration = cache_duration
        # parents=True lets a nested custom cache_dir be created in one go.
        # The 0o700 mode only applies when the leaf directory is newly created.
        self.cache_dir.mkdir(parents=True, exist_ok=True, mode=0o700)

        self.metadata_file = self.cache_dir / "cache_metadata.json"
        self._metadata = self._load_metadata()

    def _load_metadata(self) -> Dict[str, Any]:
        """Load cache metadata, tolerant of missing or corrupt files.

        Returns:
            The parsed metadata mapping, or ``{}`` when the file is missing,
            unreadable, not valid JSON/UTF-8, or not a JSON object.
        """
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return {}
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Cache metadata unreadable (%s); starting fresh.", e)
            return {}

        if not isinstance(loaded, dict):
            # Every other method indexes the metadata by cache key.
            logger.warning(
                "Cache metadata unreadable (%s); starting fresh.",
                f"expected a JSON object, got {type(loaded).__name__}",
            )
            return {}
        return loaded

    def _save_metadata(self) -> None:
        """Save cache metadata; log on failure rather than swallow silently."""
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump(self._metadata, f, indent=2)
        except OSError as e:
            logger.warning("Failed to write cache metadata: %s", e)

    def _get_cache_key(self, url: str) -> str:
        """Generate cache key for URL (hex SHA-256 of the URL text)."""
        return hashlib.sha256(url.encode()).hexdigest()

    def _get_cache_path(self, cache_key: str) -> Path:
        """Get cache file path for cache key."""
        return self.cache_dir / f"{cache_key}.office"

    def _discard_file(self, path: Path) -> None:
        """Best-effort delete of ``path``; a failure is logged, never raised."""
        try:
            path.unlink(missing_ok=True)
        except OSError as e:
            logger.warning("Failed to unlink cache file %s: %s", path, e)

    def is_cached(self, url: str) -> bool:
        """Check if URL is cached and still valid.

        Entries whose file is gone or whose age exceeds ``cache_duration``
        are removed as a side effect.
        """
        cache_key = self._get_cache_key(url)

        if cache_key not in self._metadata:
            return False

        cache_info = self._metadata[cache_key]
        cache_path = self._get_cache_path(cache_key)

        if not cache_path.exists():
            del self._metadata[cache_key]
            self._save_metadata()
            return False

        cache_time = cache_info.get('cached_at', 0)
        if time.time() - cache_time > self.cache_duration:
            self._remove_cache_entry(cache_key)
            return False

        return True

    def get_cached_path(self, url: str) -> Optional[str]:
        """Get cached file path for URL if available, else ``None``."""
        if not self.is_cached(url):
            return None
        cache_key = self._get_cache_key(url)
        return str(self._get_cache_path(cache_key))

    async def cache_url(self, url: str, timeout: int = 30) -> str:
        """Download and cache file from URL with size-cap enforcement.

        Args:
            url: Address to fetch.
            timeout: Passed through to the HTTP GET.

        Returns:
            Path of the cached file as a string.

        Raises:
            OfficeFileError: If the declared or streamed size exceeds the
                download cap, or wrapping any other download failure.
        """
        cache_key = self._get_cache_key(url)
        cache_path = self._get_cache_path(cache_key)
        max_bytes = _get_max_download_bytes()

        completed = False
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=timeout) as response:
                    response.raise_for_status()

                    # Reject based on declared Content-Length before reading.
                    content_length_hdr = response.headers.get('content-length')
                    declared: Optional[int] = None
                    if content_length_hdr is not None:
                        try:
                            declared = int(content_length_hdr)
                        except ValueError:
                            # Malformed header: rely on chunk-counted enforcement.
                            declared = None
                    # Raised outside the try above so the size error can never
                    # be mistaken for a header-parsing ValueError.
                    if declared is not None and declared > max_bytes:
                        raise OfficeFileError(
                            f"Remote file too large: {declared} bytes "
                            f"(max {max_bytes})"
                        )

                    content_type = response.headers.get('content-type', '')
                    last_modified = response.headers.get('last-modified')

                    bytes_written = 0
                    async with aiofiles.open(cache_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(8192):
                            bytes_written += len(chunk)
                            if bytes_written > max_bytes:
                                raise OfficeFileError(
                                    f"Remote file exceeded {max_bytes} bytes during download"
                                )
                            await f.write(chunk)

                    self._metadata[cache_key] = {
                        'url': url,
                        'cached_at': time.time(),
                        'content_type': content_type,
                        'content_length': content_length_hdr,
                        'last_modified': last_modified,
                        'file_size': cache_path.stat().st_size,
                    }
                    self._save_metadata()

                    completed = True
                    return str(cache_path)

        except OfficeFileError:
            raise
        except Exception as e:
            raise OfficeFileError(f"Failed to download and cache file: {str(e)}") from e
        finally:
            # Runs for errors and for task cancellation alike, so a partial
            # download is never left behind under the cache key's filename.
            if not completed:
                self._discard_file(cache_path)

    def _remove_cache_entry(self, cache_key: str) -> None:
        """Remove cache entry and file."""
        self._discard_file(self._get_cache_path(cache_key))

        if cache_key in self._metadata:
            del self._metadata[cache_key]
            self._save_metadata()

    def clear_cache(self) -> None:
        """Clear all cached files."""
        # Snapshot the keys: _remove_cache_entry mutates the mapping.
        for cache_key in list(self._metadata.keys()):
            self._remove_cache_entry(cache_key)

    def cleanup_expired(self) -> int:
        """Remove expired cache entries. Returns number of entries removed."""
        current_time = time.time()
        expired_keys = [
            cache_key
            for cache_key, cache_info in self._metadata.items()
            if current_time - cache_info.get('cached_at', 0) > self.cache_duration
        ]

        for cache_key in expired_keys:
            self._remove_cache_entry(cache_key)

        return len(expired_keys)

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics (entry count, bytes on disk, expired count)."""
        total_files = len(self._metadata)
        total_size = 0
        expired_count = 0
        current_time = time.time()

        for cache_key, cache_info in self._metadata.items():
            try:
                total_size += self._get_cache_path(cache_key).stat().st_size
            except FileNotFoundError:
                # File vanished since it was cached; it contributes no bytes.
                pass

            cache_time = cache_info.get('cached_at', 0)
            if current_time - cache_time > self.cache_duration:
                expired_count += 1

        return {
            'total_files': total_files,
            'total_size_bytes': total_size,
            'total_size_mb': round(total_size / (1024 * 1024), 2),
            'expired_files': expired_count,
            'cache_directory': str(self.cache_dir),
            'cache_duration_hours': self.cache_duration / 3600,
        }
|
||||
|
||||
|
||||
# Global cache instance
|
||||
_global_cache: Optional[OfficeFileCache] = None
|
||||
|
||||
|
||||
def get_cache() -> OfficeFileCache:
|
||||
"""Get global cache instance."""
|
||||
global _global_cache
|
||||
if _global_cache is None:
|
||||
_global_cache = OfficeFileCache()
|
||||
@ -234,9 +324,9 @@ async def resolve_office_file_path(
|
||||
file_path: str,
|
||||
use_cache: bool = True,
|
||||
file_content: Optional[str] = None,
|
||||
filename: Optional[str] = None
|
||||
filename: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Resolve file path, downloading from URL if necessary, or decode inline content.
|
||||
"""Resolve a file reference to a local path.
|
||||
|
||||
Args:
|
||||
file_path: Local file path or URL (ignored if file_content provided)
|
||||
@ -245,105 +335,196 @@ async def resolve_office_file_path(
|
||||
filename: Original filename for extension detection (used with file_content)
|
||||
|
||||
Returns:
|
||||
Local file path (temp file if from content, downloaded if from URL)
|
||||
Local file path. Callers MUST clean up temp files when file_content was used —
|
||||
prefer the `resolved_office_file()` context manager which handles cleanup automatically.
|
||||
|
||||
Security:
|
||||
When MCP_ALLOW_LOCAL_FILES=false (default for HTTP transport):
|
||||
- Local file paths are rejected
|
||||
- Only URLs and file_content are allowed
|
||||
- This prevents hosted servers from accessing server-side files
|
||||
When MCP_ALLOW_LOCAL_FILES is false (default for HTTP transport),
|
||||
local filesystem paths are rejected. Env var is read at request time.
|
||||
"""
|
||||
# Priority 1: If file_content is provided, decode and write to temp file
|
||||
if file_content:
|
||||
return await _resolve_from_content(file_content, filename or file_path)
|
||||
|
||||
# Check if it's a URL
|
||||
parsed = urlparse(file_path)
|
||||
is_url = bool(parsed.scheme and parsed.netloc)
|
||||
|
||||
if not is_url:
|
||||
# Local file path - check if allowed
|
||||
if not MCP_ALLOW_LOCAL_FILES:
|
||||
if not _get_allow_local_files():
|
||||
raise OfficeFileError(
|
||||
"Local file access is disabled for this server. "
|
||||
"Please use file_content parameter to upload document data, "
|
||||
"or provide a URL. Set MCP_ALLOW_LOCAL_FILES=true to enable local files."
|
||||
"Use the file_content parameter to upload document data, "
|
||||
"or provide an https:// URL."
|
||||
)
|
||||
return file_path
|
||||
|
||||
# Validate URL scheme
|
||||
if parsed.scheme not in ['http', 'https']:
|
||||
raise OfficeFileError(f"Unsupported URL scheme: {parsed.scheme}")
|
||||
|
||||
cache = get_cache()
|
||||
|
||||
# Check cache first
|
||||
if use_cache and cache.is_cached(file_path):
|
||||
cached_path = cache.get_cached_path(file_path)
|
||||
if cached_path:
|
||||
return cached_path
|
||||
|
||||
# Download and cache
|
||||
if use_cache:
|
||||
return await cache.cache_url(file_path)
|
||||
else:
|
||||
# Direct download without caching
|
||||
from .validation import download_office_file
|
||||
return await download_office_file(file_path)
|
||||
|
||||
|
||||
async def _resolve_from_content(file_content: str, filename_hint: str) -> str:
|
||||
"""Decode base64 content and write to a temp file.
|
||||
@asynccontextmanager
async def resolved_office_file(
    file_path: str,
    use_cache: bool = True,
    file_content: Optional[str] = None,
    filename: Optional[str] = None,
) -> AsyncIterator[str]:
    """Async context manager: resolves path, cleans up temp files on exit.

    Use this from tool implementations instead of calling
    resolve_office_file_path directly when a base64 upload might be involved.
    The resolved path is deleted on exit (success OR exception) only when
    ``file_content`` was supplied; paths resolved from a URL or a local file
    are left alone.

    Args:
        file_path: Local file path or URL, forwarded to resolve_office_file_path.
        use_cache: Forwarded to resolve_office_file_path.
        file_content: Base64-encoded file data, forwarded unchanged.
        filename: Original filename, forwarded unchanged.

    Yields:
        The local path returned by resolve_office_file_path.

    Example:
        async with resolved_office_file(file_path, file_content=file_content) as local_path:
            return process(local_path)
    """
    is_temp_upload = bool(file_content)
    # Stays None if resolution itself raises, so the finally block has nothing to delete.
    local_path: Optional[str] = None
    try:
        local_path = await resolve_office_file_path(
            file_path,
            use_cache=use_cache,
            file_content=file_content,
            filename=filename,
        )
        yield local_path
    finally:
        if is_temp_upload and local_path:
            try:
                Path(local_path).unlink(missing_ok=True)
            except OSError as e:
                logger.warning("Failed to clean up temp upload %s: %s", local_path, e)
|
||||
|
||||
|
||||
def _scrub_temp_path(message: str) -> str:
|
||||
"""Remove server-side temp upload paths from error messages."""
|
||||
temp_root = str(Path(tempfile.gettempdir()) / "mcp_office_uploads")
|
||||
if temp_root in message:
|
||||
return message.replace(temp_root, "<uploaded file>")
|
||||
return message
|
||||
|
||||
|
||||
async def _resolve_from_content(file_content: str, filename_hint: str) -> str:
    """Decode base64 content and write to a temp file with strict validation.

    Args:
        file_content: Base64-encoded file data.
        filename_hint: Filename or path whose suffix is used as the extension
            when it is in ``_ALLOWED_UPLOAD_EXTENSIONS``.

    Returns:
        Path of the newly created temp file, as a string. The path is also
        registered with the active upload tracker (if any) for cleanup.

    Raises:
        OfficeFileError: If the encoded or decoded size exceeds the upload
            cap, or the content is not valid base64.
    """
    # Pre-decode size check (base64 expands by ~4/3), so an oversized payload
    # is rejected before the decoded copy is allocated.
    max_bytes = _get_max_upload_bytes()
    encoded_len = len(file_content)
    max_encoded = (max_bytes * 4 // 3) + 4  # +4 for padding slack
    if encoded_len > max_encoded:
        raise OfficeFileError(
            f"Upload too large: encoded size {encoded_len} exceeds limit "
            f"(max {max_bytes} decoded bytes). Set MCP_MAX_UPLOAD_BYTES to override."
        )

    # Strict base64 decode — rejects garbage instead of silently mangling.
    # binascii.Error is a ValueError subclass, so one clause covers both.
    try:
        content_bytes = base64.b64decode(file_content, validate=True)
    except ValueError as e:
        raise OfficeFileError(f"Invalid base64 content: {str(e)}") from e

    if len(content_bytes) > max_bytes:
        raise OfficeFileError(
            f"Upload too large: {len(content_bytes)} bytes (max {max_bytes}). "
            f"Set MCP_MAX_UPLOAD_BYTES to override."
        )

    # Extension determination — allowlist first; anything else falls back to
    # whatever _detect_extension_from_bytes reports for the decoded bytes.
    raw_ext = Path(filename_hint).suffix.lower() if filename_hint else ""
    if raw_ext and raw_ext in _ALLOWED_UPLOAD_EXTENSIONS:
        ext = raw_ext
    else:
        ext = _detect_extension_from_bytes(content_bytes)

    # Locked-down temp dir: owner-only access
    temp_dir = Path(tempfile.gettempdir()) / "mcp_office_uploads"
    temp_dir.mkdir(exist_ok=True, mode=0o700)
    # Re-apply mode in case dir existed with looser perms
    try:
        os.chmod(temp_dir, 0o700)
    except OSError as e:
        # Still best-effort, but leave a trace: the directory may be more
        # accessible than intended.
        logger.warning("Could not restrict permissions on %s: %s", temp_dir, e)

    # Unique filename — UUID prevents concurrent-write collisions
    unique = uuid.uuid4().hex[:16]
    temp_path = temp_dir / f"upload_{unique}{ext}"

    # O_EXCL ensures we never overwrite an existing file; 0o600 keeps the
    # upload readable by the owner only.
    fd = os.open(
        str(temp_path),
        os.O_WRONLY | os.O_CREAT | os.O_EXCL,
        0o600,
    )
    written = False
    try:
        async with aiofiles.open(fd, 'wb') as f:
            await f.write(content_bytes)
        written = True
    finally:
        # finally (not ``except Exception``) so task cancellation also removes
        # the partially written file.
        if not written:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError:
                pass

    # Register for automatic cleanup by the active tool-method scope (if any).
    _register_temp_upload(temp_path)

    return str(temp_path)
|
||||
|
||||
|
||||
def _detect_extension_from_bytes(content: bytes) -> str:
    """Detect a file extension from magic bytes, with ZIP disambiguation.

    Args:
        content: Raw (already base64-decoded) upload bytes.

    Returns:
        The result of ``_disambiguate_zip_format(content)`` for ZIP
        containers, ``".doc"`` for OLE compound documents, ``".csv"`` for
        delimiter-separated UTF-8 text, and ``".bin"`` for anything else.
    """
    import codecs  # local import: only the CSV sniff below needs it

    # ZIP-based formats — peek inside to tell docx/xlsx/pptx apart
    if content[:4] == b'PK\x03\x04':
        return _disambiguate_zip_format(content)

    # OLE Compound Document (doc, xls, ppt) — all three share this signature
    if content[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
        return ".doc"

    # CSV heuristic: a comma, tab or semicolon in the first KB, no NUL bytes,
    # and the sampled bytes must be valid UTF-8.
    sniff_len = 1024
    head = content[:sniff_len]
    has_delimiter = any(delim in head for delim in (b',', b'\t', b';'))
    if head and b'\x00' not in head and has_delimiter:
        # The sample may end in the middle of a multi-byte UTF-8 sequence.
        # An incremental decoder tolerates an incomplete tail unless the
        # sample is the whole payload (final=True), while still rejecting
        # genuinely invalid bytes.
        decoder = codecs.getincrementaldecoder('utf-8')()
        try:
            decoder.decode(head, final=len(content) <= sniff_len)
        except UnicodeDecodeError:
            pass
        else:
            return ".csv"

    return ".bin"
|
||||
|
||||
|
||||
def _disambiguate_zip_format(content: bytes) -> str:
    """Inspect [Content_Types].xml inside a ZIP to identify the Office format.

    Args:
        content: Raw bytes of a payload that starts with the ZIP signature.

    Returns:
        ``".docx"``, ``".xlsx"`` or ``".pptx"`` when the content-types part
        names the matching OOXML family; ``".bin"`` when the archive is
        unreadable, has no ``[Content_Types].xml``, or names none of them.
    """
    import zlib  # local import: only needed to name zlib.error below

    # Cap how much of the member is decompressed so a crafted archive cannot
    # make this sniff inflate an arbitrarily large part into memory.
    max_types_bytes = 1024 * 1024

    try:
        with zipfile.ZipFile(io.BytesIO(content)) as zf:
            with zf.open("[Content_Types].xml") as member:
                raw_types = member.read(max_types_bytes)
    except (
        KeyError,            # archive has no [Content_Types].xml member
        zipfile.BadZipFile,  # not a valid ZIP / CRC mismatch
        zlib.error,          # corrupt deflate stream
        EOFError,            # truncated member data
        RuntimeError,        # encrypted member; also covers NotImplementedError
        OSError,
    ):
        # Untrusted input: any failure to read the part means "unknown".
        return ".bin"

    types_xml = raw_types.decode("utf-8", errors="replace")
    # Checked in the same order as before: word, spreadsheet, presentation.
    for marker, ext in (
        ("wordprocessingml", ".docx"),
        ("spreadsheetml", ".xlsx"),
        ("presentationml", ".pptx"),
    ):
        if marker in types_xml:
            return ext

    return ".bin"
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
"""
|
||||
Decorators for MCP Office Tools.
|
||||
|
||||
Provides common patterns for error handling and Pydantic field resolution.
|
||||
Provides common patterns for error handling, temp-file cleanup,
|
||||
and Pydantic field resolution.
|
||||
"""
|
||||
|
||||
from functools import wraps
|
||||
@ -9,11 +10,34 @@ from typing import Any, Callable, TypeVar
|
||||
|
||||
from pydantic.fields import FieldInfo
|
||||
|
||||
from .caching import upload_cleanup_scope
|
||||
from .validation import OfficeFileError
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
def cleanup_temp_uploads(func: Callable[..., T]) -> Callable[..., T]:
    """Run the wrapped coroutine inside an ``upload_cleanup_scope()``.

    Apply this to every MCP tool method that may receive a `file_content`
    parameter. The scope is entered before the tool body runs and exited
    when it returns or raises; temp files written by `_resolve_from_content`
    during the call are meant to be removed at that point. Intended to be
    safe for concurrent calls (the scope is described as ContextVar-based,
    i.e. isolated per task).

    Usage:
        @mcp_tool(...)
        @cleanup_temp_uploads
        async def extract_text(self, file_path: str, file_content: str = None, ...):
            local_path = await resolve_office_file_path(file_path, file_content=file_content, ...)
            return process(local_path)
    """
    async def _call_within_scope(*call_args, **call_kwargs):
        # Returning from inside the scope keeps cleanup ahead of the caller
        # seeing the result, on both the success and the exception path.
        async with upload_cleanup_scope():
            return await func(*call_args, **call_kwargs)

    # wraps() copies the tool's name, docstring and signature metadata.
    return wraps(func)(_call_within_scope)
|
||||
|
||||
|
||||
def resolve_field_defaults(**defaults: Any) -> Callable:
|
||||
"""
|
||||
Decorator to resolve Pydantic Field defaults for direct function calls.
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"""File validation utilities for Office documents."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
@ -16,8 +17,19 @@ except ImportError:
|
||||
|
||||
|
||||
class OfficeFileError(Exception):
    """Custom exception for Office file processing errors.

    Sanitizes internal upload temp paths from messages so server-side
    paths never leak to remote (HTTP transport) callers.
    """

    # Path prefix to strip from error messages (single source of truth)
    _UPLOAD_TEMP_PREFIX = str(Path(tempfile.gettempdir()) / "mcp_office_uploads")

    def __init__(self, message: str = ""):
        """Store *message* with any upload temp path replaced.

        Args:
            message: Human-readable error text. A truthy non-string value
                (for example a wrapped exception) is converted with ``str()``
                so it can be scrubbed instead of raising ``TypeError``.
        """
        import re  # local import: only needed when a path must be scrubbed

        if message:
            if not isinstance(message, str):
                message = str(message)
            if self._UPLOAD_TEMP_PREFIX in message:
                # Replace the whole temp path (directory plus any trailing
                # file name), not just the directory prefix, so neither the
                # server path nor the generated file name is exposed.
                temp_path = re.escape(self._UPLOAD_TEMP_PREFIX) + r"(?:[\\/][\w.\-]*)*"
                message = re.sub(temp_path, "<uploaded file>", message)
        super().__init__(message)
|
||||
|
||||
|
||||
# Office format MIME types and extensions
|
||||
|
||||
285
tests/test_security_hardening.py
Normal file
285
tests/test_security_hardening.py
Normal file
@ -0,0 +1,285 @@
|
||||
"""Verification tests for Margaret Hamilton review blockers.
|
||||
|
||||
Each test maps directly to a finding from the pre-publish review. These tests
|
||||
exist to prove the fixes work as advertised and to prevent regressions.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from mcwaddams.utils.caching import (
|
||||
_detect_extension_from_bytes,
|
||||
_env_truthy,
|
||||
_get_allow_local_files,
|
||||
_get_max_upload_bytes,
|
||||
_resolve_from_content,
|
||||
resolve_office_file_path,
|
||||
upload_cleanup_scope,
|
||||
)
|
||||
from mcwaddams.utils.decorators import cleanup_temp_uploads
|
||||
from mcwaddams.utils.validation import OfficeFileError
|
||||
|
||||
|
||||
def _minimal_docx_bytes() -> bytes:
    """Build an in-memory ZIP whose only member is a Word [Content_Types].xml.

    The archive is just enough for magic-byte and content-type sniffing; it
    is not a document that Word could open.
    """
    import io
    import zipfile

    content_types_xml = (
        '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/'
        'package/2006/content-types"><Override PartName="/word/document.xml" '
        'ContentType="application/vnd.openxmlformats-officedocument.'
        'wordprocessingml.document.main+xml"/></Types>'
    )

    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w") as docx:
        docx.writestr("[Content_Types].xml", content_types_xml)
    return archive.getvalue()
|
||||
|
||||
|
||||
class TestB3_Base64Validation:
    """B3: b64decode must reject garbage instead of silently mangling."""

    @pytest.mark.asyncio
    async def test_garbage_input_rejected(self):
        """Input that is not base64 at all is refused with a clear error."""
        with pytest.raises(OfficeFileError, match="Invalid base64"):
            await _resolve_from_content("<html>not base64 at all</html>", "x.docx")

    @pytest.mark.asyncio
    async def test_non_alphabet_chars_in_b64_rejected(self):
        """Otherwise-valid base64 with stray non-alphabet characters is refused.

        Named for what it checks: the injected characters are ``!!``, not
        whitespace.
        """
        valid_b64 = base64.b64encode(b"hello").decode()
        # Inject a non-b64 char in the middle
        corrupted = valid_b64[:4] + "!!" + valid_b64[4:]
        with pytest.raises(OfficeFileError, match="Invalid base64"):
            await _resolve_from_content(corrupted, "x.txt")
|
||||
|
||||
|
||||
class TestB4_EnvVarAtRequestTime:
    """B4: env var must be read at request time, not import time."""

    @pytest.mark.asyncio
    async def test_env_change_after_import_takes_effect(self):
        """Changing MCP_TRANSPORT after import flips the local-file default."""
        expected_by_transport = (
            ("stdio", True),
            ("streamable-http", False),
        )
        for transport, expected in expected_by_transport:
            with patch.dict(os.environ, {"MCP_TRANSPORT": transport}, clear=False):
                # No explicit override: the transport alone decides the default.
                os.environ.pop("MCP_ALLOW_LOCAL_FILES", None)
                assert _get_allow_local_files() is expected
|
||||
|
||||
|
||||
class TestH1_EnvVarHygiene:
    """H1: env var parsing must tolerate whitespace, accept truthy set."""

    def test_truthy_values(self):
        """Every spelling in the truthy set, in any case or padding, is True."""
        accepted = ("true", "True", "TRUE", "1", "yes", "on", " true ", "YES\n")
        for raw in accepted:
            assert _env_truthy(raw) is True, f"Expected {raw!r} → True"

    def test_falsy_values(self):
        """Anything outside the truthy set, including None and '', is False."""
        rejected = ("false", "0", "no", "off", "", None, "maybe", "2")
        for raw in rejected:
            assert _env_truthy(raw) is False, f"Expected {raw!r} → False"

    @pytest.mark.asyncio
    async def test_transport_with_trailing_space_defaults_correctly(self):
        """A padded MCP_TRANSPORT value is still recognized as stdio."""
        padded_env = {"MCP_TRANSPORT": " stdio "}
        with patch.dict(os.environ, padded_env, clear=False):
            os.environ.pop("MCP_ALLOW_LOCAL_FILES", None)
            assert _get_allow_local_files() is True
|
||||
|
||||
|
||||
class TestB5_ExtensionAllowlist:
    """B5: only known Office extensions allowed; unknown coerced to .bin."""

    @pytest.mark.asyncio
    async def test_executable_extension_blocked(self):
        """A disallowed suffix is dropped in favor of the sniffed extension."""
        elf_header = b"\x7fELF\x02\x01\x01\x00"
        saved = await _resolve_from_content(base64.b64encode(elf_header).decode(), "evil.sh")
        try:
            saved_name = str(saved)
            assert not saved_name.endswith(".sh")
            assert saved_name.endswith(".bin")  # falls through to magic-bytes default
        finally:
            Path(saved).unlink(missing_ok=True)

    @pytest.mark.asyncio
    async def test_known_extension_preserved(self):
        """An allowlisted suffix from the filename hint is kept as-is."""
        encoded = base64.b64encode(_minimal_docx_bytes()).decode()
        saved = await _resolve_from_content(encoded, "report.docx")
        try:
            assert str(saved).endswith(".docx")
        finally:
            Path(saved).unlink(missing_ok=True)

    @pytest.mark.asyncio
    async def test_path_traversal_in_filename_ignored(self):
        """Directory parts of the filename hint never influence the target."""
        encoded = base64.b64encode(b"hello,world\n").decode()
        saved = await _resolve_from_content(encoded, "../../etc/passwd")
        try:
            saved_name = str(saved)
            # No path traversal — file lands in our temp dir, not /etc
            assert "/etc/" not in saved_name
            assert "mcp_office_uploads" in saved_name
        finally:
            Path(saved).unlink(missing_ok=True)
|
||||
|
||||
|
||||
class TestB2_SizeLimits:
    """B2: oversized base64 input must be rejected before/after decode."""

    @pytest.mark.asyncio
    async def test_oversized_input_rejected(self):
        """A payload above MCP_MAX_UPLOAD_BYTES raises a 'too large' error."""
        tiny_limit = {"MCP_MAX_UPLOAD_BYTES": "1024"}
        with patch.dict(os.environ, tiny_limit):
            # Create 2KB of data → 2.7KB base64
            oversized = base64.b64encode(b"A" * 2048).decode()
            with pytest.raises(OfficeFileError, match="too large"):
                await _resolve_from_content(oversized, "x.txt")

    def test_max_upload_bytes_env_parsing(self):
        """A numeric MCP_MAX_UPLOAD_BYTES value is used verbatim."""
        with patch.dict(os.environ, {"MCP_MAX_UPLOAD_BYTES": "12345"}):
            limit = _get_max_upload_bytes()
        assert limit == 12345

    def test_max_upload_bytes_bad_value_falls_back(self):
        """A non-numeric value must not raise and must yield a positive limit."""
        with patch.dict(os.environ, {"MCP_MAX_UPLOAD_BYTES": "not-a-number"}):
            # Should not raise; should log warning and use default
            limit = _get_max_upload_bytes()
        assert limit > 0
|
||||
|
||||
|
||||
class TestB1_TempFileCleanup:
    """B1: temp files must be cleaned up after each tool invocation."""

    @pytest.mark.asyncio
    async def test_cleanup_scope_removes_uploads(self):
        """The temp file exists inside the scope and is gone after it exits."""
        content = base64.b64encode(_minimal_docx_bytes()).decode()

        async with upload_cleanup_scope():
            path = await _resolve_from_content(content, "report.docx")
            assert Path(path).exists()

        # After the scope exits, the file must be gone
        assert not Path(path).exists(), (
            f"Temp file leaked: {path} still exists after scope exit"
        )

    @pytest.mark.asyncio
    async def test_cleanup_scope_removes_uploads_on_exception(self):
        """Cleanup also runs when the body of the scope raises."""
        content = base64.b64encode(_minimal_docx_bytes()).decode()
        inside_path = None

        with pytest.raises(RuntimeError, match="simulated"):
            async with upload_cleanup_scope():
                inside_path = await _resolve_from_content(content, "report.docx")
                assert Path(inside_path).exists()
                raise RuntimeError("simulated tool failure")

        # Guards against the resolve itself having failed before the raise.
        assert inside_path is not None
        assert not Path(inside_path).exists(), (
            "Temp file must be cleaned up even when tool raises"
        )

    @pytest.mark.asyncio
    async def test_decorator_applies_cleanup_to_tool_method(self):
        """@cleanup_temp_uploads removes the upload once the tool returns."""
        captured_path = {}

        @cleanup_temp_uploads
        async def fake_tool(file_content):
            path = await _resolve_from_content(file_content, "x.docx")
            captured_path["p"] = path
            return path

        content = base64.b64encode(_minimal_docx_bytes()).decode()
        result = await fake_tool(content)

        assert captured_path["p"] == result
        assert not Path(result).exists(), "Decorator failed to clean up"
|
||||
|
||||
|
||||
class TestH3_UniqueTempPaths:
    """H3: concurrent uploads with same content must not collide."""

    @pytest.mark.asyncio
    async def test_two_resolves_get_different_paths(self):
        """Resolving identical content twice yields two distinct temp files."""
        content = base64.b64encode(_minimal_docx_bytes()).decode()
        created = []
        try:
            # Resolve inside the try so a failure on the second call still
            # removes the file written by the first one.
            for _ in range(2):
                created.append(await _resolve_from_content(content, "x.docx"))
            p1, p2 = created
            assert p1 != p2, "Identical content must still get unique temp paths"
        finally:
            for leftover in created:
                Path(leftover).unlink(missing_ok=True)
|
||||
|
||||
|
||||
class TestH7_ZipDisambiguation:
    """H7: ZIP magic bytes must disambiguate docx/xlsx/pptx."""

    @staticmethod
    def _zip_bytes(member_name: str, member_text: str) -> bytes:
        """Return an in-memory ZIP archive holding a single text member."""
        import io
        import zipfile

        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w") as zf:
            zf.writestr(member_name, member_text)
        return buf.getvalue()

    def test_docx_detected_via_content_types(self):
        """A wordprocessingml content type maps to .docx."""
        ext = _detect_extension_from_bytes(_minimal_docx_bytes())
        assert ext == ".docx"

    def test_xlsx_detected_via_content_types(self):
        """A spreadsheetml content type maps to .xlsx."""
        payload = self._zip_bytes(
            "[Content_Types].xml",
            '<Types><Override ContentType="application/vnd.openxmlformats-'
            'officedocument.spreadsheetml.sheet.main+xml"/></Types>',
        )
        assert _detect_extension_from_bytes(payload) == ".xlsx"

    def test_pptx_detected_via_content_types(self):
        """A presentationml content type maps to .pptx."""
        payload = self._zip_bytes(
            "[Content_Types].xml",
            '<Types><Override ContentType="application/vnd.openxmlformats-'
            'officedocument.presentationml.presentation.main+xml"/></Types>',
        )
        assert _detect_extension_from_bytes(payload) == ".pptx"

    def test_unknown_zip_returns_bin(self):
        """A ZIP without [Content_Types].xml is not treated as an Office file."""
        payload = self._zip_bytes("hello.txt", "not an office doc")
        ext = _detect_extension_from_bytes(payload)
        # No [Content_Types].xml → .bin
        assert ext == ".bin"
|
||||
|
||||
|
||||
class TestH8_CsvDetection:
    """H8: CSV detection must require commas/tabs + valid UTF-8."""

    def test_binary_garbage_not_csv(self):
        """A delimiter plus a NUL byte must not be classified as CSV.

        The sample contains a comma so the NUL guard is what rejects it;
        without a delimiter the test would pass even if that guard were
        removed.
        """
        garbage = b"A,abcdef\x00binary\xff\xfedata"
        ext = _detect_extension_from_bytes(garbage)
        assert ext != ".csv"

    def test_invalid_utf8_with_delimiter_not_csv(self):
        """A delimiter with invalid UTF-8 (and no NUL) is not CSV either."""
        ext = _detect_extension_from_bytes(b"name,age\xff\xfe\n")
        assert ext != ".csv"

    def test_actual_csv_detected(self):
        """Plain comma-separated ASCII text is recognized as CSV."""
        csv = b"name,age,city\nAlice,30,NYC\nBob,25,LA\n"
        ext = _detect_extension_from_bytes(csv)
        assert ext == ".csv"
|
||||
|
||||
|
||||
class TestH2_ErrorPathScrubbing:
    """H2: OfficeFileError must never leak server-side upload paths."""

    def test_error_message_scrubs_temp_path(self):
        """The upload temp directory is replaced by a neutral placeholder.

        The leaked path is built from tempfile.gettempdir(), the same way the
        exception derives its prefix, so the test also holds when the temp
        directory is not /tmp (macOS, Windows, or a custom TMPDIR).
        """
        upload_dir = Path(tempfile.gettempdir()) / "mcp_office_uploads"
        leaked = f"{upload_dir / 'upload_deadbeef.docx'} is corrupt"
        e = OfficeFileError(leaked)
        msg = str(e)
        assert str(upload_dir) not in msg
        assert "<uploaded file>" in msg

    def test_clean_error_messages_unchanged(self):
        """Messages without an upload path pass through untouched."""
        msg = "Unsupported URL scheme: ftp"
        assert str(OfficeFileError(msg)) == msg
|
||||
|
||||
|
||||
class TestLocalFileBoundary:
    """Combined: the security boundary must hold across env config permutations."""

    @pytest.mark.asyncio
    async def test_local_file_blocked_in_http_mode(self):
        """With an HTTP transport and no override, local paths are refused."""
        http_env = {"MCP_TRANSPORT": "streamable-http", "MCP_ALLOW_LOCAL_FILES": ""}
        with patch.dict(os.environ, http_env, clear=False):
            # Drop the override entirely so only the transport decides.
            os.environ.pop("MCP_ALLOW_LOCAL_FILES", None)
            with pytest.raises(OfficeFileError, match="Local file access"):
                await resolve_office_file_path("/etc/passwd")

    @pytest.mark.asyncio
    async def test_local_file_allowed_in_stdio_mode(self, tmp_path):
        """With the stdio transport, an existing local path is returned as-is."""
        local_file = tmp_path / "test.txt"
        local_file.write_text("hello")
        expected = str(local_file)
        with patch.dict(os.environ, {"MCP_TRANSPORT": "stdio"}, clear=False):
            os.environ.pop("MCP_ALLOW_LOCAL_FILES", None)
            resolved = await resolve_office_file_path(expected)
            assert resolved == expected
|
||||
Loading…
x
Reference in New Issue
Block a user