diff --git a/CLAUDE.md b/CLAUDE.md index a7d33d4..867b277 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -134,11 +134,19 @@ Critical system dependencies: Environment variables (optional): - `TESSDATA_PREFIX`: Tesseract language data location - `PDF_TEMP_DIR`: Temporary file processing directory (defaults to `/tmp/mcp-pdf-processing`) +- `MCP_PDF_ALLOWED_PATHS`: Colon-separated list of allowed output directories (e.g., `/tmp:/home/user/documents:/var/output`) + - If unset: Allows writes to any directory with security warnings + - If set: Restricts file outputs to specified directories only + - **SECURITY NOTE**: This is "security theater" - real protection requires OS-level permissions and process isolation - `DEBUG`: Enable debug logging ### Security Features -The server implements comprehensive security hardening: +**🔒 "TRUST NO ONE" Security Philosophy** + +This server implements defense-in-depth, but remember: **application-level security is "theater" - real security comes from the operating system and deployment practices.** + +**Application-Level Protections (Security Theater):** **Input Validation:** - File size limits: 100MB for PDFs, 50MB for images @@ -167,6 +175,18 @@ The server implements comprehensive security hardening: - GitHub Actions workflow for continuous security monitoring - Daily automated vulnerability assessments +**⚡ REAL Security (What Actually Matters):** + +1. **Process Isolation**: Run as non-privileged user with minimal permissions +2. **OS-Level Controls**: Use chroot/containers/systemd to limit filesystem access +3. **Network Isolation**: Firewall rules, network namespaces, air-gapped environments +4. **Resource Limits**: ulimit, cgroups, memory/CPU quotas at the OS level +5. **File Permissions**: Proper Unix permissions (chmod/chown) on directories and files +6. **Monitoring**: System-level audit logs, not application logs +7. **Regular Updates**: Keep OS, libraries, and dependencies patched + +**Remember**: If an attacker has code execution, application-level restrictions are meaningless. Defense-in-depth starts with the operating system. + ## Development Notes ### Testing Strategy diff --git a/pyproject.toml b/pyproject.toml index 69a7de4..e326c9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mcp-pdf" -version = "1.1.0" +version = "1.1.1" description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more" authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}] readme = "README.md" diff --git a/src/mcp_pdf/server.py b/src/mcp_pdf/server.py index 0708a83..41a9d27 100644 --- a/src/mcp_pdf/server.py +++ b/src/mcp_pdf/server.py @@ -74,19 +74,41 @@ def validate_output_path(path: str) -> Path: """Validate and secure output paths to prevent directory traversal""" if not path: raise ValueError("Output path cannot be empty") - + # Convert to Path and resolve to absolute path resolved_path = Path(path).resolve() - + # Check for path traversal attempts if '../' in str(path) or '\\..\\' in str(path): raise ValueError("Path traversal detected in output path") - - # Ensure path is within safe directories - safe_prefixes = ['/tmp', '/var/tmp', str(CACHE_DIR.resolve())] - if not any(str(resolved_path).startswith(prefix) for prefix in safe_prefixes): - raise ValueError(f"Output path not allowed: {path}") - + + # Check allowed output paths from environment variable + allowed_paths = os.getenv('MCP_PDF_ALLOWED_PATHS') + + if allowed_paths is None: + # No restriction set - warn user but allow any path + logger.warning(f"MCP_PDF_ALLOWED_PATHS not set - allowing write to any directory: {resolved_path}") + logger.warning("SECURITY NOTE: This restriction is 'security theater' - real protection comes from OS-level permissions") + logger.warning("Recommended: Set MCP_PDF_ALLOWED_PATHS='/tmp:/var/tmp:/home/user/documents' AND use proper file permissions") + logger.warning("For true security: Run this server with limited user permissions, not as root/admin") + return resolved_path + + # Parse allowed paths (semicolon or colon separated for cross-platform compatibility) + separator = ';' if os.name == 'nt' else ':' + allowed_prefixes = [Path(p.strip()).resolve() for p in allowed_paths.split(separator) if p.strip()] + + # Check if resolved path is within any allowed directory + for allowed_prefix in allowed_prefixes: + try: + resolved_path.relative_to(allowed_prefix) + return resolved_path # Path is within allowed directory + except ValueError: + continue # Path is not within this allowed directory + + # Path not allowed + allowed_paths_str = separator.join(str(p) for p in allowed_prefixes) + raise ValueError(f"Output path not allowed: {resolved_path}. Allowed paths: {allowed_paths_str}") + return resolved_path def safe_json_parse(json_str: str, max_size: int = MAX_JSON_SIZE) -> dict: diff --git a/uv.lock b/uv.lock index 43023fe..6424c06 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -1032,7 +1032,7 @@ wheels = [ [[package]] name = "mcp-pdf" -version = "1.0.1" +version = "1.1.0" source = { editable = "." } dependencies = [ { name = "camelot-py", extra = ["cv"] },