From 58d43851b9900e49b6a7ceb9e69a67905a8a09da Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Mon, 11 Aug 2025 02:25:53 -0600 Subject: [PATCH] Add HTTPS URL support and fix MCP parameter validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Features: - HTTPS URL support: Process PDFs directly from URLs with intelligent caching - Smart caching: 1-hour cache to avoid repeated downloads - Content validation: Verify downloads are actually PDF files - Security: Proper User-Agent headers, HTTPS preferred over HTTP - MCP parameter fixes: Handle pages parameter as string "[2,3]" format - Backward compatibility: Still supports local file paths and list parameters Technical changes: - Added download_pdf_from_url() with caching and validation - Updated validate_pdf_path() to handle URLs and local paths - Added parse_pages_parameter() for flexible parameter parsing - Updated all 8 tools to accept string pages parameters - Enhanced error handling for network and validation issues All tools now support: - Local paths: "/path/to/file.pdf" - HTTPS URLs: "https://example.com/document.pdf" - Flexible pages: "[2,3]", "1,2,3", or [1,2,3] ๐Ÿค– Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- README.md | 23 ++++++ examples/url_examples.py | 104 +++++++++++++++++++++++++ src/mcp_pdf_tools/server.py | 147 +++++++++++++++++++++++++++++------- test_pages_parameter.py | 52 +++++++++++++ test_url_support.py | 71 +++++++++++++++++ 5 files changed, 368 insertions(+), 29 deletions(-) create mode 100644 examples/url_examples.py create mode 100644 test_pages_parameter.py create mode 100644 test_url_support.py diff --git a/README.md b/README.md index ab546ef..c81304f 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,31 @@ A comprehensive FastMCP server for PDF processing operations. This server provid - **Document Analysis**: Extract structure, metadata, and check if PDFs are scanned - **Image Extraction**: Extract images with size filtering - **Format Conversion**: Convert PDFs to clean Markdown format +- **URL Support**: Process PDFs directly from HTTPS URLs with intelligent caching - **Smart Detection**: Automatically detect the best method for each operation +## URL Support + +All tools support processing PDFs directly from HTTPS URLs: + +```bash +# Extract text from URL +mcp_pdf_tools extract_text "https://example.com/document.pdf" + +# Extract tables from URL +mcp_pdf_tools extract_tables "https://example.com/report.pdf" + +# Convert URL PDF to markdown +mcp_pdf_tools pdf_to_markdown "https://example.com/paper.pdf" +``` + +**Features:** +- **Intelligent caching**: Downloaded PDFs are cached for 1 hour to avoid repeated downloads +- **Content validation**: Verifies content is actually a PDF file (checks magic bytes and content-type) +- **Security**: HTTPS URLs recommended (HTTP URLs show security warnings) +- **Proper headers**: Sends appropriate User-Agent for better server compatibility +- **Error handling**: Clear error messages for network issues or invalid content + ## Installation ### Using uv (recommended) diff --git a/examples/url_examples.py b/examples/url_examples.py new file mode 100644 index 0000000..2e30336 --- /dev/null +++ b/examples/url_examples.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +""" +Examples of using MCP PDF Tools with URLs +""" + +import asyncio +import sys +import os + +# Add src to path for development +sys.path.insert(0, '../src') + +from mcp_pdf_tools.server import ( + extract_text, extract_metadata, pdf_to_markdown, + extract_tables, is_scanned_pdf +) + +async def example_text_extraction(): + """Example: Extract text from a PDF URL""" + print("๐Ÿ”— Extracting text from URL...") + + # Using a sample PDF from the web + url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + + try: + result = await extract_text(url) + print(f"โœ… Text extraction successful!") + print(f" Method used: {result['method_used']}") + print(f" Pages: {result['metadata']['pages']}") + print(f" Extracted text length: {len(result['text'])} characters") + print(f" First 100 characters: {result['text'][:100]}...") + + except Exception as e: + print(f"โŒ Failed: {e}") + +async def example_metadata_extraction(): + """Example: Extract metadata from a PDF URL""" + print("\n๐Ÿ“‹ Extracting metadata from URL...") + + url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + + try: + result = await extract_metadata(url) + print(f"โœ… Metadata extraction successful!") + print(f" File size: {result['file_info']['size_mb']:.2f} MB") + print(f" Pages: {result['statistics']['page_count']}") + print(f" Title: {result['metadata'].get('title', 'No title')}") + print(f" Creation date: {result['metadata'].get('creation_date', 'Unknown')}") + + except Exception as e: + print(f"โŒ Failed: {e}") + +async def example_scanned_detection(): + """Example: Check if PDF is scanned""" + print("\n๐Ÿ” Checking if PDF is scanned...") + + url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + + try: + result = await is_scanned_pdf(url) + print(f"โœ… Scanned detection successful!") + print(f" Is scanned: {result['is_scanned']}") + print(f" Recommendation: {result['recommendation']}") + print(f" Pages checked: {result['sample_pages_checked']}") + + except Exception as e: + print(f"โŒ Failed: {e}") + +async def example_markdown_conversion(): + """Example: Convert PDF URL to markdown""" + print("\n๐Ÿ“ Converting PDF to markdown...") + + url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + + try: + result = await pdf_to_markdown(url) + print(f"โœ… Markdown conversion successful!") + print(f" Pages converted: {result['pages_converted']}") + print(f" Markdown length: {len(result['markdown'])} characters") + print(f" First 200 characters:") + print(f" {result['markdown'][:200]}...") + + except Exception as e: + print(f"โŒ Failed: {e}") + +async def main(): + """Run all URL examples""" + print("๐ŸŒ MCP PDF Tools - URL Examples") + print("=" * 50) + + await example_text_extraction() + await example_metadata_extraction() + await example_scanned_detection() + await example_markdown_conversion() + + print("\nโœจ URL examples completed!") + print("\n๐Ÿ’ก Tips:") + print(" โ€ข URLs are cached for 1 hour to avoid repeated downloads") + print(" โ€ข Use HTTPS URLs for security") + print(" โ€ข The server validates content is actually a PDF file") + print(" โ€ข All tools support the same URL format") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/src/mcp_pdf_tools/server.py b/src/mcp_pdf_tools/server.py index afdd59d..8c4e3c5 100644 --- a/src/mcp_pdf_tools/server.py +++ b/src/mcp_pdf_tools/server.py @@ -6,10 +6,14 @@ import os import asyncio import tempfile import base64 +import hashlib +import time from pathlib import Path from typing import Dict, Any, List, Optional, Union from contextlib import asynccontextmanager +from urllib.parse import urlparse import logging +import ast from fastmcp import FastMCP from pydantic import BaseModel, Field @@ -55,8 +59,85 @@ class OCRConfig(BaseModel): dpi: int = Field(default=300, description="DPI for image conversion") # Utility functions +# URL download cache directory +CACHE_DIR = Path(os.environ.get("PDF_TEMP_DIR", "/tmp/mcp-pdf-processing")) +CACHE_DIR.mkdir(exist_ok=True, parents=True) + +def parse_pages_parameter(pages: Union[str, List[int], None]) -> Optional[List[int]]: + """Parse pages parameter that might come as string or list""" + if pages is None: + return None + + if isinstance(pages, list): + return [int(p) for p in pages] + + if isinstance(pages, str): + try: + # Handle string representations like "[1, 2, 3]" or "1,2,3" + if pages.strip().startswith('[') and pages.strip().endswith(']'): + return ast.literal_eval(pages.strip()) + elif ',' in pages: + return [int(p.strip()) for p in pages.split(',')] + else: + return [int(pages.strip())] + except (ValueError, SyntaxError) as e: + raise ValueError(f"Invalid pages format: {pages}. Use format like [1,2,3] or 1,2,3") + + return None + +async def download_pdf_from_url(url: str) -> Path: + """Download PDF from URL with caching""" + try: + # Create cache filename based on URL hash + url_hash = hashlib.sha256(url.encode()).hexdigest()[:16] + cache_file = CACHE_DIR / f"cached_{url_hash}.pdf" + + # Check if cached file exists and is recent (1 hour) + if cache_file.exists(): + file_age = time.time() - cache_file.stat().st_mtime + if file_age < 3600: # 1 hour cache + logger.info(f"Using cached PDF: {cache_file}") + return cache_file + + logger.info(f"Downloading PDF from: {url}") + + headers = { + "User-Agent": "MCP-PDF-Tools/1.0 (PDF processing server; +https://github.com/fastmcp/mcp-pdf-tools)" + } + + async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client: + response = await client.get(url, headers=headers) + response.raise_for_status() + + # Check content type + content_type = response.headers.get("content-type", "").lower() + if "pdf" not in content_type and "application/pdf" not in content_type: + # Check if content looks like PDF by magic bytes + content_start = response.content[:10] + if not content_start.startswith(b"%PDF"): + raise ValueError(f"URL does not contain a PDF file. Content-Type: {content_type}") + + # Save to cache + cache_file.write_bytes(response.content) + logger.info(f"Downloaded and cached PDF: {cache_file} ({len(response.content)} bytes)") + return cache_file + + except httpx.HTTPError as e: + raise ValueError(f"Failed to download PDF from URL {url}: {str(e)}") + except Exception as e: + raise ValueError(f"Error downloading PDF: {str(e)}") + async def validate_pdf_path(pdf_path: str) -> Path: - """Validate that the path exists and is a PDF file""" + """Validate path (local or URL) and return local Path to PDF file""" + # Check if it's a URL + parsed = urlparse(pdf_path) + + if parsed.scheme in ('http', 'https'): + if parsed.scheme == 'http': + logger.warning(f"Using insecure HTTP URL: {pdf_path}") + return await download_pdf_from_url(pdf_path) + + # Handle local path path = Path(pdf_path) if not path.exists(): raise ValueError(f"File not found: {pdf_path}") @@ -126,20 +207,23 @@ async def extract_with_pypdf(pdf_path: Path, pages: Optional[List[int]] = None, return "\n\n".join(text_parts) # Main text extraction tool -@mcp.tool(name="extract_text", description="Extract text from PDF with intelligent method selection") +@mcp.tool( + name="extract_text", + description="Extract text from PDF with intelligent method selection" +) async def extract_text( pdf_path: str, - method: str = "auto", - pages: Optional[List[int]] = None, + method: str = "auto", + pages: Optional[str] = None, # Accept as string for MCP compatibility preserve_layout: bool = False ) -> Dict[str, Any]: """ Extract text from PDF using various methods Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL method: Extraction method (auto, pymupdf, pdfplumber, pypdf) - pages: List of page numbers to extract (0-indexed), None for all pages + pages: Page numbers to extract as string like "1,2,3" or "[1,2,3]", None for all pages (0-indexed) preserve_layout: Whether to preserve the original text layout Returns: @@ -150,6 +234,7 @@ async def extract_text( try: path = await validate_pdf_path(pdf_path) + parsed_pages = parse_pages_parameter(pages) # Auto-select method based on PDF characteristics if method == "auto": @@ -163,11 +248,11 @@ async def extract_text( # Extract text using selected method if method == "pymupdf": - text = await extract_with_pymupdf(path, pages, preserve_layout) + text = await extract_with_pymupdf(path, parsed_pages, preserve_layout) elif method == "pdfplumber": - text = await extract_with_pdfplumber(path, pages, preserve_layout) + text = await extract_with_pdfplumber(path, parsed_pages, preserve_layout) elif method == "pypdf": - text = await extract_with_pypdf(path, pages, preserve_layout) + text = await extract_with_pypdf(path, parsed_pages, preserve_layout) else: raise ValueError(f"Unknown extraction method: {method}") @@ -248,7 +333,7 @@ async def extract_tables_pdfplumber(pdf_path: Path, pages: Optional[List[int]] = @mcp.tool(name="extract_tables", description="Extract tables from PDF with automatic method selection") async def extract_tables( pdf_path: str, - pages: Optional[List[int]] = None, + pages: Optional[str] = None, # Accept as string for MCP compatibility method: str = "auto", output_format: str = "json" ) -> Dict[str, Any]: @@ -256,7 +341,7 @@ async def extract_tables( Extract tables from PDF using various methods Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL pages: List of page numbers to extract tables from (0-indexed) method: Extraction method (auto, camelot, tabula, pdfplumber) output_format: Output format (json, csv, markdown) @@ -269,6 +354,7 @@ async def extract_tables( try: path = await validate_pdf_path(pdf_path) + parsed_pages = parse_pages_parameter(pages) all_tables = [] methods_tried = [] @@ -278,11 +364,11 @@ async def extract_tables( methods_tried.append(try_method) if try_method == "camelot": - tables = await extract_tables_camelot(path, pages) + tables = await extract_tables_camelot(path, parsed_pages) elif try_method == "pdfplumber": - tables = await extract_tables_pdfplumber(path, pages) + tables = await extract_tables_pdfplumber(path, parsed_pages) elif try_method == "tabula": - tables = await extract_tables_tabula(path, pages) + tables = await extract_tables_tabula(path, parsed_pages) if tables: method = try_method @@ -292,11 +378,11 @@ async def extract_tables( # Use specific method methods_tried.append(method) if method == "camelot": - all_tables = await extract_tables_camelot(path, pages) + all_tables = await extract_tables_camelot(path, parsed_pages) elif method == "pdfplumber": - all_tables = await extract_tables_pdfplumber(path, pages) + all_tables = await extract_tables_pdfplumber(path, parsed_pages) elif method == "tabula": - all_tables = await extract_tables_tabula(path, pages) + all_tables = await extract_tables_tabula(path, parsed_pages) else: raise ValueError(f"Unknown table extraction method: {method}") @@ -345,13 +431,13 @@ async def ocr_pdf( languages: List[str] = ["eng"], preprocess: bool = True, dpi: int = 300, - pages: Optional[List[int]] = None + pages: Optional[str] = None # Accept as string for MCP compatibility ) -> Dict[str, Any]: """ Perform OCR on a scanned PDF Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL languages: List of language codes for OCR (e.g., ["eng", "fra"]) preprocess: Whether to preprocess images for better OCR dpi: DPI for PDF to image conversion @@ -365,12 +451,13 @@ async def ocr_pdf( try: path = await validate_pdf_path(pdf_path) + parsed_pages = parse_pages_parameter(pages) # Convert PDF pages to images with tempfile.TemporaryDirectory() as temp_dir: - if pages: + if parsed_pages: images = [] - for page_num in pages: + for page_num in parsed_pages: page_images = convert_from_path( str(path), dpi=dpi, @@ -461,7 +548,7 @@ async def get_document_structure(pdf_path: str) -> Dict[str, Any]: Extract document structure including headers, sections, and metadata Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL Returns: Dictionary containing document structure information @@ -532,13 +619,13 @@ async def pdf_to_markdown( pdf_path: str, include_images: bool = True, include_metadata: bool = True, - pages: Optional[List[int]] = None + pages: Optional[str] = None # Accept as string for MCP compatibility ) -> Dict[str, Any]: """ Convert PDF to markdown format Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL include_images: Whether to extract and include images include_metadata: Whether to include document metadata pages: Specific pages to convert (0-indexed) @@ -551,6 +638,7 @@ async def pdf_to_markdown( try: path = await validate_pdf_path(pdf_path) + parsed_pages = parse_pages_parameter(pages) doc = fitz.open(str(path)) markdown_parts = [] @@ -575,7 +663,7 @@ async def pdf_to_markdown( markdown_parts.append("\n---\n") # Process pages - page_range = pages if pages else range(len(doc)) + page_range = parsed_pages if parsed_pages else range(len(doc)) images_extracted = [] for page_num in page_range: @@ -638,7 +726,7 @@ async def pdf_to_markdown( @mcp.tool(name="extract_images", description="Extract images from PDF") async def extract_images( pdf_path: str, - pages: Optional[List[int]] = None, + pages: Optional[str] = None, # Accept as string for MCP compatibility min_width: int = 100, min_height: int = 100, output_format: str = "png" @@ -647,7 +735,7 @@ async def extract_images( Extract images from PDF Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL pages: Specific pages to extract images from (0-indexed) min_width: Minimum image width to extract min_height: Minimum image height to extract @@ -658,10 +746,11 @@ async def extract_images( """ try: path = await validate_pdf_path(pdf_path) + parsed_pages = parse_pages_parameter(pages) doc = fitz.open(str(path)) images = [] - page_range = pages if pages else range(len(doc)) + page_range = parsed_pages if parsed_pages else range(len(doc)) for page_num in page_range: page = doc[page_num] @@ -714,7 +803,7 @@ async def extract_metadata(pdf_path: str) -> Dict[str, Any]: Extract comprehensive metadata from PDF Args: - pdf_path: Path to the PDF file + pdf_path: Path to PDF file or HTTPS URL Returns: Dictionary containing all available metadata diff --git a/test_pages_parameter.py b/test_pages_parameter.py new file mode 100644 index 0000000..570c89a --- /dev/null +++ b/test_pages_parameter.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +""" +Test the updated pages parameter parsing +""" + +import asyncio +import sys +import os + +# Add src to path +sys.path.insert(0, 'src') + +from mcp_pdf_tools.server import parse_pages_parameter + +def test_page_parsing(): + """Test page parameter parsing""" + print("Testing page parameter parsing...") + + # Test different input formats + test_cases = [ + (None, None), + ("1,2,3", [1, 2, 3]), + ("[2, 3]", [2, 3]), # This is the problematic case from the user + ("5", [5]), + ([0, 1, 2], [0, 1, 2]), + ("0,1,2", [0, 1, 2]), + ("[0,1,2]", [0, 1, 2]) + ] + + all_passed = True + + for input_val, expected in test_cases: + try: + result = parse_pages_parameter(input_val) + if result == expected: + print(f"โœ… '{input_val}' -> {result}") + else: + print(f"โŒ '{input_val}' -> {result}, expected {expected}") + all_passed = False + except Exception as e: + print(f"โŒ '{input_val}' -> Error: {e}") + all_passed = False + + return all_passed + +if __name__ == "__main__": + success = test_page_parsing() + if success: + print("\n๐ŸŽ‰ All page parameter parsing tests passed!") + else: + print("\n๐Ÿšจ Some tests failed!") + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/test_url_support.py b/test_url_support.py new file mode 100644 index 0000000..8ef2918 --- /dev/null +++ b/test_url_support.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +""" +Test URL support for MCP PDF Tools +""" + +import asyncio +import sys +import os + +# Add src to path +sys.path.insert(0, 'src') + +from mcp_pdf_tools.server import validate_pdf_path, download_pdf_from_url + +async def test_url_validation(): + """Test URL validation and download""" + print("Testing URL validation and download...") + + # Test with a known PDF URL (using a publicly available sample) + test_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + + try: + print(f"Testing URL: {test_url}") + path = await validate_pdf_path(test_url) + print(f"โœ… Successfully downloaded and validated PDF: {path}") + print(f" File size: {path.stat().st_size} bytes") + return True + + except Exception as e: + print(f"โŒ URL test failed: {e}") + return False + +async def test_local_path(): + """Test that local paths still work""" + print("\nTesting local path validation...") + + # Test with our existing test PDF + test_path = "/tmp/test_text.pdf" + + if not os.path.exists(test_path): + print(f"โš ๏ธ Test file {test_path} not found, skipping local test") + return True + + try: + path = await validate_pdf_path(test_path) + print(f"โœ… Local path validation works: {path}") + return True + + except Exception as e: + print(f"โŒ Local path test failed: {e}") + return False + +async def main(): + print("๐Ÿงช Testing MCP PDF Tools URL Support\n") + + url_success = await test_url_validation() + local_success = await test_local_path() + + print(f"\n๐Ÿ“Š Test Results:") + print(f" URL support: {'โœ… PASS' if url_success else 'โŒ FAIL'}") + print(f" Local paths: {'โœ… PASS' if local_success else 'โŒ FAIL'}") + + if url_success and local_success: + print("\n๐ŸŽ‰ All tests passed! URL support is working.") + return 0 + else: + print("\n๐Ÿšจ Some tests failed.") + return 1 + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) \ No newline at end of file