mcp-pdf-tools/examples/url_examples.py
Ryan Malloy 58d43851b9 Add HTTPS URL support and fix MCP parameter validation
Features:
- HTTPS URL support: Process PDFs directly from URLs with intelligent caching
- Smart caching: 1-hour cache to avoid repeated downloads
- Content validation: Verify downloads are actually PDF files
- Security: Proper User-Agent headers, HTTPS preferred over HTTP
- MCP parameter fixes: Handle pages parameter as string "[2,3]" format
- Backward compatibility: Still supports local file paths and list parameters

Technical changes:
- Added download_pdf_from_url() with caching and validation
- Updated validate_pdf_path() to handle URLs and local paths
- Added parse_pages_parameter() for flexible parameter parsing
- Updated all 8 tools to accept string pages parameters
- Enhanced error handling for network and validation issues

All tools now support:
- Local paths: "/path/to/file.pdf"
- HTTPS URLs: "https://example.com/document.pdf"
- Flexible pages: "[2,3]", "1,2,3", or [1,2,3]

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-11 02:25:53 -06:00

104 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
Examples of using MCP PDF Tools with URLs
"""
import asyncio
import sys
import os
# Add the repository's src/ directory to the import path for development.
# The path is resolved relative to this file rather than the current working
# directory, so the examples can be launched from any directory.
sys.path.insert(
    0,
    os.path.normpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, 'src')
    ),
)
from mcp_pdf_tools.server import (
extract_text, extract_metadata, pdf_to_markdown,
extract_tables, is_scanned_pdf
)
async def example_text_extraction():
    """Extract text from a sample PDF URL and print a short summary.

    Any exception raised while extracting or reporting is caught and
    printed as a failure line instead of being propagated.
    """
    print("🔗 Extracting text from URL...")
    # Sample PDF hosted on w3.org
    pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    try:
        extracted = await extract_text(pdf_url)
        print("✅ Text extraction successful!")
        # Each field is looked up just before it is printed, so a missing
        # key still leaves the preceding lines on screen.
        method = extracted['method_used']
        print(f" Method used: {method}")
        page_count = extracted['metadata']['pages']
        print(f" Pages: {page_count}")
        char_count = len(extracted['text'])
        print(f" Extracted text length: {char_count} characters")
        preview = extracted['text'][:100]
        print(f" First 100 characters: {preview}...")
    except Exception as exc:
        print(f"❌ Failed: {exc}")
async def example_metadata_extraction():
    """Extract metadata from a sample PDF URL and print selected fields.

    Prints file size, page count, title and creation date. Any exception
    raised along the way is caught and printed as a failure line.
    """
    print("\n📋 Extracting metadata from URL...")
    pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    try:
        info = await extract_metadata(pdf_url)
        print("✅ Metadata extraction successful!")
        size_mb = info['file_info']['size_mb']
        print(f" File size: {size_mb:.2f} MB")
        pages = info['statistics']['page_count']
        print(f" Pages: {pages}")
        # Title and creation date fall back to placeholder text when absent.
        title = info['metadata'].get('title', 'No title')
        print(f" Title: {title}")
        created = info['metadata'].get('creation_date', 'Unknown')
        print(f" Creation date: {created}")
    except Exception as exc:
        print(f"❌ Failed: {exc}")
async def example_scanned_detection():
    """Check whether a sample PDF URL is a scanned document and print the verdict.

    Prints the scanned flag, the recommendation and the pages-checked value
    from the result. Any exception is caught and printed as a failure line.
    """
    print("\n🔍 Checking if PDF is scanned...")
    pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    try:
        verdict = await is_scanned_pdf(pdf_url)
        print("✅ Scanned detection successful!")
        scanned = verdict['is_scanned']
        print(f" Is scanned: {scanned}")
        advice = verdict['recommendation']
        print(f" Recommendation: {advice}")
        checked = verdict['sample_pages_checked']
        print(f" Pages checked: {checked}")
    except Exception as exc:
        print(f"❌ Failed: {exc}")
async def example_markdown_conversion():
    """Convert a sample PDF URL to markdown and print a short preview.

    Prints the number of pages converted, the markdown length and its first
    200 characters. Any exception is caught and printed as a failure line.
    """
    print("\n📝 Converting PDF to markdown...")
    pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    try:
        converted = await pdf_to_markdown(pdf_url)
        print("✅ Markdown conversion successful!")
        pages_done = converted['pages_converted']
        print(f" Pages converted: {pages_done}")
        md_length = len(converted['markdown'])
        print(f" Markdown length: {md_length} characters")
        print(" First 200 characters:")
        preview = converted['markdown'][:200]
        print(f" {preview}...")
    except Exception as exc:
        print(f"❌ Failed: {exc}")
async def main():
    """Run every URL example in sequence, then print usage tips."""
    print("🌐 MCP PDF Tools - URL Examples")
    print("=" * 50)

    # The examples are awaited one after another, in this order.
    examples = (
        example_text_extraction,
        example_metadata_extraction,
        example_scanned_detection,
        example_markdown_conversion,
    )
    for run_example in examples:
        await run_example()

    print("\n✨ URL examples completed!")
    print("\n💡 Tips:")
    tips = (
        " • URLs are cached for 1 hour to avoid repeated downloads",
        " • Use HTTPS URLs for security",
        " • The server validates content is actually a PDF file",
        " • All tools support the same URL format",
    )
    for tip in tips:
        print(tip)
# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())