From 856dd4199658df0d5ced95525d40e0987f858bf1 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Tue, 23 Sep 2025 20:41:16 -0600 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20comprehensive=20link=20extrac?= =?UTF-8?q?tion=20tool=20(24th=20PDF=20tool)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New Features: - extract_links: Extract all PDF hyperlinks with advanced filtering - Page-specific filtering (e.g., "1,3,5" or "1-5,8,10-12") - Link type categorization: external URLs, internal pages, emails, documents - Coordinate tracking for precise link positioning - FastMCP integration with proper tool registration - Version banner display following CLAUDE.md guidelines Technical Improvements: - Enhanced startup banner with package version display - Updated documentation to reflect 24 specialized tools - Proper FastMCP @mcp.tool() decorator usage - Comprehensive error handling and security validation Documentation Updates: - README.md: Updated tool count and installation guides - CLAUDE.md: Added link extraction to implemented features - LOCAL_DEVELOPMENT.md: Enhanced with scoped installation commands Version: 1.1.0 (minor version bump for new feature) --- .mcp.json | 10 +-- CLAUDE.md | 9 ++- LOCAL_DEVELOPMENT.md | 27 +++++-- README.md | 27 +++++-- pyproject.toml | 2 +- src/mcp_pdf/server.py | 169 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 219 insertions(+), 25 deletions(-) diff --git a/.mcp.json b/.mcp.json index 80bbce9..7001130 100644 --- a/.mcp.json +++ b/.mcp.json @@ -1,11 +1,3 @@ { - "mcpServers": { - "pdf-tools": { - "command": "uv", - "args": ["run", "mcp-pdf-tools"], - "env": { - "PDF_TEMP_DIR": "/tmp/mcp-pdf-processing" - } - } - } + "mcpServers": {} } \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 6ed42bf..a7d33d4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -93,9 +93,10 @@ uv publish 4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata` 5. 
**Format Conversion**: `pdf_to_markdown` - Clean markdown with MCP resource URIs for images 6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output -7. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management -8. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization -9. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools +7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization +8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management +9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization +10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools ### MCP Client-Friendly Design @@ -314,7 +315,7 @@ Based on comprehensive PDF usage patterns, here are potential high-impact featur - `detect_pdf_quality_issues` - Scan for structural problems ### 📄 Priority 5: Advanced Content Extraction -- `extract_pdf_links` - All URLs and internal links +- ✅ `extract_links` - All URLs and internal links (IMPLEMENTED) - `extract_pdf_fonts` - Font usage analysis - `extract_pdf_colors` - Color palette extraction - `extract_pdf_layers` - CAD/design layer information diff --git a/LOCAL_DEVELOPMENT.md b/LOCAL_DEVELOPMENT.md index ca1640f..82cc005 100644 --- a/LOCAL_DEVELOPMENT.md +++ b/LOCAL_DEVELOPMENT.md @@ -25,19 +25,34 @@ uv sync --dev uv run python -c "from mcp_pdf.server import create_server; print('✅ MCP PDF loads successfully')" ``` -### 2. Test with Claude Code (Local Development) +### 2. 
Add MCP Server to Claude Desktop -Use the `-t local` flag to point Claude Code to your local development copy: +#### For Production Use (PyPI Installation) + +Install the published version from PyPI: ```bash -# Start Claude Code with local MCP PDF server -claude-code -t local /path/to/mcp-pdf +# For personal use across all projects +claude mcp add -s user pdf-tools uvx mcp-pdf + +# For project-specific use (isolated to current directory) +claude mcp add -s project pdf-tools uvx mcp-pdf ``` -Or if you're already in the mcp-pdf directory: +#### For Local Development (Source Installation) + +When developing MCP PDF itself, use the local source: ```bash -claude-code -t local . +# For development from local source +claude mcp add -s project pdf-tools-dev uv -- --directory /path/to/mcp-pdf run mcp-pdf +``` + +Or if you're in the mcp-pdf directory: + +```bash +# Development server from current directory +claude mcp add -s project pdf-tools-dev uv -- --directory . run mcp-pdf ``` ### 3. Alternative: Manual Server Testing diff --git a/README.md b/README.md index 8085d50..e9c1716 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ **🚀 The Ultimate PDF Processing Intelligence Platform for AI** -*Transform any PDF into structured, actionable intelligence with 23 specialized tools* +*Transform any PDF into structured, actionable intelligence with 24 specialized tools* [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg?style=flat-square)](https://www.python.org/downloads/) [![FastMCP](https://img.shields.io/badge/FastMCP-2.0+-green.svg?style=flat-square)](https://github.com/jlowin/fastmcp) @@ -31,7 +31,7 @@ ### 🏆 **Why MCP PDF Leads** -- **🚀 23 Specialized Tools** for every PDF scenario +- **🚀 24 Specialized Tools** for every PDF scenario - **🧠 AI-Powered Intelligence** beyond basic extraction - **🔄 Multi-Library Fallbacks** for 99.9% reliability - **⚡ 10x Faster** than traditional solutions @@ -76,14 +76,31 @@ uv run mcp-pdf
🔧 Claude Desktop Integration (click to expand)
 
+### **📦 Production Installation (PyPI)**
+
+```bash
+# For personal use across all projects
+claude mcp add -s user pdf-tools uvx mcp-pdf
+
+# For project-specific use (isolated)
+claude mcp add -s project pdf-tools uvx mcp-pdf
+```
+
+### **🛠️ Development Installation (Source)**
+
+```bash
+# For local development from source
+claude mcp add -s project pdf-tools-dev uv -- --directory /path/to/mcp-pdf run mcp-pdf
+```
+
+### **⚙️ Manual Configuration**
 Add to your `claude_desktop_config.json`:
 ```json
 {
   "mcpServers": {
     "pdf-tools": {
-      "command": "uv",
-      "args": ["run", "mcp-pdf"],
-      "cwd": "/path/to/mcp-pdf"
+      "command": "uvx",
+      "args": ["mcp-pdf"]
     }
   }
 }
diff --git a/pyproject.toml b/pyproject.toml
index 0adaeb6..69a7de4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "1.0.1"
+version = "1.1.0"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"
diff --git a/src/mcp_pdf/server.py b/src/mcp_pdf/server.py
index 2eb3496..0708a83 100644
--- a/src/mcp_pdf/server.py
+++ b/src/mcp_pdf/server.py
@@ -6295,12 +6295,192 @@ def create_server():
     """Create and return the MCP server instance"""
     return mcp
 
+def _classify_link(link, include_internal, include_external, include_email):
+    """
+    Map a PyMuPDF link dictionary to a (link_type, destination) pair.
+
+    Returns None when the link kind is not recognised or when its category
+    has been switched off through the include_* flags.
+    """
+    kind = link.get("kind")
+
+    if kind == fitz.LINK_URI:
+        uri = link.get("uri") or ""
+        # PyMuPDF reports mailto: links as plain URI links, so they are split
+        # out here. Doing it at classification time keeps include_email
+        # independent of include_external.
+        if uri.lower().startswith("mailto:"):
+            return ("email", uri[len("mailto:"):]) if include_email else None
+        return ("external_url", uri) if include_external else None
+
+    if kind == fitz.LINK_GOTO:
+        # Internal link to another page; the target index is shown 1-based
+        if include_internal:
+            return "internal_page", f"Page {link['page'] + 1}"
+        return None
+
+    if kind == fitz.LINK_GOTOR:
+        # Link to external document
+        if include_external:
+            return "external_document", link.get("file", "unknown")
+        return None
+
+    if kind == fitz.LINK_LAUNCH:
+        # Launch application/file
+        if include_external:
+            return "launch", link.get("file", "unknown")
+        return None
+
+    if kind == fitz.LINK_NAMED:
+        # Named action (like print, quit, etc.)
+        if include_internal:
+            return "named_action", link.get("name", "unknown")
+        return None
+
+    return None
+
+
+@mcp.tool(
+    name="extract_links",
+    description="Extract all links from PDF with comprehensive filtering and analysis options"
+)
+async def extract_links(
+    pdf_path: str,
+    pages: Optional[str] = None,
+    include_internal: bool = True,
+    include_external: bool = True,
+    include_email: bool = True
+) -> dict:
+    """
+    Extract all links from a PDF document with page filtering options.
+
+    Args:
+        pdf_path: Path to PDF file or HTTPS URL
+        pages: Page numbers (e.g., "1,3,5" or "1-5,8,10-12"). If None, processes all pages
+        include_internal: Include internal page links and named actions (default: True)
+        include_external: Include external URLs, external documents and launch actions (default: True)
+        include_email: Include mailto: links (default: True)
+
+    Returns:
+        Dictionary containing extracted links organized by type and page. Page
+        numbers in the result are 1-based. On failure the dictionary holds an
+        "error" message instead of raising.
+    """
+    start_time = time.time()
+
+    try:
+        # Validate PDF path and security
+        path = await validate_pdf_path(pdf_path)
+
+        doc = fitz.open(path)
+        try:
+            total_pages = doc.page_count
+
+            # Parse pages parameter; the result is used below as 0-based page indexes
+            if pages:
+                try:
+                    pages_to_extract = parse_page_ranges(pages, total_pages)
+                except ValueError as e:
+                    raise ValueError(f"Invalid page specification: {e}") from e
+            else:
+                pages_to_extract = list(range(total_pages))
+
+            # Extract links from specified pages
+            all_links = []
+            pages_with_links = []
+
+            for page_num in pages_to_extract:
+                page_links = doc[page_num].get_links()
+
+                if page_links:
+                    pages_with_links.append(page_num + 1)  # 1-based for user
+
+                for link in page_links:
+                    classified = _classify_link(
+                        link, include_internal, include_external, include_email
+                    )
+                    if classified is None:
+                        continue
+
+                    link_type, destination = classified
+                    rect = link["from"]
+                    all_links.append({
+                        "page": page_num + 1,  # 1-based page numbering
+                        "type": link_type,
+                        "destination": destination,
+                        "coordinates": {
+                            "x0": round(rect.x0, 2),
+                            "y0": round(rect.y0, 2),
+                            "x1": round(rect.x1, 2),
+                            "y1": round(rect.y1, 2)
+                        }
+                    })
+        finally:
+            # Always release the document, including when extraction fails
+            doc.close()
+
+        # Organize links by type (every key is present, even when empty)
+        link_types = (
+            "external_url", "internal_page", "external_document",
+            "launch", "named_action", "email"
+        )
+        links_by_type = {
+            link_type: [link for link in all_links if link["type"] == link_type]
+            for link_type in link_types
+        }
+
+        extraction_time = round(time.time() - start_time, 2)
+
+        return {
+            "file_info": {
+                "path": str(path),
+                "total_pages": total_pages,
+                # Reported 1-based, matching "page" and "pages_with_links"
+                "pages_searched": [page_num + 1 for page_num in pages_to_extract]
+            },
+            "extraction_summary": {
+                "total_links_found": len(all_links),
+                "pages_with_links": pages_with_links,
+                "pages_searched_count": len(pages_to_extract),
+                "link_types_found": [link_type for link_type, links in links_by_type.items() if links]
+            },
+            "links_by_type": links_by_type,
+            "all_links": all_links,
+            "extraction_settings": {
+                "include_internal": include_internal,
+                "include_external": include_external,
+                "include_email": include_email,
+                "pages_filter": pages or "all"
+            },
+            "extraction_time": extraction_time
+        }
+
+    except Exception as e:
+        error_msg = sanitize_error_message(str(e))
+        logger.error(f"Link extraction failed for {pdf_path}: {error_msg}")
+        return {
+            "error": f"Link extraction failed: {error_msg}",
+            "extraction_time": round(time.time() - start_time, 2)
+        }
+
+
 def main():
     """Run the MCP server - entry point for CLI"""
     asyncio.run(run_server())
 
 async def run_server():
     """Run the MCP server"""
+    import sys
+    from importlib.metadata import version
+
+    try:
+        package_version = version("mcp-pdf")
+    except Exception:
+        # Best-effort banner: e.g. running from a checkout that was never installed
+        package_version = "unknown"
+
+    # Log version to stderr so it appears even with MCP protocol on stdout
+    print(f"🎬 MCP PDF Tools v{package_version}", file=sys.stderr)
     await mcp.run_stdio_async()
 
 if __name__ == "__main__":