diff --git a/.mcp.json b/.mcp.json
index 80bbce9..7001130 100644
--- a/.mcp.json
+++ b/.mcp.json
@@ -1,11 +1,3 @@
{
- "mcpServers": {
- "pdf-tools": {
- "command": "uv",
- "args": ["run", "mcp-pdf-tools"],
- "env": {
- "PDF_TEMP_DIR": "/tmp/mcp-pdf-processing"
- }
- }
- }
+ "mcpServers": {}
}
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
index 6ed42bf..a7d33d4 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -93,9 +93,10 @@ uv publish
4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata`
5. **Format Conversion**: `pdf_to_markdown` - Clean markdown with MCP resource URIs for images
6. **Image Processing**: `extract_images` - Extract images with custom output paths and clean summary output
-7. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
-8. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
-9. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
+7. **Link Extraction**: `extract_links` - Extract all hyperlinks with page filtering and type categorization
+8. **PDF Forms**: `extract_form_data`, `create_form_pdf`, `fill_form_pdf`, `add_form_fields` - Complete form lifecycle management
+9. **Document Assembly**: `merge_pdfs`, `split_pdf_by_pages`, `reorder_pdf_pages` - PDF manipulation and organization
+10. **Annotations & Markup**: `add_sticky_notes`, `add_highlights`, `add_stamps`, `add_video_notes`, `extract_all_annotations` - Collaboration and multimedia review tools
### MCP Client-Friendly Design
@@ -314,7 +315,7 @@ Based on comprehensive PDF usage patterns, here are potential high-impact featur
- `detect_pdf_quality_issues` - Scan for structural problems
### 📄 Priority 5: Advanced Content Extraction
-- `extract_pdf_links` - All URLs and internal links
+- ✅ `extract_links` - All URLs and internal links (IMPLEMENTED)
- `extract_pdf_fonts` - Font usage analysis
- `extract_pdf_colors` - Color palette extraction
- `extract_pdf_layers` - CAD/design layer information
diff --git a/LOCAL_DEVELOPMENT.md b/LOCAL_DEVELOPMENT.md
index ca1640f..82cc005 100644
--- a/LOCAL_DEVELOPMENT.md
+++ b/LOCAL_DEVELOPMENT.md
@@ -25,19 +25,34 @@ uv sync --dev
uv run python -c "from mcp_pdf.server import create_server; print('✅ MCP PDF loads successfully')"
```
-### 2. Test with Claude Code (Local Development)
+### 2. Add MCP Server to Claude Desktop
-Use the `-t local` flag to point Claude Code to your local development copy:
+#### For Production Use (PyPI Installation)
+
+Install the published version from PyPI:
```bash
-# Start Claude Code with local MCP PDF server
-claude-code -t local /path/to/mcp-pdf
+# For personal use across all projects
+claude mcp add -s user pdf-tools uvx mcp-pdf
+
+# For project-specific use (isolated to current directory)
+claude mcp add -s project pdf-tools uvx mcp-pdf
```
-Or if you're already in the mcp-pdf directory:
+#### For Local Development (Source Installation)
+
+When developing MCP PDF itself, use the local source:
```bash
-claude-code -t local .
+# For development from local source
+claude mcp add -s project pdf-tools-dev uv -- --directory /path/to/mcp-pdf run mcp-pdf
+```
+
+Or if you're in the mcp-pdf directory:
+
+```bash
+# Development server from current directory
+claude mcp add -s project pdf-tools-dev uv -- --directory . run mcp-pdf
```
### 3. Alternative: Manual Server Testing
diff --git a/README.md b/README.md
index 8085d50..e9c1716 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
**🚀 The Ultimate PDF Processing Intelligence Platform for AI**
-*Transform any PDF into structured, actionable intelligence with 23 specialized tools*
+*Transform any PDF into structured, actionable intelligence with 24 specialized tools*
[](https://www.python.org/downloads/)
[](https://github.com/jlowin/fastmcp)
@@ -31,7 +31,7 @@
### 🏆 **Why MCP PDF Leads**
-- **🚀 23 Specialized Tools** for every PDF scenario
+- **🚀 24 Specialized Tools** for every PDF scenario
- **🧠 AI-Powered Intelligence** beyond basic extraction
- **🔄 Multi-Library Fallbacks** for 99.9% reliability
- **⚡ 10x Faster** than traditional solutions
@@ -76,14 +76,31 @@ uv run mcp-pdf
🔧 Claude Desktop Integration (click to expand)
+### **📦 Production Installation (PyPI)**
+
+```bash
+# For personal use across all projects
+claude mcp add -s user pdf-tools uvx mcp-pdf
+
+# For project-specific use (isolated)
+claude mcp add -s project pdf-tools uvx mcp-pdf
+```
+
+### **🛠️ Development Installation (Source)**
+
+```bash
+# For local development from source
+claude mcp add -s project pdf-tools-dev uv -- --directory /path/to/mcp-pdf run mcp-pdf
+```
+
+### **⚙️ Manual Configuration**
Add to your `claude_desktop_config.json`:
```json
{
"mcpServers": {
"pdf-tools": {
- "command": "uv",
- "args": ["run", "mcp-pdf"],
- "cwd": "/path/to/mcp-pdf"
+ "command": "uvx",
+ "args": ["mcp-pdf"]
}
}
}
diff --git a/pyproject.toml b/pyproject.toml
index 0adaeb6..69a7de4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "mcp-pdf"
-version = "1.0.1"
+version = "1.1.0"
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
readme = "README.md"
diff --git a/src/mcp_pdf/server.py b/src/mcp_pdf/server.py
index 2eb3496..0708a83 100644
--- a/src/mcp_pdf/server.py
+++ b/src/mcp_pdf/server.py
@@ -6295,12 +6295,181 @@ def create_server():
"""Create and return the MCP server instance"""
return mcp
@mcp.tool(
    name="extract_links",
    description="Extract all links from PDF with comprehensive filtering and analysis options"
)
async def extract_links(
    pdf_path: str,
    pages: Optional[str] = None,
    include_internal: bool = True,
    include_external: bool = True,
    include_email: bool = True
) -> dict:
    """
    Extract all links from a PDF document with page filtering options.

    Args:
        pdf_path: Path to PDF file or HTTPS URL
        pages: Page numbers (e.g., "1,3,5" or "1-5,8,10-12"). If None, processes all pages
        include_internal: Include internal document links (default: True)
        include_external: Include external URL links (default: True)
        include_email: Include email links (default: True)

    Returns:
        Dictionary containing extracted links organized by type and page.
        All user-facing page numbers (pages_searched, pages_with_links,
        per-link "page") are 1-based. On failure, returns a dictionary
        with "error" and "extraction_time" keys instead of raising.
    """
    start_time = time.time()
    doc = None

    try:
        # Validate PDF path and security
        path = await validate_pdf_path(pdf_path)

        doc = fitz.open(path)
        total_pages = doc.page_count

        # Parse pages parameter (defaults to all pages, 0-based internally)
        if pages:
            try:
                pages_to_extract = parse_page_ranges(pages, total_pages)
            except ValueError as e:
                raise ValueError(f"Invalid page specification: {e}") from e
        else:
            pages_to_extract = list(range(total_pages))

        # Map each PyMuPDF link kind to (type label, destination extractor,
        # include flag). GOTO/NAMED count as internal; URI/GOTOR/LAUNCH as
        # external. Unknown kinds are skipped entirely.
        kind_dispatch = {
            fitz.LINK_URI: ("external_url", lambda l: l["uri"], include_external),
            fitz.LINK_GOTO: ("internal_page", lambda l: f"Page {l['page'] + 1}", include_internal),
            fitz.LINK_GOTOR: ("external_document", lambda l: l.get("file", "unknown"), include_external),
            fitz.LINK_LAUNCH: ("launch", lambda l: l.get("file", "unknown"), include_external),
            fitz.LINK_NAMED: ("named_action", lambda l: l.get("name", "unknown"), include_internal),
        }

        # Extract links from the requested pages
        all_links = []
        for page_num in pages_to_extract:
            for link in doc[page_num].get_links():
                dispatch = kind_dispatch.get(link["kind"])
                if dispatch is None:
                    continue
                link_type, get_destination, included = dispatch
                if not included:
                    continue
                rect = link["from"]
                all_links.append({
                    "page": page_num + 1,  # 1-based page numbering for users
                    "type": link_type,
                    "destination": get_destination(link),
                    "coordinates": {
                        "x0": round(rect.x0, 2),
                        "y0": round(rect.y0, 2),
                        "x1": round(rect.x1, 2),
                        "y1": round(rect.y1, 2),
                    },
                })

        # Derive pages_with_links from the links actually reported so the
        # summary respects the include_* filters (previously a page was
        # listed even when every one of its links had been filtered out).
        pages_with_links = sorted({link["page"] for link in all_links})

        # Organize links by type
        links_by_type = {
            "external_url": [],
            "internal_page": [],
            "external_document": [],
            "launch": [],
            "named_action": [],
            "email": [],  # mailto: URIs are reclassified from external_url below
        }
        for link in all_links:
            links_by_type[link["type"]].append(link)

        # PyMuPDF reports mailto: links as plain URIs; split them out as
        # "email" entries (copies — all_links keeps the original records).
        if include_email:
            remaining_external = []
            for link in links_by_type["external_url"]:
                destination = link["destination"]
                if destination and destination.startswith("mailto:"):
                    email_link = link.copy()
                    email_link["type"] = "email"
                    email_link["destination"] = destination.replace("mailto:", "")
                    links_by_type["email"].append(email_link)
                else:
                    remaining_external.append(link)
            links_by_type["external_url"] = remaining_external

        extraction_time = round(time.time() - start_time, 2)

        return {
            "file_info": {
                "path": str(path),
                "total_pages": total_pages,
                # 1-based, consistent with pages_with_links and per-link "page"
                "pages_searched": [p + 1 for p in pages_to_extract]
            },
            "extraction_summary": {
                "total_links_found": len(all_links),
                "pages_with_links": pages_with_links,
                "pages_searched_count": len(pages_to_extract),
                "link_types_found": [link_type for link_type, links in links_by_type.items() if links]
            },
            "links_by_type": links_by_type,
            "all_links": all_links,
            "extraction_settings": {
                "include_internal": include_internal,
                "include_external": include_external,
                "include_email": include_email,
                "pages_filter": pages or "all"
            },
            "extraction_time": extraction_time
        }

    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"Link extraction failed for {pdf_path}: {error_msg}")
        return {
            "error": f"Link extraction failed: {error_msg}",
            "extraction_time": round(time.time() - start_time, 2)
        }
    finally:
        # Always release the document, even when validation/parsing raised
        # (the original leaked the handle on every error path).
        if doc is not None:
            doc.close()
+
+
def main():
    """Run the MCP server - entry point for CLI.

    Synchronous wrapper used as the console-script entry point: starts the
    asyncio event loop and delegates to run_server(), which serves the MCP
    protocol over stdio.
    """
    asyncio.run(run_server())
async def run_server():
    """Run the MCP server over stdio.

    Logs a version banner to stderr before serving so it is visible even
    though the MCP protocol occupies stdout.
    """
    import sys

    try:
        # Read the installed distribution's version rather than hardcoding
        # it (the previous hardcoded "1.0.1" fallback was already stale
        # against pyproject.toml's 1.1.0).
        from importlib.metadata import PackageNotFoundError, version
        package_version = version("mcp-pdf")
    except PackageNotFoundError:
        # Running from a source tree without an installed distribution.
        package_version = "unknown"

    # stderr keeps the banner out of the stdout MCP protocol stream
    print(f"🎬 MCP PDF Tools v{package_version}", file=sys.stderr)
    await mcp.run_stdio_async()
if __name__ == "__main__":
|