Major architectural improvements and bug fixes in the v2.0.x series:
## v2.0.5 - Page Range Parsing (Current Release)
- Fix page range parsing bug affecting 6 mixins (e.g., "93-95" or "11-30")
- Create shared parse_pages_parameter() utility function
- Support mixed formats: "1,3-5,7,10-15"
- Update: pdf_utilities, content_analysis, image_processing, misc_tools, table_extraction, text_extraction
## v2.0.4 - Chunk Hint Fix
- Fix next_chunk_hint to show correct page ranges
- Dynamic calculation based on actual pages being extracted
- Example: "30-50" now correctly shows "40-49" for next chunk
## v2.0.3 - Initial Range Support
- Add page range support to text extraction ("11-30")
- Fix _parse_pages_parameter to handle ranges with Python's range()
- Convert 1-based user input to 0-based internal indexing
## v2.0.2 - Lazy Import Fix
- Fix ModuleNotFoundError for reportlab on startup
- Implement lazy imports for optional dependencies
- Graceful degradation with helpful error messages
## v2.0.1 - Dependency Restructuring
- Move reportlab to optional [forms] extra
- Document installation: uvx --with mcp-pdf[forms] mcp-pdf
## v2.0.0 - Official FastMCP Pattern Migration
- Migrate to official fastmcp.contrib.mcp_mixin pattern
- Create 12 specialized mixins with 42 tools total
- Architecture: mixins_official/ using MCPMixin base class
- Backwards compatibility: server_legacy.py preserved
Technical Improvements:
- Centralized utility functions (DRY principle)
- Consistent behavior across all PDF tools
- Better error messages with actionable instructions
- Library-specific adapters for table extraction
Files Changed:
- New: src/mcp_pdf/mixins_official/utils.py (shared utilities)
- Updated: 6 mixins with improved page parsing
- Version: pyproject.toml, server.py → 2.0.5
PyPI: https://pypi.org/project/mcp-pdf/2.0.5/
584 lines
21 KiB
Python
584 lines
21 KiB
Python
"""
|
|
PDF Utilities Mixin - Additional PDF processing tools
|
|
Uses official fastmcp.contrib.mcp_mixin pattern
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, List
|
|
import logging
|
|
|
|
# PDF processing libraries
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
import io
|
|
|
|
# Official FastMCP mixin
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
|
|
from ..security import validate_pdf_path, validate_output_path, sanitize_error_message
|
|
from .utils import parse_pages_parameter
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PDFUtilitiesMixin(MCPMixin):
|
|
"""
|
|
Handles additional PDF utility operations including comparison, optimization, and repair.
|
|
Uses the official FastMCP mixin pattern.
|
|
"""
|
|
|
|
def __init__(self):
    """Initialise the mixin base class and record the PDF size limit."""
    super().__init__()

    # Size ceiling of 100MB, stored in bytes. Nothing in the visible part
    # of this class reads it back.
    bytes_per_megabyte = 1024 * 1024
    self.max_file_size = 100 * bytes_per_megabyte
|
|
|
|
@mcp_tool(
    name="compare_pdfs",
    description="Compare two PDFs for differences in text, structure, and metadata"
)
async def compare_pdfs(
    self,
    pdf_path1: str,
    pdf_path2: str,
    comparison_type: str = "all"
) -> Dict[str, Any]:
    """
    Compare two PDF files for differences.

    Page counts and file sizes are always compared. Depending on
    ``comparison_type`` the text of the leading pages, the document
    metadata and/or the table of contents are compared as well.

    Args:
        pdf_path1: Path to first PDF file
        pdf_path2: Path to second PDF file
        comparison_type: Type of comparison ("text", "structure", "metadata", "all")

    Returns:
        Dictionary containing comparison results. On failure the dictionary
        has ``success`` set to False and an ``error`` message; this method
        does not raise.
    """
    start_time = time.time()

    # Reject unknown comparison types up front; otherwise no comparison
    # would run and the call would report success with a 0.0 similarity.
    valid_types = ("text", "structure", "metadata", "all")
    if comparison_type not in valid_types:
        return {
            "success": False,
            "error": f"Invalid comparison_type '{comparison_type}'. Must be one of: {', '.join(valid_types)}",
            "comparison_time": round(time.time() - start_time, 2)
        }

    try:
        # Validate both PDF paths
        path1 = await validate_pdf_path(pdf_path1)
        path2 = await validate_pdf_path(pdf_path2)

        comparison_results = {}

        # The context managers close both documents even when a later step
        # raises (including the second open failing after the first worked).
        with fitz.open(str(path1)) as doc1, fitz.open(str(path2)) as doc2:
            pages1 = len(doc1)
            pages2 = len(doc2)
            size1 = path1.stat().st_size
            size2 = path2.stat().st_size

            # Basic document info comparison
            basic_comparison = {
                "pages": {"doc1": pages1, "doc2": pages2, "equal": pages1 == pages2},
                "file_sizes": {
                    "doc1_bytes": size1,
                    "doc2_bytes": size2,
                    "size_diff_bytes": abs(size1 - size2)
                }
            }

            # Text comparison
            if comparison_type in ("text", "all"):
                # Only the pages both documents have are read, capped at 10
                # for performance.
                max_pages = min(pages1, pages2, 10)
                text1 = "".join(doc1[i].get_text() + "\n" for i in range(max_pages))
                text2 = "".join(doc2[i].get_text() + "\n" for i in range(max_pages))

                comparison_results["text_comparison"] = {
                    "texts_equal": text1.strip() == text2.strip(),
                    "similarity_score": self._calculate_text_similarity(text1, text2),
                    "text1_chars": len(text1),
                    "text2_chars": len(text2),
                    "char_difference": abs(len(text1) - len(text2))
                }

            # Metadata comparison
            if comparison_type in ("metadata", "all"):
                # Fall back to an empty mapping in case a document exposes
                # no metadata object at all.
                meta1 = doc1.metadata or {}
                meta2 = doc2.metadata or {}

                metadata_differences = {}
                # Sorted so the reported differences have a stable order.
                for key in sorted(set(meta1) | set(meta2)):
                    val1 = meta1.get(key, "")
                    val2 = meta2.get(key, "")
                    if val1 != val2:
                        metadata_differences[key] = {"doc1": val1, "doc2": val2}

                comparison_results["metadata_comparison"] = {
                    "metadata_equal": not metadata_differences,
                    "differences": metadata_differences,
                    "total_differences": len(metadata_differences)
                }

            # Structure comparison (table of contents / bookmarks)
            if comparison_type in ("structure", "all"):
                toc1 = doc1.get_toc()
                toc2 = doc2.get_toc()

                comparison_results["structure_comparison"] = {
                    "bookmarks_equal": toc1 == toc2,
                    "toc1_count": len(toc1),
                    "toc2_count": len(toc2),
                    "bookmark_difference": abs(len(toc1) - len(toc2))
                }

        # Overall similarity assessment: the mean of one score per
        # comparison that actually ran.
        similarities = []
        if "text_comparison" in comparison_results:
            similarities.append(comparison_results["text_comparison"]["similarity_score"])
        if "metadata_comparison" in comparison_results:
            similarities.append(1.0 if comparison_results["metadata_comparison"]["metadata_equal"] else 0.0)
        if "structure_comparison" in comparison_results:
            similarities.append(1.0 if comparison_results["structure_comparison"]["bookmarks_equal"] else 0.0)

        overall_similarity = sum(similarities) / len(similarities) if similarities else 0.0

        return {
            "success": True,
            "comparison_summary": {
                "overall_similarity": round(overall_similarity, 2),
                "comparison_type": comparison_type,
                # NOTE(review): reflects only the comparisons performed
                # above; the text check covers at most the first 10 shared
                # pages and page counts are not taken into account.
                "documents_identical": overall_similarity == 1.0
            },
            "basic_comparison": basic_comparison,
            **comparison_results,
            "file_info": {
                "file1": str(path1),
                "file2": str(path2)
            },
            "comparison_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"PDF comparison failed: {error_msg}")
        return {
            "success": False,
            "error": error_msg,
            "comparison_time": round(time.time() - start_time, 2)
        }
|
|
|
|
@mcp_tool(
    name="optimize_pdf",
    description="Optimize PDF file size and performance"
)
async def optimize_pdf(
    self,
    pdf_path: str,
    optimization_level: str = "balanced",
    preserve_quality: bool = True
) -> Dict[str, Any]:
    """
    Optimize PDF file for smaller size and better performance.

    The optimized copy is written next to the input file as
    ``<stem>_optimized.pdf``; the input file itself is not modified.

    Args:
        pdf_path: Path to PDF file to optimize
        optimization_level: Level of optimization ("light", "balanced", "aggressive")
        preserve_quality: Whether to preserve visual quality. Accepted for
            interface compatibility; the current implementation does not
            consult it.

    Returns:
        Dictionary containing optimization results. On failure the
        dictionary has ``success`` set to False and an ``error`` message;
        this method does not raise.
    """
    start_time = time.time()

    # Save options per optimization level.
    save_options = {
        # Light optimization: remove unused objects
        "light": {"garbage": 3, "deflate": True},
        # Balanced optimization: compression + cleanup
        "balanced": {"garbage": 3, "deflate": True, "clean": True},
        # Aggressive optimization: maximum compression
        "aggressive": {"garbage": 4, "deflate": True, "clean": True, "ascii": False},
    }

    # An unknown level would otherwise skip the save entirely, and a stale
    # "*_optimized.pdf" left by an earlier run could be reported as success.
    if optimization_level not in save_options:
        return {
            "success": False,
            "error": f"Invalid optimization_level '{optimization_level}'. Must be one of: {', '.join(save_options)}",
            "optimization_time": round(time.time() - start_time, 2)
        }

    try:
        path = await validate_pdf_path(pdf_path)

        # Generate optimized filename
        optimized_path = path.parent / f"{path.stem}_optimized.pdf"
        original_size = path.stat().st_size

        # The context manager closes the document even if save() raises.
        with fitz.open(str(path)) as doc:
            doc.save(str(optimized_path), **save_options[optimization_level])

        # Defensive check that the output file was actually produced.
        if not optimized_path.exists():
            return {
                "success": False,
                "error": "Optimization failed - output file not created",
                "optimization_time": round(time.time() - start_time, 2)
            }

        optimized_size = optimized_path.stat().st_size
        size_reduction = original_size - optimized_size
        reduction_percent = (size_reduction / original_size) * 100 if original_size > 0 else 0

        return {
            "success": True,
            "optimization_summary": {
                "original_size_bytes": original_size,
                "optimized_size_bytes": optimized_size,
                "size_reduction_bytes": size_reduction,
                "reduction_percent": round(reduction_percent, 1),
                "optimization_level": optimization_level
            },
            "output_info": {
                "optimized_path": str(optimized_path),
                "original_path": str(path)
            },
            "optimization_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"PDF optimization failed: {error_msg}")
        return {
            "success": False,
            "error": error_msg,
            "optimization_time": round(time.time() - start_time, 2)
        }
|
|
|
|
@mcp_tool(
    name="repair_pdf",
    description="Attempt to repair corrupted or damaged PDF files"
)
async def repair_pdf(self, pdf_path: str) -> Dict[str, Any]:
    """
    Attempt to repair a corrupted or damaged PDF file.

    Every page is probed by extracting its text. If at least one page is
    readable, a cleaned copy of the document is written next to the input
    as ``<stem>_repaired.pdf``.

    Args:
        pdf_path: Path to PDF file to repair

    Returns:
        Dictionary containing repair results. On failure the dictionary has
        ``success`` set to False; this method does not raise.
    """
    start_time = time.time()

    try:
        path = await validate_pdf_path(pdf_path)

        # Generate repaired filename
        repaired_path = path.parent / f"{path.stem}_repaired.pdf"

        total_pages = 0
        readable_pages = 0
        corrupted_pages = []  # 1-based numbers of pages that could not be read
        repair_success = False
        repair_notes = ""

        # Attempt to open and repair the PDF
        try:
            # The context manager closes the document even if save() raises.
            with fitz.open(str(path)) as doc:
                total_pages = len(doc)

                for page_num in range(total_pages):
                    try:
                        # Try to get text to verify page integrity
                        doc[page_num].get_text()
                        readable_pages += 1
                    except Exception as page_error:
                        logger.warning(f"Page {page_num + 1} unreadable during repair: {page_error}")
                        corrupted_pages.append(page_num + 1)

                # If document is readable, save a clean copy
                if readable_pages > 0:
                    doc.save(str(repaired_path), garbage=4, deflate=True, clean=True)
                    repair_success = True
                    repair_notes = f"Successfully repaired: {readable_pages}/{total_pages} pages recovered"
                else:
                    repair_notes = "Document appears to be severely corrupted - no readable pages found"

        except Exception as open_error:
            # Document can't be opened (or saved) normally. The message is
            # sanitized because it is returned to the caller, matching the
            # other error paths in this file.
            repair_success = False
            repair_notes = f"Cannot open document: {sanitize_error_message(str(open_error))[:100]}"

        # Check repair results
        if repair_success and repaired_path.exists():
            repaired_size = repaired_path.stat().st_size
            original_size = path.stat().st_size

            return {
                "success": True,
                "repair_summary": {
                    "repair_successful": True,
                    "original_pages": total_pages,
                    "recovered_pages": readable_pages,
                    "corrupted_pages": len(corrupted_pages),
                    "recovery_rate_percent": round((readable_pages / total_pages) * 100, 1) if total_pages > 0 else 0
                },
                "file_info": {
                    "original_path": str(path),
                    "repaired_path": str(repaired_path),
                    "original_size_bytes": original_size,
                    "repaired_size_bytes": repaired_size
                },
                "repair_notes": repair_notes,
                "corrupted_page_numbers": corrupted_pages,
                "repair_time": round(time.time() - start_time, 2)
            }

        return {
            "success": False,
            "repair_summary": {
                "repair_successful": False,
                "error_details": repair_notes
            },
            "file_info": {
                "original_path": str(path)
            },
            "repair_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"PDF repair failed: {error_msg}")
        return {
            "success": False,
            "error": error_msg,
            "repair_time": round(time.time() - start_time, 2)
        }
|
|
|
|
@mcp_tool(
    name="rotate_pages",
    description="Rotate specific pages by 90, 180, or 270 degrees"
)
async def rotate_pages(
    self,
    pdf_path: str,
    rotation: int = 90,
    pages: Optional[str] = None,
    output_filename: str = "rotated_document.pdf"
) -> Dict[str, Any]:
    """
    Rotate specific pages in a PDF document.

    The rotation is applied relative to each page's current rotation and
    the result is written to ``output_filename`` in the same directory as
    the input file.

    Args:
        pdf_path: Path to input PDF file
        rotation: Rotation angle (90, 180, 270 degrees)
        pages: Page numbers to rotate (1-based, as accepted by
            parse_pages_parameter), None for all
        output_filename: Name for the output file. Must be a plain file
            name without directory components.

    Returns:
        Dictionary containing rotation results. On failure the dictionary
        has ``success`` set to False and an ``error`` message; this method
        does not raise.
    """
    start_time = time.time()

    try:
        # Validate inputs
        if rotation not in (90, 180, 270):
            return {
                "success": False,
                "error": "Rotation must be 90, 180, or 270 degrees",
                "rotation_time": round(time.time() - start_time, 2)
            }

        # The output name comes from the tool caller; refuse anything that
        # could escape the input file's directory.
        if (
            not output_filename
            or output_filename in (".", "..")
            or Path(output_filename).name != output_filename
        ):
            return {
                "success": False,
                "error": "output_filename must be a plain file name without directory components",
                "rotation_time": round(time.time() - start_time, 2)
            }

        path = await validate_pdf_path(pdf_path)
        output_path = path.parent / output_filename

        # The context manager closes the document on every exit path.
        with fitz.open(str(path)) as doc:
            total_pages = len(doc)

            # Parse pages parameter
            parsed_pages = parse_pages_parameter(pages)
            if pages and parsed_pages is None:
                return {
                    "success": False,
                    "error": "Invalid page numbers specified",
                    "rotation_time": round(time.time() - start_time, 2)
                }

            # Presumably parse_pages_parameter returns 0-based indices (they
            # are reported back below as p + 1) — verify against .utils.
            # Indices outside the document are dropped.
            page_numbers = parsed_pages if parsed_pages else list(range(total_pages))
            page_numbers = [p for p in page_numbers if 0 <= p < total_pages]

            # Rotate specified pages
            rotated = []
            for page_num in page_numbers:
                try:
                    page = doc[page_num]
                    # set_rotation() takes an absolute angle, so add the
                    # request to the page's existing rotation to rotate
                    # *by* the given amount.
                    page.set_rotation((page.rotation + rotation) % 360)
                    rotated.append(page_num)
                except Exception as e:
                    logger.warning(f"Failed to rotate page {page_num + 1}: {e}")

            # Save rotated document
            doc.save(str(output_path))
            output_size = output_path.stat().st_size

        pages_rotated = len(rotated)

        return {
            "success": True,
            "rotation_summary": {
                "rotation_degrees": rotation,
                "total_pages": total_pages,
                "pages_requested": len(page_numbers),
                "pages_rotated": pages_rotated,
                "pages_failed": len(page_numbers) - pages_rotated
            },
            "output_info": {
                "output_path": str(output_path),
                "output_size_bytes": output_size
            },
            # Only pages whose rotation actually succeeded, 1-based.
            "rotated_pages": [p + 1 for p in rotated],
            "rotation_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"Page rotation failed: {error_msg}")
        return {
            "success": False,
            "error": error_msg,
            "rotation_time": round(time.time() - start_time, 2)
        }
|
|
|
|
@mcp_tool(
    name="convert_to_images",
    description="Convert PDF pages to image files"
)
async def convert_to_images(
    self,
    pdf_path: str,
    pages: Optional[str] = None,
    dpi: int = 300,
    format: str = "png",
    output_prefix: str = "page"
) -> Dict[str, Any]:
    """
    Convert PDF pages to image files.

    One image per page is written next to the input file, named
    ``<output_prefix>_<page number, zero-padded to 3 digits>.<format>``.

    Args:
        pdf_path: Path to PDF file
        pages: Page numbers to convert (1-based, as accepted by
            parse_pages_parameter), None for all
        dpi: DPI for image rendering; must be positive
        format: Output image format ("png", "jpg", "jpeg"), case-insensitive
        output_prefix: Prefix for output image files. Must not contain
            directory components.

    Returns:
        Dictionary containing conversion results. On failure the dictionary
        has ``success`` set to False and an ``error`` message; this method
        does not raise.
    """
    start_time = time.time()

    try:
        # Validate inputs. Any other format used to be written as PNG data
        # under a mismatching file extension.
        image_format = format.lower() if isinstance(format, str) else ""
        if image_format not in ("png", "jpg", "jpeg"):
            return {
                "success": False,
                "error": "Format must be png, jpg, or jpeg",
                "conversion_time": round(time.time() - start_time, 2)
            }

        if dpi <= 0:
            return {
                "success": False,
                "error": "DPI must be a positive integer",
                "conversion_time": round(time.time() - start_time, 2)
            }

        # The prefix comes from the tool caller; make sure the generated
        # file names cannot point outside the input file's directory.
        probe_name = f"{output_prefix}_001.{format}"
        if Path(probe_name).name != probe_name:
            return {
                "success": False,
                "error": "output_prefix must not contain directory components",
                "conversion_time": round(time.time() - start_time, 2)
            }

        path = await validate_pdf_path(pdf_path)

        # PDF user space is 72 units per inch, so this scale renders at the
        # requested DPI. Built once; it is the same for every page.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        save_format = "JPEG" if image_format in ("jpg", "jpeg") else "PNG"

        converted_images = []

        # The context manager closes the document on every exit path.
        with fitz.open(str(path)) as doc:
            total_pages = len(doc)

            # Parse pages parameter
            parsed_pages = parse_pages_parameter(pages)
            if pages and parsed_pages is None:
                return {
                    "success": False,
                    "error": "Invalid page numbers specified",
                    "conversion_time": round(time.time() - start_time, 2)
                }

            # Presumably parse_pages_parameter returns 0-based indices (they
            # are reported back below as page_num + 1) — verify against
            # .utils. Indices outside the document are dropped.
            page_numbers = parsed_pages if parsed_pages else list(range(total_pages))
            page_numbers = [p for p in page_numbers if 0 <= p < total_pages]

            # Convert pages to images; a failing page is logged and skipped
            # so the remaining pages are still converted.
            for page_num in page_numbers:
                try:
                    pix = doc[page_num].get_pixmap(matrix=mat)

                    # Generate filename
                    image_filename = f"{output_prefix}_{page_num + 1:03d}.{format}"
                    image_path = path.parent / image_filename

                    # Save image
                    pix.save(str(image_path), save_format)

                    converted_images.append({
                        "page": page_num + 1,
                        "filename": image_filename,
                        "path": str(image_path),
                        "size_bytes": image_path.stat().st_size,
                        "dimensions": f"{pix.width}x{pix.height}"
                    })

                    # Drop the pixmap reference before rendering the next page.
                    pix = None

                except Exception as e:
                    logger.warning(f"Failed to convert page {page_num + 1}: {e}")

        pages_converted = len(converted_images)
        total_size = sum(img["size_bytes"] for img in converted_images)

        return {
            "success": True,
            "conversion_summary": {
                "pages_requested": len(page_numbers),
                "pages_converted": pages_converted,
                "pages_failed": len(page_numbers) - pages_converted,
                "output_format": format,
                "dpi": dpi,
                "total_output_size_bytes": total_size
            },
            "converted_images": converted_images,
            "file_info": {
                "input_path": str(path),
                "total_pages": total_pages
            },
            "conversion_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        error_msg = sanitize_error_message(str(e))
        logger.error(f"PDF to images conversion failed: {error_msg}")
        return {
            "success": False,
            "error": error_msg,
            "conversion_time": round(time.time() - start_time, 2)
        }
|
|
|
|
# Helper methods
|
|
def _calculate_text_similarity(self, text1: str, text2: str) -> float:
|
|
"""Calculate similarity between two texts (simplified)"""
|
|
if not text1 and not text2:
|
|
return 1.0
|
|
if not text1 or not text2:
|
|
return 0.0
|
|
|
|
# Simple character-based similarity
|
|
common_chars = sum(1 for c1, c2 in zip(text1, text2) if c1 == c2)
|
|
max_length = max(len(text1), len(text2))
|
|
|
|
return common_chars / max_length if max_length > 0 else 1.0 |