Remove incorrect 'await' keywords from validate_output_path() calls across all mixins

validate_output_path() is a synchronous function, not async. Fixed in 15 locations across 6 mixins:

- advanced_forms.py (4 calls)
- annotations.py (3 calls)
- document_assembly.py (2 calls)
- form_management.py (2 calls)
- image_processing.py (1 call)
- misc_tools.py (4 calls)

Error: 'object PosixPath can't be used in 'await' expression'
Root cause: Incorrectly awaiting synchronous Path validation function
Fix: Removed await keyword from all validate_output_path() calls

PyPI: https://pypi.org/project/mcp-pdf/2.0.6/
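
The failure mode, as a minimal sketch (variable names taken from the call sites in document_assembly.py below):

    # Before: raises "object PosixPath can't be used in 'await' expression"
    output_pdf_path = await validate_output_path(output_path)

    # After: validate_output_path() is synchronous and returns a Path directly
    output_pdf_path = validate_output_path(output_path)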
"""
|
|
Document Assembly Mixin - PDF merging, splitting, and page manipulation
|
|
Uses official fastmcp.contrib.mcp_mixin pattern
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, List
|
|
import logging
|
|
|
|
# PDF processing libraries
|
|
import fitz # PyMuPDF
|
|
|
|
# Official FastMCP mixin
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
|
|
from ..security import validate_pdf_path, validate_output_path, sanitize_error_message
|
|
|
|
logger = logging.getLogger(__name__)


class DocumentAssemblyMixin(MCPMixin):
    """
    Handles PDF document assembly operations including merging, splitting, and reordering.
    Uses the official FastMCP mixin pattern.
    """

    def __init__(self):
        super().__init__()
        self.max_file_size = 100 * 1024 * 1024  # 100MB

    @mcp_tool(
        name="merge_pdfs",
        description="Merge multiple PDFs into one document"
    )
    async def merge_pdfs(
        self,
        pdf_paths: str,
        output_path: str
    ) -> Dict[str, Any]:
        """
        Merge multiple PDF files into a single document.

        Args:
            pdf_paths: JSON string containing list of PDF file paths
            output_path: Path where merged PDF will be saved

        Returns:
            Dictionary containing merge results
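
        Example (illustrative paths, assuming both input files exist):
            await merge_pdfs('["a.pdf", "b.pdf"]', "merged.pdf")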
        """
        start_time = time.time()

        try:
            # Parse input paths
            try:
                paths_list = json.loads(pdf_paths)
            except json.JSONDecodeError as e:
                return {
                    "success": False,
                    "error": f"Invalid JSON in pdf_paths: {e}",
                    "merge_time": round(time.time() - start_time, 2)
                }

            if not isinstance(paths_list, list) or len(paths_list) < 2:
                return {
                    "success": False,
                    "error": "At least 2 PDF paths required for merging",
                    "merge_time": round(time.time() - start_time, 2)
                }

            # Validate output path
            output_pdf_path = validate_output_path(output_path)

            # Validate and open all input PDFs
            input_docs = []
            file_info = []

            for i, pdf_path in enumerate(paths_list):
                try:
                    validated_path = await validate_pdf_path(pdf_path)
                    doc = fitz.open(str(validated_path))
                    input_docs.append(doc)

                    file_info.append({
                        "index": i + 1,
                        "path": str(validated_path),
                        "pages": len(doc),
                        "size_bytes": validated_path.stat().st_size
                    })
                except Exception as e:
                    # Close any already opened docs
                    for opened_doc in input_docs:
                        opened_doc.close()
                    return {
                        "success": False,
                        "error": f"Failed to open PDF {i + 1}: {sanitize_error_message(str(e))}",
                        "merge_time": round(time.time() - start_time, 2)
                    }

            # Create merged document
            merged_doc = fitz.open()
            total_pages_merged = 0

            for i, doc in enumerate(input_docs):
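                # With no from_page/to_page arguments, insert_pdf() copies the
                # entire source document into merged_doc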
                try:
                    merged_doc.insert_pdf(doc)
                    total_pages_merged += len(doc)
                    logger.info(f"Merged document {i + 1}: {len(doc)} pages")
                except Exception as e:
                    logger.error(f"Failed to merge document {i + 1}: {e}")

            # Save merged document
            merged_doc.save(str(output_pdf_path))
            output_size = output_pdf_path.stat().st_size

            # Close all documents
            merged_doc.close()
            for doc in input_docs:
                doc.close()

            return {
                "success": True,
                "merge_summary": {
                    "input_files": len(paths_list),
                    "total_pages_merged": total_pages_merged,
                    "output_size_bytes": output_size,
                    "output_size_mb": round(output_size / (1024 * 1024), 2)
                },
                "input_files": file_info,
                "output_info": {
                    "output_path": str(output_pdf_path),
                    "total_pages": total_pages_merged
                },
                "merge_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error(f"PDF merge failed: {error_msg}")
            return {
                "success": False,
                "error": error_msg,
                "merge_time": round(time.time() - start_time, 2)
            }

    @mcp_tool(
        name="split_pdf",
        description="Split PDF into separate documents"
    )
    async def split_pdf(
        self,
        pdf_path: str,
        split_method: str = "pages"
    ) -> Dict[str, Any]:
        """
        Split PDF document into separate files.

        Args:
            pdf_path: Path to PDF file to split
            split_method: Method to use ("pages", "bookmarks", "ranges")

        Returns:
            Dictionary containing split results
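
        Example (illustrative path; "pages" writes one file per page):
            await split_pdf("report.pdf", split_method="pages")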
        """
        start_time = time.time()

        try:
            # Validate input path
            input_pdf_path = await validate_pdf_path(pdf_path)
            doc = fitz.open(str(input_pdf_path))
            total_pages = len(doc)

            if total_pages <= 1:
                doc.close()
                return {
                    "success": False,
                    "error": "PDF must have more than 1 page to split",
                    "split_time": round(time.time() - start_time, 2)
                }

            split_files = []
            base_path = input_pdf_path.parent
            base_name = input_pdf_path.stem

            if split_method == "pages":
                # Split into individual pages
                for page_num in range(total_pages):
                    output_path = base_path / f"{base_name}_page_{page_num + 1}.pdf"

                    page_doc = fitz.open()
                    page_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                    page_doc.save(str(output_path))
                    page_doc.close()

                    split_files.append({
                        "file_path": str(output_path),
                        "pages": 1,
                        "page_range": f"{page_num + 1}",
                        "size_bytes": output_path.stat().st_size
                    })

            elif split_method == "bookmarks":
                # Split by bookmarks/table of contents
                toc = doc.get_toc()
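                # Each get_toc() entry has the form [level, title, 1-based page number]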

                if not toc:
                    doc.close()
                    return {
                        "success": False,
                        "error": "No bookmarks found in PDF for bookmark-based splitting",
                        "split_time": round(time.time() - start_time, 2)
                    }

                # Create splits based on top-level bookmarks
                top_level_bookmarks = [item for item in toc if item[0] == 1]  # Level 1 bookmarks

                for i, bookmark in enumerate(top_level_bookmarks):
                    start_page = bookmark[2] - 1  # Convert to 0-based

                    # Determine end page
                    if i + 1 < len(top_level_bookmarks):
                        end_page = top_level_bookmarks[i + 1][2] - 2  # Convert to 0-based, inclusive
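                        # e.g. a next bookmark starting on 1-based page 5 means
                        # this section ends at 0-based index 3 (page 4)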
                    else:
                        end_page = total_pages - 1

                    if start_page <= end_page:
                        # Clean bookmark title for filename
                        clean_title = "".join(c for c in bookmark[1] if c.isalnum() or c in (' ', '-', '_')).strip()
                        clean_title = clean_title[:50]  # Limit length

                        output_path = base_path / f"{base_name}_{clean_title}.pdf"

                        split_doc = fitz.open()
                        split_doc.insert_pdf(doc, from_page=start_page, to_page=end_page)
                        split_doc.save(str(output_path))
                        split_doc.close()

                        split_files.append({
                            "file_path": str(output_path),
                            "pages": end_page - start_page + 1,
                            "page_range": f"{start_page + 1}-{end_page + 1}",
                            "bookmark_title": bookmark[1],
                            "size_bytes": output_path.stat().st_size
                        })

            elif split_method == "ranges":
                # Split into chunks of 10 pages each
                chunk_size = 10
                chunks = (total_pages + chunk_size - 1) // chunk_size
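                # Ceiling division: e.g. 25 pages -> chunks = 3 (10, 10, 5 pages)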

                for chunk in range(chunks):
                    start_page = chunk * chunk_size
                    end_page = min(start_page + chunk_size - 1, total_pages - 1)

                    output_path = base_path / f"{base_name}_pages_{start_page + 1}-{end_page + 1}.pdf"

                    chunk_doc = fitz.open()
                    chunk_doc.insert_pdf(doc, from_page=start_page, to_page=end_page)
                    chunk_doc.save(str(output_path))
                    chunk_doc.close()

                    split_files.append({
                        "file_path": str(output_path),
                        "pages": end_page - start_page + 1,
                        "page_range": f"{start_page + 1}-{end_page + 1}",
                        "size_bytes": output_path.stat().st_size
                    })

            else:
                # Unknown split_method: fail explicitly instead of returning
                # an empty "success" result
                doc.close()
                return {
                    "success": False,
                    "error": f"Unknown split_method: {split_method}. Use 'pages', 'bookmarks', or 'ranges'",
                    "split_time": round(time.time() - start_time, 2)
                }

            doc.close()

            total_output_size = sum(f["size_bytes"] for f in split_files)

            return {
                "success": True,
                "split_summary": {
                    "split_method": split_method,
                    "input_pages": total_pages,
                    "output_files": len(split_files),
                    "total_output_size_bytes": total_output_size,
                    "total_output_size_mb": round(total_output_size / (1024 * 1024), 2)
                },
                "split_files": split_files,
                "input_info": {
                    "input_path": str(input_pdf_path),
                    "total_pages": total_pages
                },
                "split_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error(f"PDF split failed: {error_msg}")
            return {
                "success": False,
                "error": error_msg,
                "split_time": round(time.time() - start_time, 2)
            }

    @mcp_tool(
        name="reorder_pdf_pages",
        description="Reorder pages in PDF document"
    )
    async def reorder_pdf_pages(
        self,
        pdf_path: str,
        page_order: str,
        output_path: str
    ) -> Dict[str, Any]:
        """
        Reorder pages in a PDF document according to specified order.

        Args:
            pdf_path: Path to input PDF file
            page_order: JSON string with new page order (1-based page numbers)
            output_path: Path where reordered PDF will be saved

        Returns:
            Dictionary containing reorder results
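
        Example (illustrative; '[3, 1, 2]' moves page 3 to the front):
            await reorder_pdf_pages("doc.pdf", "[3, 1, 2]", "reordered.pdf")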
        """
        start_time = time.time()

        try:
            # Validate paths
            input_pdf_path = await validate_pdf_path(pdf_path)
            output_pdf_path = validate_output_path(output_path)

            # Parse page order
            try:
                order_list = json.loads(page_order)
            except json.JSONDecodeError as e:
                return {
                    "success": False,
                    "error": f"Invalid JSON in page_order: {e}",
                    "reorder_time": round(time.time() - start_time, 2)
                }

            if not isinstance(order_list, list):
                return {
                    "success": False,
                    "error": "page_order must be a list of page numbers",
                    "reorder_time": round(time.time() - start_time, 2)
                }

            # Open input document
            input_doc = fitz.open(str(input_pdf_path))
            total_pages = len(input_doc)

            # Validate page numbers (convert to 0-based)
            valid_pages = []
            invalid_pages = []

            for page_num in order_list:
                try:
                    page_index = int(page_num) - 1  # Convert to 0-based
                    if 0 <= page_index < total_pages:
                        valid_pages.append(page_index)
                    else:
                        invalid_pages.append(page_num)
                except (ValueError, TypeError):
                    invalid_pages.append(page_num)

            if invalid_pages:
                input_doc.close()
                return {
                    "success": False,
                    "error": f"Invalid page numbers: {invalid_pages}. Pages must be between 1 and {total_pages}",
                    "reorder_time": round(time.time() - start_time, 2)
                }

            # Create reordered document
            output_doc = fitz.open()

            for page_index in valid_pages:
                try:
                    output_doc.insert_pdf(input_doc, from_page=page_index, to_page=page_index)
                except Exception as e:
                    logger.warning(f"Failed to copy page {page_index + 1}: {e}")

            # Save reordered document
            output_doc.save(str(output_pdf_path))
            output_size = output_pdf_path.stat().st_size

            input_doc.close()
            output_doc.close()

            return {
                "success": True,
                "reorder_summary": {
                    "input_pages": total_pages,
                    "output_pages": len(valid_pages),
                    "pages_reordered": len(valid_pages),
                    "output_size_bytes": output_size,
                    "output_size_mb": round(output_size / (1024 * 1024), 2)
                },
                "page_mapping": {
                    "original_order": list(range(1, total_pages + 1)),
                    "new_order": [p + 1 for p in valid_pages],
                    "pages_duplicated": len(valid_pages) - len(set(valid_pages)),
                    "pages_omitted": total_pages - len(set(valid_pages))
                },
                "output_info": {
                    "output_path": str(output_pdf_path),
                    "total_pages": len(valid_pages)
                },
                "reorder_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error(f"PDF page reorder failed: {error_msg}")
            return {
                "success": False,
                "error": error_msg,
                "reorder_time": round(time.time() - start_time, 2)
            }
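
# A minimal (hypothetical) registration sketch -- the server name and exact
# register_all() call style are assumptions, not part of this module:
#
#   from fastmcp import FastMCP
#
#   mcp = FastMCP("mcp-pdf")
#   DocumentAssemblyMixin().register_all(mcp_server=mcp)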