""" Document Assembly Mixin - PDF merging, splitting, and page manipulation Uses official fastmcp.contrib.mcp_mixin pattern """ import asyncio import time import json from pathlib import Path from typing import Dict, Any, Optional, List import logging # PDF processing libraries import fitz # PyMuPDF # Official FastMCP mixin from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool from ..security import validate_pdf_path, validate_output_path, sanitize_error_message logger = logging.getLogger(__name__) class DocumentAssemblyMixin(MCPMixin): """ Handles PDF document assembly operations including merging, splitting, and reordering. Uses the official FastMCP mixin pattern. """ def __init__(self): super().__init__() self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="merge_pdfs", description="Merge multiple PDFs into one document" ) async def merge_pdfs( self, pdf_paths: str, output_path: str ) -> Dict[str, Any]: """ Merge multiple PDF files into a single document. Args: pdf_paths: JSON string containing list of PDF file paths output_path: Path where merged PDF will be saved Returns: Dictionary containing merge results """ start_time = time.time() try: # Parse input paths try: paths_list = json.loads(pdf_paths) except json.JSONDecodeError as e: return { "success": False, "error": f"Invalid JSON in pdf_paths: {e}", "merge_time": round(time.time() - start_time, 2) } if not isinstance(paths_list, list) or len(paths_list) < 2: return { "success": False, "error": "At least 2 PDF paths required for merging", "merge_time": round(time.time() - start_time, 2) } # Validate output path output_pdf_path = validate_output_path(output_path) # Validate and open all input PDFs input_docs = [] file_info = [] for i, pdf_path in enumerate(paths_list): try: validated_path = await validate_pdf_path(pdf_path) doc = fitz.open(str(validated_path)) input_docs.append(doc) file_info.append({ "index": i + 1, "path": str(validated_path), "pages": len(doc), "size_bytes": validated_path.stat().st_size }) except Exception as e: # Close any already opened docs for opened_doc in input_docs: opened_doc.close() return { "success": False, "error": f"Failed to open PDF {i + 1}: {sanitize_error_message(str(e))}", "merge_time": round(time.time() - start_time, 2) } # Create merged document merged_doc = fitz.open() total_pages_merged = 0 for i, doc in enumerate(input_docs): try: merged_doc.insert_pdf(doc) total_pages_merged += len(doc) logger.info(f"Merged document {i + 1}: {len(doc)} pages") except Exception as e: logger.error(f"Failed to merge document {i + 1}: {e}") # Save merged document merged_doc.save(str(output_pdf_path)) output_size = output_pdf_path.stat().st_size # Close all documents merged_doc.close() for doc in input_docs: doc.close() return { "success": True, "merge_summary": { "input_files": len(paths_list), "total_pages_merged": total_pages_merged, "output_size_bytes": output_size, "output_size_mb": round(output_size / (1024 * 1024), 2) }, "input_files": file_info, "output_info": { "output_path": str(output_pdf_path), "total_pages": total_pages_merged }, "merge_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"PDF merge failed: {error_msg}") return { "success": False, "error": error_msg, "merge_time": round(time.time() - start_time, 2) } @mcp_tool( name="split_pdf", description="Split PDF into separate documents" ) async def split_pdf( self, pdf_path: str, split_method: str = "pages" ) -> Dict[str, Any]: """ Split PDF document into separate files. Args: pdf_path: Path to PDF file to split split_method: Method to use ("pages", "bookmarks", "ranges") Returns: Dictionary containing split results """ start_time = time.time() try: # Validate input path input_pdf_path = await validate_pdf_path(pdf_path) doc = fitz.open(str(input_pdf_path)) total_pages = len(doc) if total_pages <= 1: doc.close() return { "success": False, "error": "PDF must have more than 1 page to split", "split_time": round(time.time() - start_time, 2) } split_files = [] base_path = input_pdf_path.parent base_name = input_pdf_path.stem if split_method == "pages": # Split into individual pages for page_num in range(total_pages): output_path = base_path / f"{base_name}_page_{page_num + 1}.pdf" page_doc = fitz.open() page_doc.insert_pdf(doc, from_page=page_num, to_page=page_num) page_doc.save(str(output_path)) page_doc.close() split_files.append({ "file_path": str(output_path), "pages": 1, "page_range": f"{page_num + 1}", "size_bytes": output_path.stat().st_size }) elif split_method == "bookmarks": # Split by bookmarks/table of contents toc = doc.get_toc() if not toc: doc.close() return { "success": False, "error": "No bookmarks found in PDF for bookmark-based splitting", "split_time": round(time.time() - start_time, 2) } # Create splits based on top-level bookmarks top_level_bookmarks = [item for item in toc if item[0] == 1] # Level 1 bookmarks for i, bookmark in enumerate(top_level_bookmarks): start_page = bookmark[2] - 1 # Convert to 0-based # Determine end page if i + 1 < len(top_level_bookmarks): end_page = top_level_bookmarks[i + 1][2] - 2 # Convert to 0-based, inclusive else: end_page = total_pages - 1 if start_page <= end_page: # Clean bookmark title for filename clean_title = "".join(c for c in bookmark[1] if c.isalnum() or c in (' ', '-', '_')).strip() clean_title = clean_title[:50] # Limit length output_path = base_path / f"{base_name}_{clean_title}.pdf" split_doc = fitz.open() split_doc.insert_pdf(doc, from_page=start_page, to_page=end_page) split_doc.save(str(output_path)) split_doc.close() split_files.append({ "file_path": str(output_path), "pages": end_page - start_page + 1, "page_range": f"{start_page + 1}-{end_page + 1}", "bookmark_title": bookmark[1], "size_bytes": output_path.stat().st_size }) elif split_method == "ranges": # Split into chunks of 10 pages each chunk_size = 10 chunks = (total_pages + chunk_size - 1) // chunk_size for chunk in range(chunks): start_page = chunk * chunk_size end_page = min(start_page + chunk_size - 1, total_pages - 1) output_path = base_path / f"{base_name}_pages_{start_page + 1}-{end_page + 1}.pdf" chunk_doc = fitz.open() chunk_doc.insert_pdf(doc, from_page=start_page, to_page=end_page) chunk_doc.save(str(output_path)) chunk_doc.close() split_files.append({ "file_path": str(output_path), "pages": end_page - start_page + 1, "page_range": f"{start_page + 1}-{end_page + 1}", "size_bytes": output_path.stat().st_size }) doc.close() total_output_size = sum(f["size_bytes"] for f in split_files) return { "success": True, "split_summary": { "split_method": split_method, "input_pages": total_pages, "output_files": len(split_files), "total_output_size_bytes": total_output_size, "total_output_size_mb": round(total_output_size / (1024 * 1024), 2) }, "split_files": split_files, "input_info": { "input_path": str(input_pdf_path), "total_pages": total_pages }, "split_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"PDF split failed: {error_msg}") return { "success": False, "error": error_msg, "split_time": round(time.time() - start_time, 2) } @mcp_tool( name="reorder_pdf_pages", description="Reorder pages in PDF document" ) async def reorder_pdf_pages( self, pdf_path: str, page_order: str, output_path: str ) -> Dict[str, Any]: """ Reorder pages in a PDF document according to specified order. Args: pdf_path: Path to input PDF file page_order: JSON string with new page order (1-based page numbers) output_path: Path where reordered PDF will be saved Returns: Dictionary containing reorder results """ start_time = time.time() try: # Validate paths input_pdf_path = await validate_pdf_path(pdf_path) output_pdf_path = validate_output_path(output_path) # Parse page order try: order_list = json.loads(page_order) except json.JSONDecodeError as e: return { "success": False, "error": f"Invalid JSON in page_order: {e}", "reorder_time": round(time.time() - start_time, 2) } if not isinstance(order_list, list): return { "success": False, "error": "page_order must be a list of page numbers", "reorder_time": round(time.time() - start_time, 2) } # Open input document input_doc = fitz.open(str(input_pdf_path)) total_pages = len(input_doc) # Validate page numbers (convert to 0-based) valid_pages = [] invalid_pages = [] for page_num in order_list: try: page_index = int(page_num) - 1 # Convert to 0-based if 0 <= page_index < total_pages: valid_pages.append(page_index) else: invalid_pages.append(page_num) except (ValueError, TypeError): invalid_pages.append(page_num) if invalid_pages: input_doc.close() return { "success": False, "error": f"Invalid page numbers: {invalid_pages}. Pages must be between 1 and {total_pages}", "reorder_time": round(time.time() - start_time, 2) } # Create reordered document output_doc = fitz.open() for page_index in valid_pages: try: output_doc.insert_pdf(input_doc, from_page=page_index, to_page=page_index) except Exception as e: logger.warning(f"Failed to copy page {page_index + 1}: {e}") # Save reordered document output_doc.save(str(output_pdf_path)) output_size = output_pdf_path.stat().st_size input_doc.close() output_doc.close() return { "success": True, "reorder_summary": { "input_pages": total_pages, "output_pages": len(valid_pages), "pages_reordered": len(valid_pages), "output_size_bytes": output_size, "output_size_mb": round(output_size / (1024 * 1024), 2) }, "page_mapping": { "original_order": list(range(1, total_pages + 1)), "new_order": [p + 1 for p in valid_pages], "pages_duplicated": len(valid_pages) - len(set(valid_pages)), "pages_omitted": total_pages - len(set(valid_pages)) }, "output_info": { "output_path": str(output_pdf_path), "total_pages": len(valid_pages) }, "reorder_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"PDF page reorder failed: {error_msg}") return { "success": False, "error": error_msg, "reorder_time": round(time.time() - start_time, 2) }