"""
Text Extraction Mixin - PDF text extraction and OCR capabilities
"""

import os
import tempfile
import time
from pathlib import Path
from typing import Dict, Any, List, Optional
import logging

# PDF processing libraries
import fitz  # PyMuPDF
import pdfplumber
import pypdf
import pytesseract
from pdf2image import convert_from_path

from .base import MCPMixin, mcp_tool
from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message

logger = logging.getLogger(__name__)


class TextExtractionMixin(MCPMixin):
    """
    Handles all PDF text extraction and OCR operations.

    Tools provided:
    - extract_text: Intelligent text extraction with method selection
    - ocr_pdf: OCR processing for scanned documents
    - is_scanned_pdf: Detect if PDF is scanned/image-based
    """

    def get_mixin_name(self) -> str:
        """Return the name under which this mixin is registered."""
        return "TextExtraction"

    def get_required_permissions(self) -> List[str]:
        """Return the permission names this mixin declares it needs."""
        return ["read_files", "ocr_processing"]

    def _setup(self):
        """Initialize text extraction specific configuration"""
        # Read from the environment; both fall back to the same values the
        # extract_text tool uses as its parameter defaults.
        self.max_chunk_pages = int(os.getenv("PDF_CHUNK_PAGES", "10"))
        self.max_tokens_per_chunk = int(os.getenv("PDF_MAX_TOKENS_CHUNK", "20000"))

    @mcp_tool(
        name="extract_text",
        description="Extract text from PDF with intelligent method selection and automatic chunking for large files"
    )
    async def extract_text(
        self,
        pdf_path: str,
        method: str = "auto",
        pages: Optional[str] = None,
        preserve_layout: bool = False,
        max_tokens: int = 20000,
        chunk_pages: int = 10
    ) -> Dict[str, Any]:
        """
        Extract text from PDF with intelligent method selection and automatic chunking.

        Args:
            pdf_path: Path to PDF file or URL
            method: Extraction method ("auto", "pymupdf", "pdfplumber", "pypdf")
            pages: Page specification (e.g., "1-5,10,15-20" or "all")
            preserve_layout: Whether to preserve text layout and formatting
            max_tokens: Maximum tokens to prevent MCP overflow (default 20000).
                Must be positive; values above 24000 are capped.
            chunk_pages: Number of pages per chunk for large PDFs
                (values below 1 are treated as 1)

        Returns:
            Dictionary with extracted text, metadata, and processing info.
            On failure the dictionary has "success": False and an "error"
            message instead of raising.
        """
        start_time = time.time()

        try:
            if max_tokens < 1:
                # A non-positive budget would otherwise lead to negative
                # slicing / division by zero further down.
                raise ValueError("max_tokens must be a positive integer")

            # Validate inputs using centralized security functions
            path = await validate_pdf_path(pdf_path)
            parsed_pages = parse_pages_parameter(pages)

            # Auto-select method based on PDF characteristics
            if method == "auto":
                if self._detect_scanned_pdf(str(path)):
                    return {
                        "success": False,
                        "error": "Scanned PDF detected. Please use the OCR tool for this file.",
                        "is_scanned": True,
                        "processing_time": round(time.time() - start_time, 2)
                    }
                method = "pymupdf"  # Default to PyMuPDF for text-based PDFs

            # Get PDF metadata and size analysis
            doc = fitz.open(str(path))
            try:
                total_pages = len(doc)
                file_size_bytes = path.stat().st_size if path.is_file() else 0
                file_size_mb = file_size_bytes / (1024 * 1024) if file_size_bytes > 0 else 0

                # Sample the first pages to estimate the text density
                sample_pages = min(3, total_pages)
                sample_text = "".join(
                    doc[page_num].get_text() for page_num in range(sample_pages)
                )

                avg_chars_per_page = len(sample_text) / sample_pages if sample_pages > 0 else 0
                estimated_total_chars = avg_chars_per_page * total_pages
                # Rough heuristic used throughout this method: ~4 chars per token
                estimated_tokens_by_density = int(estimated_total_chars / 4)

                doc_metadata = doc.metadata or {}
                metadata = {
                    "pages": total_pages,
                    "title": doc_metadata.get("title", ""),
                    "author": doc_metadata.get("author", ""),
                    "file_size_mb": round(file_size_mb, 2),
                    "avg_chars_per_page": int(avg_chars_per_page),
                    "estimated_total_chars": int(estimated_total_chars),
                    "estimated_tokens_by_density": estimated_tokens_by_density
                }
            finally:
                # Always release the document, even if analysis fails
                doc.close()

            # Enforce MCP hard limit
            effective_max_tokens = min(max_tokens, 24000)

            # Determine pages to extract
            if parsed_pages:
                pages_to_extract = parsed_pages
            else:
                pages_to_extract = list(range(total_pages))

            # Extract text using selected method
            text = self._extract_with_method(method, path, pages_to_extract, preserve_layout)

            # Estimate token count
            estimated_tokens = len(text) // 4

            # Handle large responses with intelligent chunking
            if estimated_tokens > effective_max_tokens:
                chars_per_chunk = effective_max_tokens * 4
                # Guard against zero/negative chunk sizes from the caller
                pages_per_chunk = max(1, chunk_pages)

                if len(pages_to_extract) > pages_per_chunk:
                    # Multiple page chunks
                    chunk_page_ranges = [
                        pages_to_extract[i:i + pages_per_chunk]
                        for i in range(0, len(pages_to_extract), pages_per_chunk)
                    ]
                    first_chunk = chunk_page_ranges[0]

                    # Extract first chunk
                    chunk_text = self._extract_with_method(
                        method, path, first_chunk, preserve_layout
                    )

                    # A single chunk of dense pages can still exceed the
                    # budget; trim it the same way as the single-chunk path.
                    chunk_truncated = len(chunk_text) // 4 > effective_max_tokens
                    if chunk_truncated:
                        chunk_text = self._truncate_at_sentence(chunk_text, chars_per_chunk)

                    # NOTE(review): the page values here are the same indices
                    # used for extraction; confirm that parse_pages_parameter
                    # maps them back to the same pages when re-submitted.
                    next_chunk_command = None
                    if len(chunk_page_ranges) > 1:
                        next_pages = ",".join(map(str, chunk_page_ranges[1]))
                        next_chunk_command = (
                            f"Use pages parameter: \"{next_pages}\" for chunk 2"
                        )

                    return {
                        "success": True,
                        "text": chunk_text,
                        "method_used": method,
                        "metadata": metadata,
                        "pages_extracted": first_chunk,
                        "processing_time": round(time.time() - start_time, 2),
                        "chunking_info": {
                            "is_chunked": True,
                            "is_truncated": chunk_truncated,
                            "current_chunk": 1,
                            "total_chunks": len(chunk_page_ranges),
                            "chunk_page_ranges": chunk_page_ranges,
                            "reason": "Large PDF automatically chunked to prevent token overflow",
                            "next_chunk_command": next_chunk_command
                        }
                    }
                else:
                    # Single chunk but too much text - truncate
                    truncated_text = self._truncate_at_sentence(text, chars_per_chunk)

                    return {
                        "success": True,
                        "text": truncated_text,
                        "method_used": method,
                        "metadata": metadata,
                        "pages_extracted": pages_to_extract,
                        "processing_time": round(time.time() - start_time, 2),
                        "chunking_info": {
                            "is_truncated": True,
                            "original_estimated_tokens": estimated_tokens,
                            "returned_estimated_tokens": len(truncated_text) // 4,
                            "truncation_percentage": round((len(truncated_text) / len(text)) * 100, 1)
                        }
                    }

            # Normal response
            return {
                "success": True,
                "text": text,
                "method_used": method,
                "metadata": metadata,
                "pages_extracted": pages_to_extract,
                "character_count": len(text),
                "word_count": len(text.split()),
                "processing_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            # Tool boundary: report the failure as a structured response
            error_msg = sanitize_error_message(str(e))
            logger.error("Text extraction failed: %s", error_msg)
            return {
                "success": False,
                "error": error_msg,
                "method_attempted": method,
                "processing_time": round(time.time() - start_time, 2)
            }

    @mcp_tool(
        name="ocr_pdf",
        description="Perform OCR on scanned PDFs with preprocessing options"
    )
    async def ocr_pdf(
        self,
        pdf_path: str,
        languages: List[str] = ["eng"],
        preprocess: bool = True,
        dpi: int = 300,
        pages: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Perform OCR on scanned PDF documents.

        Args:
            pdf_path: Path to PDF file or URL
            languages: List of language codes for OCR (e.g., ["eng", "fra"]).
                An empty list falls back to ["eng"].
            preprocess: Whether to preprocess images for better OCR
            dpi: DPI for PDF to image conversion
            pages: Specific pages to OCR

        Returns:
            Dictionary containing OCR text and metadata. Pages whose OCR
            fails contribute an empty string; on overall failure the
            dictionary has "success": False and an "error" message.
        """
        start_time = time.time()

        # The signature keeps its list default for interface compatibility;
        # copy it so the shared default object is never handed to callers.
        ocr_languages = list(languages) if languages else ["eng"]

        try:
            # Validate inputs using centralized security functions
            path = await validate_pdf_path(pdf_path)
            parsed_pages = parse_pages_parameter(pages)

            # Join languages for tesseract (invariant across pages)
            lang_string = '+'.join(ocr_languages)

            ocr_texts = []

            # Convert PDF pages to images; OCR runs while the temporary
            # directory holding the rendered images still exists.
            with tempfile.TemporaryDirectory() as temp_dir:
                # (1-based page number, image) pairs, for accurate log messages
                numbered_images = []
                if parsed_pages:
                    for page_num in parsed_pages:
                        page_images = convert_from_path(
                            str(path),
                            dpi=dpi,
                            first_page=page_num+1,
                            last_page=page_num+1,
                            output_folder=temp_dir
                        )
                        numbered_images.extend(
                            (page_num + 1, page_image) for page_image in page_images
                        )
                else:
                    all_images = convert_from_path(str(path), dpi=dpi, output_folder=temp_dir)
                    numbered_images.extend(enumerate(all_images, start=1))

                # Perform OCR on each page
                for page_number, image in numbered_images:
                    # Preprocess image if requested
                    if preprocess:
                        # Convert to grayscale for better OCR
                        image = image.convert('L')

                    # Perform OCR; a failing page must not abort the others
                    try:
                        ocr_texts.append(
                            pytesseract.image_to_string(image, lang=lang_string)
                        )
                    except Exception as e:
                        logger.warning("OCR failed for page %s: %s", page_number, e)
                        ocr_texts.append("")

                pages_processed = len(numbered_images)

            full_text = "\n\n".join(ocr_texts)

            return {
                "success": True,
                "text": full_text,
                "pages_processed": pages_processed,
                "languages": ocr_languages,
                "dpi": dpi,
                "preprocessed": preprocess,
                "character_count": len(full_text),
                "processing_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            # Tool boundary: report the failure as a structured response
            error_msg = sanitize_error_message(str(e))
            logger.error("OCR processing failed: %s", error_msg)
            return {
                "success": False,
                "error": error_msg,
                "processing_time": round(time.time() - start_time, 2)
            }

    @mcp_tool(
        name="is_scanned_pdf",
        description="Detect if a PDF is scanned/image-based rather than text-based"
    )
    async def is_scanned_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """
        Analyze PDF to determine if it's scanned/image-based.

        Args:
            pdf_path: Path to PDF file or URL

        Returns:
            Dictionary with scan detection results and recommendations.
            On failure the dictionary has "success": False and an "error"
            message.
        """
        try:
            # Validate inputs using centralized security functions
            path = await validate_pdf_path(pdf_path)

            is_scanned = self._detect_scanned_pdf(str(path))
            doc_info = self._get_document_info(path)

            return {
                "success": True,
                "is_scanned": is_scanned,
                "confidence": "high" if is_scanned else "medium",
                "recommendation": "Use OCR extraction" if is_scanned else "Use text extraction",
                "page_count": doc_info.get("page_count", 0),
                "file_size": doc_info.get("file_size", 0)
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            # Logged for consistency with the other tools in this mixin
            logger.error("Scanned PDF detection failed: %s", error_msg)
            return {
                "success": False,
                "error": error_msg
            }

    # Private helper methods (all synchronous for proper async pattern)

    def _detect_scanned_pdf(self, pdf_path: str) -> bool:
        """
        Detect if a PDF is scanned (image-based).

        Returns False as soon as one of the first three pages yields more
        than 50 non-whitespace-trimmed characters of text; otherwise True.
        A PDF that cannot be opened or parsed is also reported as True.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # Check first few pages for text
                for page in pdf.pages[:3]:
                    text = page.extract_text()
                    if text and len(text.strip()) > 50:
                        return False
                return True
        except Exception as e:
            # Best effort: keep treating unreadable files as scanned, but
            # leave a trace so the cause is not lost.
            logger.warning("Scanned PDF detection could not read the file: %s", e)
            return True

    def _extract_with_method(
        self,
        method: str,
        pdf_path: Path,
        pages: Optional[List[int]],
        preserve_layout: bool
    ) -> str:
        """
        Dispatch text extraction to the backend named by ``method``.

        Raises:
            ValueError: If ``method`` is not "pymupdf", "pdfplumber" or "pypdf".
        """
        if method == "pymupdf":
            return self._extract_with_pymupdf(pdf_path, pages, preserve_layout)
        if method == "pdfplumber":
            return self._extract_with_pdfplumber(pdf_path, pages, preserve_layout)
        if method == "pypdf":
            return self._extract_with_pypdf(pdf_path, pages, preserve_layout)
        raise ValueError(f"Unknown extraction method: {method}")

    @staticmethod
    def _truncate_at_sentence(text: str, max_chars: int) -> str:
        """
        Cut ``text`` to at most ``max_chars`` characters.

        If a sentence end ('. ') lies within the last 20% of the cut text,
        the cut is moved back to just after that period.
        """
        truncated_text = text[:max_chars]
        last_sentence = truncated_text.rfind('. ')
        if last_sentence > max_chars * 0.8:
            truncated_text = truncated_text[:last_sentence + 1]
        return truncated_text

    def _extract_with_pymupdf(self, pdf_path: Path, pages: Optional[List[int]] = None, preserve_layout: bool = False) -> str:
        """
        Extract text using PyMuPDF.

        ``pages`` holds page indices used to index the document; when it is
        None or empty, every page is extracted. Page texts are joined with
        a blank line.
        """
        doc = fitz.open(str(pdf_path))
        text_parts = []

        try:
            page_range = pages if pages else range(len(doc))

            for page_num in page_range:
                page = doc[page_num]
                # NOTE(review): "text" appears to be get_text()'s default
                # mode, so both branches likely produce the same output and
                # preserve_layout has no effect here - confirm.
                if preserve_layout:
                    text_parts.append(page.get_text("text"))
                else:
                    text_parts.append(page.get_text())
        finally:
            doc.close()

        return "\n\n".join(text_parts)

    def _extract_with_pdfplumber(self, pdf_path: Path, pages: Optional[List[int]] = None, preserve_layout: bool = False) -> str:
        """
        Extract text using pdfplumber.

        ``pages`` holds page indices; None or empty means every page.
        Pages that yield no text are skipped. ``preserve_layout`` is passed
        to pdfplumber's ``extract_text(layout=...)``.
        """
        text_parts = []

        with pdfplumber.open(str(pdf_path)) as pdf:
            page_range = pages if pages else range(len(pdf.pages))

            for page_num in page_range:
                text = pdf.pages[page_num].extract_text(layout=preserve_layout)
                if text:
                    text_parts.append(text)

        return "\n\n".join(text_parts)

    def _extract_with_pypdf(self, pdf_path: Path, pages: Optional[List[int]] = None, preserve_layout: bool = False) -> str:
        """
        Extract text using pypdf.

        ``pages`` holds page indices; None or empty means every page.
        Pages that yield no text are skipped. ``preserve_layout`` is
        accepted for signature parity with the other backends but is not
        used by this one.
        """
        reader = pypdf.PdfReader(str(pdf_path))
        text_parts = []

        page_range = pages if pages else range(len(reader.pages))

        for page_num in page_range:
            text = reader.pages[page_num].extract_text()
            if text:
                text_parts.append(text)

        return "\n\n".join(text_parts)

    def _get_document_info(self, pdf_path: Path) -> Dict[str, Any]:
        """
        Get basic document information.

        Returns a dict with "page_count" and "file_size" (bytes). Both are
        0 if the document cannot be opened or inspected.
        """
        try:
            doc = fitz.open(str(pdf_path))
            try:
                return {
                    "page_count": len(doc),
                    "file_size": pdf_path.stat().st_size
                }
            finally:
                # Release the document even if stat() fails
                doc.close()
        except Exception as e:
            # Best effort: callers fall back to zeros, but record the cause
            logger.warning("Could not read document info: %s", e)
            return {"page_count": 0, "file_size": 0}