PROBLEM: Table extraction from large PDFs was exceeding MCP's 25,000-token limit, causing "response too large" errors. A 5-page PDF with large tables generated 59,005 tokens, more than double the allowed limit.

SOLUTION: Added flexible table data limiting with two new parameters:
- max_rows_per_table: Limit rows returned per table (prevents overflow)
- summary_only: Return only metadata without table data

IMPLEMENTATION:
1. Added the new parameters to the extract_tables() method signature
2. Created a _process_table_data() helper for consistent limiting logic
3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
4. Enhanced table metadata with truncation tracking:
   - total_rows: Full row count from the PDF
   - rows_returned: Actual rows in the response (after limiting)
   - rows_truncated: Number of rows omitted (if limited)

USAGE EXAMPLES:

# Summary mode - metadata only (smallest response)
extract_tables(pdf_path, pages="1-5", summary_only=True)

# Limited data - first 100 rows per table
extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)

# Full data (default behavior; may overflow on large tables)
extract_tables(pdf_path, pages="1-5")

BENEFITS:
- Prevents MCP token-overflow errors
- Maintains backward compatibility (the new params are optional)
- Clear guidance through metadata (shows when truncation occurred)
- Flexible: users choose between summary, limited, and full modes

FILES MODIFIED:
- src/mcp_pdf/mixins_official/table_extraction.py (all changes)
- src/mcp_pdf/server.py (version bump to 2.0.7)
- pyproject.toml (version bump to 2.0.7)

VERSION: 2.0.7
PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
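EXAMPLE RESPONSE (illustrative): a truncated-table response carries the metadata described above; the field names match the implementation below, but the concrete values here are hypothetical:

# Response for extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)
{
    "success": True,
    "tables_found": 1,
    "method_used": "camelot",
    "tables": [{
        "table_index": 1,
        "page": 2,               # hypothetical
        "accuracy": 98.5,        # hypothetical
        "total_rows": 450,       # full row count from the PDF
        "rows_returned": 100,    # capped by max_rows_per_table
        "rows_truncated": 350,   # rows omitted from the response
        "data": [...]            # first 100 rows in the requested format
    }],
    "file_info": {"path": "...", "pages_processed": "1-5"},
    "extraction_time": 1.42     # hypothetical
}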
"""
|
|
Table Extraction Mixin - PDF table extraction with intelligent method selection
|
|
Uses official fastmcp.contrib.mcp_mixin pattern
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, List
|
|
import logging
|
|
import json
|
|
|
|
# Table extraction libraries
|
|
import pandas as pd
|
|
import camelot
|
|
import tabula
|
|
import pdfplumber
|
|
|
|
# Official FastMCP mixin
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
|
|
from ..security import validate_pdf_path, sanitize_error_message
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TableExtractionMixin(MCPMixin):
    """
    Handles PDF table extraction operations with intelligent method selection.

    Uses the official FastMCP mixin pattern.
    """

    def __init__(self):
        super().__init__()
        self.max_file_size = 100 * 1024 * 1024  # 100 MB

    @mcp_tool(
        name="extract_tables",
        description="Extract tables from PDF with automatic method selection and intelligent fallbacks"
    )
    async def extract_tables(
        self,
        pdf_path: str,
        pages: Optional[str] = None,
        method: str = "auto",
        table_format: str = "json",
        max_rows_per_table: Optional[int] = None,
        summary_only: bool = False
    ) -> Dict[str, Any]:
"""
|
|
Extract tables from PDF using intelligent method selection.
|
|
|
|
Args:
|
|
pdf_path: Path to PDF file or HTTPS URL
|
|
pages: Page numbers to extract (comma-separated, 1-based), None for all
|
|
method: Extraction method ("auto", "camelot", "pdfplumber", "tabula")
|
|
table_format: Output format ("json", "csv", "html")
|
|
max_rows_per_table: Maximum rows to return per table (prevents token overflow)
|
|
summary_only: Return only table metadata without data (useful for large tables)
|
|
|
|
Returns:
|
|
Dictionary containing extracted tables and metadata
|
|
"""
|
|
        start_time = time.time()

        try:
            # Validate and prepare inputs
            path = await validate_pdf_path(pdf_path)
            parsed_pages = self._parse_pages_parameter(pages)

            if method == "auto":
                # Try methods in order of reliability
                methods_to_try = ["camelot", "pdfplumber", "tabula"]
            else:
                methods_to_try = [method]

            extraction_results = []
            method_used = None
            total_tables = 0

            for extraction_method in methods_to_try:
                try:
                    logger.info(f"Attempting table extraction with {extraction_method}")

                    if extraction_method == "camelot":
                        result = await self._extract_with_camelot(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                    elif extraction_method == "pdfplumber":
                        result = await self._extract_with_pdfplumber(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                    elif extraction_method == "tabula":
                        result = await self._extract_with_tabula(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                    else:
                        continue

                    if result.get("tables") and len(result["tables"]) > 0:
                        extraction_results = result["tables"]
                        total_tables = len(extraction_results)
                        method_used = extraction_method
                        logger.info(f"Successfully extracted {total_tables} tables with {extraction_method}")
                        break

                except Exception as e:
                    logger.warning(f"Table extraction failed with {extraction_method}: {e}")
                    continue

            if not extraction_results:
                return {
                    "success": False,
                    "error": "No tables found or all extraction methods failed",
                    "methods_tried": methods_to_try,
                    "extraction_time": round(time.time() - start_time, 2)
                }

            return {
                "success": True,
                "tables_found": total_tables,
                "tables": extraction_results,
                "method_used": method_used,
                "file_info": {
                    "path": str(path),
                    "pages_processed": pages or "all"
                },
                "extraction_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error(f"Table extraction failed: {error_msg}")
            return {
                "success": False,
                "error": error_msg,
                "extraction_time": round(time.time() - start_time, 2)
            }
    # Helper methods (synchronous)
    def _process_table_data(self, df, table_format: str, max_rows: Optional[int], summary_only: bool) -> Any:
        """Process table data with row limiting and summary options"""
        if summary_only:
            # Return None for data when in summary mode
            return None

        # Apply row limit if specified
        if max_rows and len(df) > max_rows:
            df_limited = df.head(max_rows)
        else:
            df_limited = df

        # Convert to requested format
        if table_format == "json":
            return df_limited.to_dict('records')
        elif table_format == "csv":
            return df_limited.to_csv(index=False)
        elif table_format == "html":
            return df_limited.to_html(index=False)
        else:
            return df_limited.to_dict('records')
    def _parse_pages_parameter(self, pages: Optional[str]) -> Optional[str]:
        """Parse pages parameter for the different extraction methods.

        Converts user input (supporting ranges like "11-30") into library format.
        """
        if not pages:
            return None

        try:
            # Use shared parser from utils to handle ranges
            from .utils import parse_pages_parameter
            parsed = parse_pages_parameter(pages)

            if parsed is None:
                return None

            # Convert 0-based indices back to 1-based for library format
            page_list = [p + 1 for p in parsed]
            return ','.join(map(str, page_list))
        except (ValueError, ImportError):
            return None
    async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str,
                                    max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
        """Extract tables using Camelot (best for complex tables)"""
        pages_param = pages if pages else "all"

        # Run camelot in a worker thread to avoid blocking the event loop
        def extract_camelot():
            return camelot.read_pdf(str(path), pages=pages_param, flavor='lattice')

        tables = await asyncio.get_running_loop().run_in_executor(None, extract_camelot)

        extracted_tables = []
        for i, table in enumerate(tables):
            # Process table data with limits
            table_data = self._process_table_data(table.df, table_format, max_rows, summary_only)

            table_info = {
                "table_index": i + 1,
                "page": table.page,
                "accuracy": round(table.accuracy, 2) if hasattr(table, 'accuracy') else None,
                "total_rows": len(table.df),
                "columns": len(table.df.columns),
            }

            # Only include data if not summary_only
            if not summary_only:
                table_info["data"] = table_data
                if max_rows and len(table.df) > max_rows:
                    table_info["rows_returned"] = max_rows
                    table_info["rows_truncated"] = len(table.df) - max_rows
                else:
                    table_info["rows_returned"] = len(table.df)

            extracted_tables.append(table_info)

        return {"tables": extracted_tables}
    async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str,
                                       max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
        """Extract tables using pdfplumber (good for simple tables)"""
        def extract_pdfplumber():
            extracted_tables = []
            with pdfplumber.open(str(path)) as pdf:
                pages_to_process = self._get_page_range(pdf, pages)

                for page_num in pages_to_process:
                    if page_num < len(pdf.pages):
                        page = pdf.pages[page_num]
                        tables = page.extract_tables()

                        for table in tables:
                            if table and len(table) > 0:
                                # Convert to DataFrame for consistent formatting;
                                # the first row is treated as the header
                                df = pd.DataFrame(table[1:], columns=table[0])

                                # Process table data with limits
                                table_data = self._process_table_data(df, table_format, max_rows, summary_only)

                                table_info = {
                                    "table_index": len(extracted_tables) + 1,
                                    "page": page_num + 1,
                                    "total_rows": len(df),
                                    "columns": len(df.columns),
                                }

                                # Only include data if not summary_only
                                if not summary_only:
                                    table_info["data"] = table_data
                                    if max_rows and len(df) > max_rows:
                                        table_info["rows_returned"] = max_rows
                                        table_info["rows_truncated"] = len(df) - max_rows
                                    else:
                                        table_info["rows_returned"] = len(df)

                                extracted_tables.append(table_info)

            return {"tables": extracted_tables}

        return await asyncio.get_running_loop().run_in_executor(None, extract_pdfplumber)
    async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str,
                                   max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
        """Extract tables using Tabula (Java-based, good for complex layouts)"""
        def extract_tabula():
            pages_param = pages if pages else "all"

            # Read tables with tabula
            tables = tabula.read_pdf(str(path), pages=pages_param, multiple_tables=True)

            extracted_tables = []
            for i, df in enumerate(tables):
                if not df.empty:
                    # Process table data with limits
                    table_data = self._process_table_data(df, table_format, max_rows, summary_only)

                    table_info = {
                        "table_index": i + 1,
                        "page": None,  # Tabula doesn't expose page numbers directly
                        "total_rows": len(df),
                        "columns": len(df.columns),
                    }

                    # Only include data if not summary_only
                    if not summary_only:
                        table_info["data"] = table_data
                        if max_rows and len(df) > max_rows:
                            table_info["rows_returned"] = max_rows
                            table_info["rows_truncated"] = len(df) - max_rows
                        else:
                            table_info["rows_returned"] = len(df)

                    extracted_tables.append(table_info)

            return {"tables": extracted_tables}

        return await asyncio.get_running_loop().run_in_executor(None, extract_tabula)
    def _get_page_range(self, pdf, pages: Optional[str]) -> List[int]:
        """Convert a comma-separated, 1-based pages string (as produced by
        _parse_pages_parameter) to a list of 0-based page indices."""
        if not pages:
            return list(range(len(pdf.pages)))

        try:
            if ',' in pages:
                return [int(p.strip()) - 1 for p in pages.split(',')]
            else:
                return [int(pages.strip()) - 1]
        except ValueError:
            # Fall back to all pages if the string can't be parsed
            return list(range(len(pdf.pages)))