PROBLEM: Table extraction from large PDFs was exceeding MCP's 25,000-token limit, causing "response too large" errors. A 5-page PDF with large tables generated 59,005 tokens, more than double the allowed limit.

SOLUTION: Added flexible table data limiting with two new parameters:
- max_rows_per_table: Limit rows returned per table (prevents overflow)
- summary_only: Return only metadata without table data

IMPLEMENTATION:
1. Added the new parameters to the extract_tables() method signature
2. Created a _process_table_data() helper for consistent limiting logic
3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
4. Enhanced table metadata with truncation tracking:
   - total_rows: Full row count from the PDF
   - rows_returned: Actual rows in the response (after limiting)
   - rows_truncated: Number of rows omitted (if limited)

USAGE EXAMPLES:

# Summary mode - metadata only (smallest response)
extract_tables(pdf_path, pages="1-5", summary_only=True)

# Limited data - first 100 rows per table
extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)

# Full data (default behavior; may overflow on large tables)
extract_tables(pdf_path, pages="1-5")

BENEFITS:
- Prevents MCP token-overflow errors
- Maintains backward compatibility (the new params are optional)
- Clear guidance through metadata (shows when truncation occurred)
- Flexible: users choose between summary, limited, and full modes

FILES MODIFIED:
- src/mcp_pdf/mixins_official/table_extraction.py (all changes)
- src/mcp_pdf/server.py (version bump to 2.0.7)
- pyproject.toml (version bump to 2.0.7)

VERSION: 2.0.7
PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
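EXAMPLE RESPONSE (illustrative): a truncated-table response carries the metadata described above; the field names match the implementation below, but the concrete values here are hypothetical:

# Response for extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)
{
    "success": True,
    "tables_found": 1,
    "method_used": "camelot",
    "tables": [{
        "table_index": 1,
        "page": 2,               # hypothetical
        "accuracy": 98.5,        # hypothetical
        "total_rows": 450,       # full row count from the PDF
        "rows_returned": 100,    # capped by max_rows_per_table
        "rows_truncated": 350,   # rows omitted from the response
        "data": [...]            # first 100 rows in the requested format
    }],
    "file_info": {"path": "...", "pages_processed": "1-5"},
    "extraction_time": 1.42     # hypothetical
}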
"""
|
|
Table Extraction Mixin - PDF table extraction with intelligent method selection
|
|
Uses official fastmcp.contrib.mcp_mixin pattern
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional, List
|
|
import logging
|
|
import json
|
|
|
|
# Table extraction libraries
|
|
import pandas as pd
|
|
import camelot
|
|
import tabula
|
|
import pdfplumber
|
|
|
|
# Official FastMCP mixin
|
|
from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool
|
|
|
|
from ..security import validate_pdf_path, sanitize_error_message
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TableExtractionMixin(MCPMixin):
    """
    Handles PDF table extraction operations with intelligent method selection.

    Uses the official FastMCP mixin pattern.
    """

    def __init__(self):
        super().__init__()
        self.max_file_size = 100 * 1024 * 1024  # 100 MB

    @mcp_tool(
        name="extract_tables",
        description="Extract tables from PDF with automatic method selection and intelligent fallbacks"
    )
    async def extract_tables(
        self,
        pdf_path: str,
        pages: Optional[str] = None,
        method: str = "auto",
        table_format: str = "json",
        max_rows_per_table: Optional[int] = None,
        summary_only: bool = False
    ) -> Dict[str, Any]:
"""
|
|
Extract tables from PDF using intelligent method selection.
|
|
|
|
Args:
|
|
pdf_path: Path to PDF file or HTTPS URL
|
|
pages: Page numbers to extract (comma-separated, 1-based), None for all
|
|
method: Extraction method ("auto", "camelot", "pdfplumber", "tabula")
|
|
table_format: Output format ("json", "csv", "html")
|
|
max_rows_per_table: Maximum rows to return per table (prevents token overflow)
|
|
summary_only: Return only table metadata without data (useful for large tables)
|
|
|
|
Returns:
|
|
Dictionary containing extracted tables and metadata
|
|
"""
|
|
        start_time = time.time()

        try:
            # Validate and prepare inputs
            path = await validate_pdf_path(pdf_path)
            parsed_pages = self._parse_pages_parameter(pages)

            if method == "auto":
                # Try methods in order of reliability
                methods_to_try = ["camelot", "pdfplumber", "tabula"]
            else:
                methods_to_try = [method]

            extraction_results = []
            method_used = None
            total_tables = 0

            for extraction_method in methods_to_try:
                try:
                    logger.info(f"Attempting table extraction with {extraction_method}")

                    if extraction_method == "camelot":
                        result = await self._extract_with_camelot(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                    elif extraction_method == "pdfplumber":
                        result = await self._extract_with_pdfplumber(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                    elif extraction_method == "tabula":
                        result = await self._extract_with_tabula(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                    else:
                        continue

                    if result.get("tables") and len(result["tables"]) > 0:
                        extraction_results = result["tables"]
                        total_tables = len(extraction_results)
                        method_used = extraction_method
                        logger.info(f"Successfully extracted {total_tables} tables with {extraction_method}")
                        break

                except Exception as e:
                    logger.warning(f"Table extraction failed with {extraction_method}: {e}")
                    continue

            if not extraction_results:
                return {
                    "success": False,
                    "error": "No tables found or all extraction methods failed",
                    "methods_tried": methods_to_try,
                    "extraction_time": round(time.time() - start_time, 2)
                }

            return {
                "success": True,
                "tables_found": total_tables,
                "tables": extraction_results,
                "method_used": method_used,
                "file_info": {
                    "path": str(path),
                    "pages_processed": pages or "all"
                },
                "extraction_time": round(time.time() - start_time, 2)
            }

        except Exception as e:
            error_msg = sanitize_error_message(str(e))
            logger.error(f"Table extraction failed: {error_msg}")
            return {
                "success": False,
                "error": error_msg,
                "extraction_time": round(time.time() - start_time, 2)
            }
    # Helper methods (synchronous)
    def _process_table_data(self, df, table_format: str, max_rows: Optional[int], summary_only: bool) -> Any:
        """Process table data with row limiting and summary options"""
        if summary_only:
            # Return None for data when in summary mode
            return None

        # Apply row limit if specified
        if max_rows and len(df) > max_rows:
            df_limited = df.head(max_rows)
        else:
            df_limited = df

        # Convert to requested format
        if table_format == "json":
            return df_limited.to_dict('records')
        elif table_format == "csv":
            return df_limited.to_csv(index=False)
        elif table_format == "html":
            return df_limited.to_html(index=False)
        else:
            return df_limited.to_dict('records')
    def _parse_pages_parameter(self, pages: Optional[str]) -> Optional[str]:
        """Parse pages parameter for the different extraction methods.

        Converts user input (supporting ranges like "11-30") into library format.
        """
        if not pages:
            return None

        try:
            # Use shared parser from utils to handle ranges
            from .utils import parse_pages_parameter
            parsed = parse_pages_parameter(pages)

            if parsed is None:
                return None

            # Convert 0-based indices back to 1-based for library format
            page_list = [p + 1 for p in parsed]
            return ','.join(map(str, page_list))
        except (ValueError, ImportError):
            return None
    async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str,
                                    max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
        """Extract tables using Camelot (best for complex tables)"""
        pages_param = pages if pages else "all"

        # Run camelot in a worker thread to avoid blocking the event loop
        def extract_camelot():
            return camelot.read_pdf(str(path), pages=pages_param, flavor='lattice')

        tables = await asyncio.get_running_loop().run_in_executor(None, extract_camelot)

        extracted_tables = []
        for i, table in enumerate(tables):
            # Process table data with limits
            table_data = self._process_table_data(table.df, table_format, max_rows, summary_only)

            table_info = {
                "table_index": i + 1,
                "page": table.page,
                "accuracy": round(table.accuracy, 2) if hasattr(table, 'accuracy') else None,
                "total_rows": len(table.df),
                "columns": len(table.df.columns),
            }

            # Only include data if not summary_only
            if not summary_only:
                table_info["data"] = table_data
                if max_rows and len(table.df) > max_rows:
                    table_info["rows_returned"] = max_rows
                    table_info["rows_truncated"] = len(table.df) - max_rows
                else:
                    table_info["rows_returned"] = len(table.df)

            extracted_tables.append(table_info)

        return {"tables": extracted_tables}
    async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str,
                                       max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
        """Extract tables using pdfplumber (good for simple tables)"""
        def extract_pdfplumber():
            extracted_tables = []
            with pdfplumber.open(str(path)) as pdf:
                pages_to_process = self._get_page_range(pdf, pages)

                for page_num in pages_to_process:
                    if page_num < len(pdf.pages):
                        page = pdf.pages[page_num]
                        tables = page.extract_tables()

                        for table in tables:
                            if table and len(table) > 0:
                                # Convert to DataFrame for consistent formatting;
                                # the first row is treated as the header
                                df = pd.DataFrame(table[1:], columns=table[0])

                                # Process table data with limits
                                table_data = self._process_table_data(df, table_format, max_rows, summary_only)

                                table_info = {
                                    "table_index": len(extracted_tables) + 1,
                                    "page": page_num + 1,
                                    "total_rows": len(df),
                                    "columns": len(df.columns),
                                }

                                # Only include data if not summary_only
                                if not summary_only:
                                    table_info["data"] = table_data
                                    if max_rows and len(df) > max_rows:
                                        table_info["rows_returned"] = max_rows
                                        table_info["rows_truncated"] = len(df) - max_rows
                                    else:
                                        table_info["rows_returned"] = len(df)

                                extracted_tables.append(table_info)

            return {"tables": extracted_tables}

        return await asyncio.get_running_loop().run_in_executor(None, extract_pdfplumber)
    async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str,
                                   max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
        """Extract tables using Tabula (Java-based, good for complex layouts)"""
        def extract_tabula():
            pages_param = pages if pages else "all"

            # Read tables with tabula
            tables = tabula.read_pdf(str(path), pages=pages_param, multiple_tables=True)

            extracted_tables = []
            for i, df in enumerate(tables):
                if not df.empty:
                    # Process table data with limits
                    table_data = self._process_table_data(df, table_format, max_rows, summary_only)

                    table_info = {
                        "table_index": i + 1,
                        "page": None,  # Tabula doesn't expose page numbers directly
                        "total_rows": len(df),
                        "columns": len(df.columns),
                    }

                    # Only include data if not summary_only
                    if not summary_only:
                        table_info["data"] = table_data
                        if max_rows and len(df) > max_rows:
                            table_info["rows_returned"] = max_rows
                            table_info["rows_truncated"] = len(df) - max_rows
                        else:
                            table_info["rows_returned"] = len(df)

                    extracted_tables.append(table_info)

            return {"tables": extracted_tables}

        return await asyncio.get_running_loop().run_in_executor(None, extract_tabula)
    def _get_page_range(self, pdf, pages: Optional[str]) -> List[int]:
        """Convert a comma-separated, 1-based pages string (as produced by
        _parse_pages_parameter) to a list of 0-based page indices."""
        if not pages:
            return list(range(len(pdf.pages)))

        try:
            if ',' in pages:
                return [int(p.strip()) - 1 for p in pages.split(',')]
            else:
                return [int(pages.strip()) - 1]
        except ValueError:
            # Fall back to all pages if the string can't be parsed
            return list(range(len(pdf.pages)))