🔧 v2.0.7: Fix table extraction token overflow with smart limiting

PROBLEM:
Table extraction from large PDFs was exceeding MCP's 25,000 token limit,
causing "response too large" errors. A 5-page PDF with large tables
generated 59,005 tokens, more than double the allowed limit.

SOLUTION:
Added flexible table data limiting with two new parameters:
- max_rows_per_table: Limit rows returned per table (prevents overflow)
- summary_only: Return only metadata without table data

IMPLEMENTATION:
1. Added new parameters to extract_tables() method signature
2. Created _process_table_data() helper for consistent limiting logic
3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
4. Enhanced table metadata with truncation tracking (example after this list):
   - total_rows: Full row count from PDF
   - rows_returned: Actual rows in response (after limiting)
   - rows_truncated: Number of rows omitted (if limited)
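
Example of a truncated table entry in the response metadata (illustrative
values only; field names follow the table_info dict in the diff below):
  {
      "table_index": 1,
      "page": 3,
      "total_rows": 1500,
      "columns": 6,
      "data": [...],          # first 100 rows, in the requested table_format
      "rows_returned": 100,
      "rows_truncated": 1400
  }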

USAGE EXAMPLES:
# Summary mode - metadata only (smallest response)
extract_tables(pdf_path, pages="1-5", summary_only=True)

# Limited data - first 100 rows per table
extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)

# Full data (default behavior, may overflow on large tables)
extract_tables(pdf_path, pages="1-5")
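
# Suggested two-step pattern for very large PDFs (a sketch using only the
# parameters above; per-table sizes come back in total_rows):
summary = extract_tables(pdf_path, pages="1-5", summary_only=True)
tables = extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)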

BENEFITS:
- Prevents MCP token overflow errors
- Maintains backward compatibility (new params are optional)
- Clear guidance through metadata (shows when truncation occurred)
- Flexible - users choose between summary/limited/full modes

FILES MODIFIED:
- src/mcp_pdf/mixins_official/table_extraction.py (all changes)
- src/mcp_pdf/server.py (version bump to 2.0.7)
- pyproject.toml (version bump to 2.0.7)

VERSION: 2.0.7
PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
Author: Ryan Malloy
Date: 2025-11-03 18:26:34 -07:00
Commit: dfbf3d1870 (parent fa65fa6e0c)
4 changed files with 88 additions and 47 deletions

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "2.0.6"
+version = "2.0.7"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"

--- a/src/mcp_pdf/mixins_official/table_extraction.py
+++ b/src/mcp_pdf/mixins_official/table_extraction.py
@@ -44,7 +44,9 @@ class TableExtractionMixin(MCPMixin):
         pdf_path: str,
         pages: Optional[str] = None,
         method: str = "auto",
-        table_format: str = "json"
+        table_format: str = "json",
+        max_rows_per_table: Optional[int] = None,
+        summary_only: bool = False
     ) -> Dict[str, Any]:
         """
         Extract tables from PDF using intelligent method selection.
@@ -54,6 +56,8 @@ class TableExtractionMixin(MCPMixin):
             pages: Page numbers to extract (comma-separated, 1-based), None for all
             method: Extraction method ("auto", "camelot", "pdfplumber", "tabula")
             table_format: Output format ("json", "csv", "html")
+            max_rows_per_table: Maximum rows to return per table (prevents token overflow)
+            summary_only: Return only table metadata without data (useful for large tables)

         Returns:
             Dictionary containing extracted tables and metadata
@@ -80,11 +84,11 @@ class TableExtractionMixin(MCPMixin):
                 logger.info(f"Attempting table extraction with {extraction_method}")

                 if extraction_method == "camelot":
-                    result = await self._extract_with_camelot(path, parsed_pages, table_format)
+                    result = await self._extract_with_camelot(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                 elif extraction_method == "pdfplumber":
-                    result = await self._extract_with_pdfplumber(path, parsed_pages, table_format)
+                    result = await self._extract_with_pdfplumber(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                 elif extraction_method == "tabula":
-                    result = await self._extract_with_tabula(path, parsed_pages, table_format)
+                    result = await self._extract_with_tabula(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                 else:
                     continue
@@ -129,6 +133,28 @@ class TableExtractionMixin(MCPMixin):
             }

     # Helper methods (synchronous)

+    def _process_table_data(self, df, table_format: str, max_rows: Optional[int], summary_only: bool) -> Any:
+        """Process table data with row limiting and summary options"""
+        if summary_only:
+            # Return None for data when in summary mode
+            return None
+
+        # Apply row limit if specified
+        if max_rows and len(df) > max_rows:
+            df_limited = df.head(max_rows)
+        else:
+            df_limited = df
+
+        # Convert to requested format
+        if table_format == "json":
+            return df_limited.to_dict('records')
+        elif table_format == "csv":
+            return df_limited.to_csv(index=False)
+        elif table_format == "html":
+            return df_limited.to_html(index=False)
+        else:
+            return df_limited.to_dict('records')
+
     def _parse_pages_parameter(self, pages: Optional[str]) -> Optional[str]:
         """Parse pages parameter for different extraction methods
@@ -151,7 +177,8 @@ class TableExtractionMixin(MCPMixin):
         except (ValueError, ImportError):
             return None

-    async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
+    async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str,
+                                    max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
         """Extract tables using Camelot (best for complex tables)"""
         import camelot
@@ -165,27 +192,32 @@ class TableExtractionMixin(MCPMixin):
             extracted_tables = []
             for i, table in enumerate(tables):
-                if table_format == "json":
-                    table_data = table.df.to_dict('records')
-                elif table_format == "csv":
-                    table_data = table.df.to_csv(index=False)
-                elif table_format == "html":
-                    table_data = table.df.to_html(index=False)
-                else:
-                    table_data = table.df.to_dict('records')
-
-                extracted_tables.append({
+                # Process table data with limits
+                table_data = self._process_table_data(table.df, table_format, max_rows, summary_only)
+
+                table_info = {
                     "table_index": i + 1,
                     "page": table.page,
                     "accuracy": round(table.accuracy, 2) if hasattr(table, 'accuracy') else None,
-                    "rows": len(table.df),
+                    "total_rows": len(table.df),
                     "columns": len(table.df.columns),
-                    "data": table_data
-                })
+                }
+
+                # Only include data if not summary_only
+                if not summary_only:
+                    table_info["data"] = table_data
+                    if max_rows and len(table.df) > max_rows:
+                        table_info["rows_returned"] = max_rows
+                        table_info["rows_truncated"] = len(table.df) - max_rows
+                    else:
+                        table_info["rows_returned"] = len(table.df)
+
+                extracted_tables.append(table_info)

             return {"tables": extracted_tables}

-    async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
+    async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str,
+                                       max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
         """Extract tables using pdfplumber (good for simple tables)"""
         import pdfplumber
@@ -204,28 +236,33 @@ class TableExtractionMixin(MCPMixin):
                         # Convert to DataFrame for consistent formatting
                         df = pd.DataFrame(table[1:], columns=table[0])

-                        if table_format == "json":
-                            table_data = df.to_dict('records')
-                        elif table_format == "csv":
-                            table_data = df.to_csv(index=False)
-                        elif table_format == "html":
-                            table_data = df.to_html(index=False)
-                        else:
-                            table_data = df.to_dict('records')
-
-                        extracted_tables.append({
+                        # Process table data with limits
+                        table_data = self._process_table_data(df, table_format, max_rows, summary_only)
+
+                        table_info = {
                             "table_index": len(extracted_tables) + 1,
                             "page": page_num + 1,
-                            "rows": len(df),
+                            "total_rows": len(df),
                             "columns": len(df.columns),
-                            "data": table_data
-                        })
+                        }
+
+                        # Only include data if not summary_only
+                        if not summary_only:
+                            table_info["data"] = table_data
+                            if max_rows and len(df) > max_rows:
+                                table_info["rows_returned"] = max_rows
+                                table_info["rows_truncated"] = len(df) - max_rows
+                            else:
+                                table_info["rows_returned"] = len(df)
+
+                        extracted_tables.append(table_info)

             return {"tables": extracted_tables}

         return await asyncio.get_event_loop().run_in_executor(None, extract_pdfplumber)

-    async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
+    async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str,
+                                   max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
         """Extract tables using Tabula (Java-based, good for complex layouts)"""
         import tabula
@@ -238,22 +275,26 @@ class TableExtractionMixin(MCPMixin):
             extracted_tables = []
             for i, df in enumerate(tables):
                 if not df.empty:
-                    if table_format == "json":
-                        table_data = df.to_dict('records')
-                    elif table_format == "csv":
-                        table_data = df.to_csv(index=False)
-                    elif table_format == "html":
-                        table_data = df.to_html(index=False)
-                    else:
-                        table_data = df.to_dict('records')
-
-                    extracted_tables.append({
+                    # Process table data with limits
+                    table_data = self._process_table_data(df, table_format, max_rows, summary_only)
+
+                    table_info = {
                         "table_index": i + 1,
                         "page": None,  # Tabula doesn't provide page info easily
-                        "rows": len(df),
+                        "total_rows": len(df),
                         "columns": len(df.columns),
-                        "data": table_data
-                    })
+                    }
+
+                    # Only include data if not summary_only
+                    if not summary_only:
+                        table_info["data"] = table_data
+                        if max_rows and len(df) > max_rows:
+                            table_info["rows_returned"] = max_rows
+                            table_info["rows_truncated"] = len(df) - max_rows
+                        else:
+                            table_info["rows_returned"] = len(df)
+
+                    extracted_tables.append(table_info)

             return {"tables": extracted_tables}

--- a/src/mcp_pdf/server.py
+++ b/src/mcp_pdf/server.py
@@ -105,7 +105,7 @@ class PDFServerOfficial:
         """Get detailed server information including mixins and configuration"""
         return {
             "server_name": "MCP PDF Tools (Official FastMCP Pattern)",
-            "version": "2.0.6",
+            "version": "2.0.7",
             "architecture": "Official FastMCP Mixin Pattern",
             "total_mixins": len(self.mixins),
             "mixins": [
@@ -160,7 +160,7 @@ def main():
         from importlib.metadata import version
         package_version = version("mcp-pdf")
     except:
-        package_version = "2.0.6"
+        package_version = "2.0.7"

     logger.info(f"🎬 MCP PDF Tools Server v{package_version} (Official Pattern)")

uv.lock (generated):

--- a/uv.lock
+++ b/uv.lock
@@ -1032,7 +1032,7 @@ wheels = [
 [[package]]
 name = "mcp-pdf"
-version = "2.0.6"
+version = "2.0.7"
 source = { editable = "." }
 dependencies = [
     { name = "camelot-py", extra = ["cv"] },