🔧 v2.0.7: Fix table extraction token overflow with smart limiting
PROBLEM:
Table extraction from large PDFs was exceeding MCP's 25,000 token limit, causing "response too large" errors. A 5-page PDF with large tables generated 59,005 tokens — more than double the allowed limit.

SOLUTION:
Added flexible table data limiting with two new parameters:
- max_rows_per_table: Limit rows returned per table (prevents overflow)
- summary_only: Return only metadata without table data

IMPLEMENTATION:
1. Added new parameters to extract_tables() method signature
2. Created _process_table_data() helper for consistent limiting logic
3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
4. Enhanced table metadata with truncation tracking:
   - total_rows: Full row count from PDF
   - rows_returned: Actual rows in response (after limiting)
   - rows_truncated: Number of rows omitted (if limited)

USAGE EXAMPLES:
# Summary mode - metadata only (smallest response)
extract_tables(pdf_path, pages="1-5", summary_only=True)

# Limited data - first 100 rows per table
extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)

# Full data (default behavior, may overflow on large tables)
extract_tables(pdf_path, pages="1-5")

BENEFITS:
- Prevents MCP token overflow errors
- Maintains backward compatibility (new params are optional)
- Clear guidance through metadata (shows when truncation occurred)
- Flexible - users choose between summary/limited/full modes

FILES MODIFIED:
- src/mcp_pdf/mixins_official/table_extraction.py (all changes)
- src/mcp_pdf/server.py (version bump to 2.0.7)
- pyproject.toml (version bump to 2.0.7)

VERSION: 2.0.7
PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
This commit is contained in:
parent
fa65fa6e0c
commit
dfbf3d1870
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.0.6"
|
version = "2.0.7"
|
||||||
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
|
||||||
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|||||||
@ -44,7 +44,9 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
pdf_path: str,
|
pdf_path: str,
|
||||||
pages: Optional[str] = None,
|
pages: Optional[str] = None,
|
||||||
method: str = "auto",
|
method: str = "auto",
|
||||||
table_format: str = "json"
|
table_format: str = "json",
|
||||||
|
max_rows_per_table: Optional[int] = None,
|
||||||
|
summary_only: bool = False
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Extract tables from PDF using intelligent method selection.
|
Extract tables from PDF using intelligent method selection.
|
||||||
@ -54,6 +56,8 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
pages: Page numbers to extract (comma-separated, 1-based), None for all
|
pages: Page numbers to extract (comma-separated, 1-based), None for all
|
||||||
method: Extraction method ("auto", "camelot", "pdfplumber", "tabula")
|
method: Extraction method ("auto", "camelot", "pdfplumber", "tabula")
|
||||||
table_format: Output format ("json", "csv", "html")
|
table_format: Output format ("json", "csv", "html")
|
||||||
|
max_rows_per_table: Maximum rows to return per table (prevents token overflow)
|
||||||
|
summary_only: Return only table metadata without data (useful for large tables)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary containing extracted tables and metadata
|
Dictionary containing extracted tables and metadata
|
||||||
@ -80,11 +84,11 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
logger.info(f"Attempting table extraction with {extraction_method}")
|
logger.info(f"Attempting table extraction with {extraction_method}")
|
||||||
|
|
||||||
if extraction_method == "camelot":
|
if extraction_method == "camelot":
|
||||||
result = await self._extract_with_camelot(path, parsed_pages, table_format)
|
result = await self._extract_with_camelot(path, parsed_pages, table_format, max_rows_per_table, summary_only)
|
||||||
elif extraction_method == "pdfplumber":
|
elif extraction_method == "pdfplumber":
|
||||||
result = await self._extract_with_pdfplumber(path, parsed_pages, table_format)
|
result = await self._extract_with_pdfplumber(path, parsed_pages, table_format, max_rows_per_table, summary_only)
|
||||||
elif extraction_method == "tabula":
|
elif extraction_method == "tabula":
|
||||||
result = await self._extract_with_tabula(path, parsed_pages, table_format)
|
result = await self._extract_with_tabula(path, parsed_pages, table_format, max_rows_per_table, summary_only)
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -129,6 +133,28 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Helper methods (synchronous)
|
# Helper methods (synchronous)
|
||||||
|
def _process_table_data(self, df, table_format: str, max_rows: Optional[int], summary_only: bool) -> Any:
|
||||||
|
"""Process table data with row limiting and summary options"""
|
||||||
|
if summary_only:
|
||||||
|
# Return None for data when in summary mode
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Apply row limit if specified
|
||||||
|
if max_rows and len(df) > max_rows:
|
||||||
|
df_limited = df.head(max_rows)
|
||||||
|
else:
|
||||||
|
df_limited = df
|
||||||
|
|
||||||
|
# Convert to requested format
|
||||||
|
if table_format == "json":
|
||||||
|
return df_limited.to_dict('records')
|
||||||
|
elif table_format == "csv":
|
||||||
|
return df_limited.to_csv(index=False)
|
||||||
|
elif table_format == "html":
|
||||||
|
return df_limited.to_html(index=False)
|
||||||
|
else:
|
||||||
|
return df_limited.to_dict('records')
|
||||||
|
|
||||||
def _parse_pages_parameter(self, pages: Optional[str]) -> Optional[str]:
|
def _parse_pages_parameter(self, pages: Optional[str]) -> Optional[str]:
|
||||||
"""Parse pages parameter for different extraction methods
|
"""Parse pages parameter for different extraction methods
|
||||||
|
|
||||||
@ -151,7 +177,8 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
except (ValueError, ImportError):
|
except (ValueError, ImportError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
|
async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str,
|
||||||
|
max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
|
||||||
"""Extract tables using Camelot (best for complex tables)"""
|
"""Extract tables using Camelot (best for complex tables)"""
|
||||||
import camelot
|
import camelot
|
||||||
|
|
||||||
@ -165,27 +192,32 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
|
|
||||||
extracted_tables = []
|
extracted_tables = []
|
||||||
for i, table in enumerate(tables):
|
for i, table in enumerate(tables):
|
||||||
if table_format == "json":
|
# Process table data with limits
|
||||||
table_data = table.df.to_dict('records')
|
table_data = self._process_table_data(table.df, table_format, max_rows, summary_only)
|
||||||
elif table_format == "csv":
|
|
||||||
table_data = table.df.to_csv(index=False)
|
|
||||||
elif table_format == "html":
|
|
||||||
table_data = table.df.to_html(index=False)
|
|
||||||
else:
|
|
||||||
table_data = table.df.to_dict('records')
|
|
||||||
|
|
||||||
extracted_tables.append({
|
table_info = {
|
||||||
"table_index": i + 1,
|
"table_index": i + 1,
|
||||||
"page": table.page,
|
"page": table.page,
|
||||||
"accuracy": round(table.accuracy, 2) if hasattr(table, 'accuracy') else None,
|
"accuracy": round(table.accuracy, 2) if hasattr(table, 'accuracy') else None,
|
||||||
"rows": len(table.df),
|
"total_rows": len(table.df),
|
||||||
"columns": len(table.df.columns),
|
"columns": len(table.df.columns),
|
||||||
"data": table_data
|
}
|
||||||
})
|
|
||||||
|
# Only include data if not summary_only
|
||||||
|
if not summary_only:
|
||||||
|
table_info["data"] = table_data
|
||||||
|
if max_rows and len(table.df) > max_rows:
|
||||||
|
table_info["rows_returned"] = max_rows
|
||||||
|
table_info["rows_truncated"] = len(table.df) - max_rows
|
||||||
|
else:
|
||||||
|
table_info["rows_returned"] = len(table.df)
|
||||||
|
|
||||||
|
extracted_tables.append(table_info)
|
||||||
|
|
||||||
return {"tables": extracted_tables}
|
return {"tables": extracted_tables}
|
||||||
|
|
||||||
async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
|
async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str,
|
||||||
|
max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
|
||||||
"""Extract tables using pdfplumber (good for simple tables)"""
|
"""Extract tables using pdfplumber (good for simple tables)"""
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
|
||||||
@ -204,28 +236,33 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
# Convert to DataFrame for consistent formatting
|
# Convert to DataFrame for consistent formatting
|
||||||
df = pd.DataFrame(table[1:], columns=table[0])
|
df = pd.DataFrame(table[1:], columns=table[0])
|
||||||
|
|
||||||
if table_format == "json":
|
# Process table data with limits
|
||||||
table_data = df.to_dict('records')
|
table_data = self._process_table_data(df, table_format, max_rows, summary_only)
|
||||||
elif table_format == "csv":
|
|
||||||
table_data = df.to_csv(index=False)
|
|
||||||
elif table_format == "html":
|
|
||||||
table_data = df.to_html(index=False)
|
|
||||||
else:
|
|
||||||
table_data = df.to_dict('records')
|
|
||||||
|
|
||||||
extracted_tables.append({
|
table_info = {
|
||||||
"table_index": len(extracted_tables) + 1,
|
"table_index": len(extracted_tables) + 1,
|
||||||
"page": page_num + 1,
|
"page": page_num + 1,
|
||||||
"rows": len(df),
|
"total_rows": len(df),
|
||||||
"columns": len(df.columns),
|
"columns": len(df.columns),
|
||||||
"data": table_data
|
}
|
||||||
})
|
|
||||||
|
# Only include data if not summary_only
|
||||||
|
if not summary_only:
|
||||||
|
table_info["data"] = table_data
|
||||||
|
if max_rows and len(df) > max_rows:
|
||||||
|
table_info["rows_returned"] = max_rows
|
||||||
|
table_info["rows_truncated"] = len(df) - max_rows
|
||||||
|
else:
|
||||||
|
table_info["rows_returned"] = len(df)
|
||||||
|
|
||||||
|
extracted_tables.append(table_info)
|
||||||
|
|
||||||
return {"tables": extracted_tables}
|
return {"tables": extracted_tables}
|
||||||
|
|
||||||
return await asyncio.get_event_loop().run_in_executor(None, extract_pdfplumber)
|
return await asyncio.get_event_loop().run_in_executor(None, extract_pdfplumber)
|
||||||
|
|
||||||
async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
|
async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str,
|
||||||
|
max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
|
||||||
"""Extract tables using Tabula (Java-based, good for complex layouts)"""
|
"""Extract tables using Tabula (Java-based, good for complex layouts)"""
|
||||||
import tabula
|
import tabula
|
||||||
|
|
||||||
@ -238,22 +275,26 @@ class TableExtractionMixin(MCPMixin):
|
|||||||
extracted_tables = []
|
extracted_tables = []
|
||||||
for i, df in enumerate(tables):
|
for i, df in enumerate(tables):
|
||||||
if not df.empty:
|
if not df.empty:
|
||||||
if table_format == "json":
|
# Process table data with limits
|
||||||
table_data = df.to_dict('records')
|
table_data = self._process_table_data(df, table_format, max_rows, summary_only)
|
||||||
elif table_format == "csv":
|
|
||||||
table_data = df.to_csv(index=False)
|
|
||||||
elif table_format == "html":
|
|
||||||
table_data = df.to_html(index=False)
|
|
||||||
else:
|
|
||||||
table_data = df.to_dict('records')
|
|
||||||
|
|
||||||
extracted_tables.append({
|
table_info = {
|
||||||
"table_index": i + 1,
|
"table_index": i + 1,
|
||||||
"page": None, # Tabula doesn't provide page info easily
|
"page": None, # Tabula doesn't provide page info easily
|
||||||
"rows": len(df),
|
"total_rows": len(df),
|
||||||
"columns": len(df.columns),
|
"columns": len(df.columns),
|
||||||
"data": table_data
|
}
|
||||||
})
|
|
||||||
|
# Only include data if not summary_only
|
||||||
|
if not summary_only:
|
||||||
|
table_info["data"] = table_data
|
||||||
|
if max_rows and len(df) > max_rows:
|
||||||
|
table_info["rows_returned"] = max_rows
|
||||||
|
table_info["rows_truncated"] = len(df) - max_rows
|
||||||
|
else:
|
||||||
|
table_info["rows_returned"] = len(df)
|
||||||
|
|
||||||
|
extracted_tables.append(table_info)
|
||||||
|
|
||||||
return {"tables": extracted_tables}
|
return {"tables": extracted_tables}
|
||||||
|
|
||||||
|
|||||||
@ -105,7 +105,7 @@ class PDFServerOfficial:
|
|||||||
"""Get detailed server information including mixins and configuration"""
|
"""Get detailed server information including mixins and configuration"""
|
||||||
return {
|
return {
|
||||||
"server_name": "MCP PDF Tools (Official FastMCP Pattern)",
|
"server_name": "MCP PDF Tools (Official FastMCP Pattern)",
|
||||||
"version": "2.0.6",
|
"version": "2.0.7",
|
||||||
"architecture": "Official FastMCP Mixin Pattern",
|
"architecture": "Official FastMCP Mixin Pattern",
|
||||||
"total_mixins": len(self.mixins),
|
"total_mixins": len(self.mixins),
|
||||||
"mixins": [
|
"mixins": [
|
||||||
@ -160,7 +160,7 @@ def main():
|
|||||||
from importlib.metadata import version
|
from importlib.metadata import version
|
||||||
package_version = version("mcp-pdf")
|
package_version = version("mcp-pdf")
|
||||||
except:
|
except:
|
||||||
package_version = "2.0.6"
|
package_version = "2.0.7"
|
||||||
|
|
||||||
logger.info(f"🎬 MCP PDF Tools Server v{package_version} (Official Pattern)")
|
logger.info(f"🎬 MCP PDF Tools Server v{package_version} (Official Pattern)")
|
||||||
|
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@ -1032,7 +1032,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mcp-pdf"
|
name = "mcp-pdf"
|
||||||
version = "2.0.6"
|
version = "2.0.7"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camelot-py", extra = ["cv"] },
|
{ name = "camelot-py", extra = ["cv"] },
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user