From dfbf3d187071e1f51c9dd593b4b04c2674dda716 Mon Sep 17 00:00:00 2001
From: Ryan Malloy <ryan@malloys.us>
Date: Mon, 3 Nov 2025 18:26:34 -0700
Subject: [PATCH] 🔧 v2.0.7: Fix table extraction token overflow with smart limiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PROBLEM:
Table extraction from large PDFs was exceeding MCP's 25,000 token limit,
causing "response too large" errors. A 5-page PDF with large tables generated
59,005 tokens, more than double the allowed limit.

SOLUTION:
Added flexible table data limiting with two new parameters:
- max_rows_per_table: Limit rows returned per table (prevents overflow)
- summary_only: Return only metadata without table data

IMPLEMENTATION:
1. Added new parameters to extract_tables() method signature
2. Created _process_table_data() helper for consistent limiting logic
3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
4. Enhanced table metadata with truncation tracking:
   - total_rows: Full row count from PDF
   - rows_returned: Actual rows in response (after limiting)
   - rows_truncated: Number of rows omitted (if limited)

USAGE EXAMPLES:
# Summary mode - metadata only (smallest response)
extract_tables(pdf_path, pages="1-5", summary_only=True)

# Limited data - first 100 rows per table
extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)

# Full data (default behavior, may overflow on large tables)
extract_tables(pdf_path, pages="1-5")

BENEFITS:
- Prevents MCP token overflow errors
- Maintains backward compatibility (new params are optional)
- Clear guidance through metadata (shows when truncation occurred)
- Flexible - users choose between summary/limited/full modes

FILES MODIFIED:
- src/mcp_pdf/mixins_official/table_extraction.py (all changes)
- src/mcp_pdf/server.py (version bump to 2.0.7)
- pyproject.toml (version bump to 2.0.7)

VERSION: 2.0.7
PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
---
 pyproject.toml                           |   2 +-
 .../mixins_official/table_extraction.py  | 127 ++++++++++++------
 src/mcp_pdf/server.py                    |   4 +-
 uv.lock                                  |   2 +-
 4 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f836731..1ef0146 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-pdf"
-version = "2.0.6"
+version = "2.0.7"
 description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
 authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
 readme = "README.md"
diff --git a/src/mcp_pdf/mixins_official/table_extraction.py b/src/mcp_pdf/mixins_official/table_extraction.py
index 20f21d5..fa1fc53 100644
--- a/src/mcp_pdf/mixins_official/table_extraction.py
+++ b/src/mcp_pdf/mixins_official/table_extraction.py
@@ -44,7 +44,9 @@ class TableExtractionMixin(MCPMixin):
         pdf_path: str,
         pages: Optional[str] = None,
         method: str = "auto",
-        table_format: str = "json"
+        table_format: str = "json",
+        max_rows_per_table: Optional[int] = None,
+        summary_only: bool = False
     ) -> Dict[str, Any]:
         """
         Extract tables from PDF using intelligent method selection.
@@ -54,6 +56,8 @@ class TableExtractionMixin(MCPMixin):
             pages: Page numbers to extract (comma-separated, 1-based), None for all
             method: Extraction method ("auto", "camelot", "pdfplumber", "tabula")
             table_format: Output format ("json", "csv", "html")
+            max_rows_per_table: Maximum rows to return per table (prevents token overflow)
+            summary_only: Return only table metadata without data (useful for large tables)

         Returns:
             Dictionary containing extracted tables and metadata
@@ -80,11 +84,11 @@ class TableExtractionMixin(MCPMixin):
                 logger.info(f"Attempting table extraction with {extraction_method}")

                 if extraction_method == "camelot":
-                    result = await self._extract_with_camelot(path, parsed_pages, table_format)
+                    result = await self._extract_with_camelot(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                 elif extraction_method == "pdfplumber":
-                    result = await self._extract_with_pdfplumber(path, parsed_pages, table_format)
+                    result = await self._extract_with_pdfplumber(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                 elif extraction_method == "tabula":
-                    result = await self._extract_with_tabula(path, parsed_pages, table_format)
+                    result = await self._extract_with_tabula(path, parsed_pages, table_format, max_rows_per_table, summary_only)
                 else:
                     continue

@@ -129,6 +133,28 @@ class TableExtractionMixin(MCPMixin):
         }

     # Helper methods (synchronous)
+    def _process_table_data(self, df, table_format: str, max_rows: Optional[int], summary_only: bool) -> Any:
+        """Process table data with row limiting and summary options"""
+        if summary_only:
+            # Return None for data when in summary mode
+            return None
+
+        # Apply row limit if specified
+        if max_rows and len(df) > max_rows:
+            df_limited = df.head(max_rows)
+        else:
+            df_limited = df
+
+        # Convert to requested format
+        if table_format == "json":
+            return df_limited.to_dict('records')
+        elif table_format == "csv":
+            return df_limited.to_csv(index=False)
+        elif table_format == "html":
+            return df_limited.to_html(index=False)
+        else:
+            return df_limited.to_dict('records')
+
     def _parse_pages_parameter(self, pages: Optional[str]) -> Optional[str]:
         """Parse pages parameter for different extraction methods

@@ -151,7 +177,8 @@ class TableExtractionMixin(MCPMixin):
         except (ValueError, ImportError):
             return None

-    async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
+    async def _extract_with_camelot(self, path: Path, pages: Optional[str], table_format: str,
+                                    max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
         """Extract tables using Camelot (best for complex tables)"""
         import camelot

@@ -165,27 +192,32 @@ class TableExtractionMixin(MCPMixin):
         extracted_tables = []

         for i, table in enumerate(tables):
-            if table_format == "json":
-                table_data = table.df.to_dict('records')
-            elif table_format == "csv":
-                table_data = table.df.to_csv(index=False)
-            elif table_format == "html":
-                table_data = table.df.to_html(index=False)
-            else:
-                table_data = table.df.to_dict('records')
+            # Process table data with limits
+            table_data = self._process_table_data(table.df, table_format, max_rows, summary_only)

-            extracted_tables.append({
+            table_info = {
                 "table_index": i + 1,
                 "page": table.page,
                 "accuracy": round(table.accuracy, 2) if hasattr(table, 'accuracy') else None,
-                "rows": len(table.df),
+                "total_rows": len(table.df),
                 "columns": len(table.df.columns),
-                "data": table_data
-            })
+            }
+
+            # Only include data if not summary_only
+            if not summary_only:
+                table_info["data"] = table_data
+            if max_rows and len(table.df) > max_rows:
+                table_info["rows_returned"] = max_rows
+                table_info["rows_truncated"] = len(table.df) - max_rows
+            else:
+                table_info["rows_returned"] = len(table.df)
+
+            extracted_tables.append(table_info)

         return {"tables": extracted_tables}

-    async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
+    async def _extract_with_pdfplumber(self, path: Path, pages: Optional[str], table_format: str,
+                                       max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
         """Extract tables using pdfplumber (good for simple tables)"""
         import pdfplumber

@@ -204,28 +236,33 @@ class TableExtractionMixin(MCPMixin):
                     # Convert to DataFrame for consistent formatting
                     df = pd.DataFrame(table[1:], columns=table[0])

-                    if table_format == "json":
-                        table_data = df.to_dict('records')
-                    elif table_format == "csv":
-                        table_data = df.to_csv(index=False)
-                    elif table_format == "html":
-                        table_data = df.to_html(index=False)
-                    else:
-                        table_data = df.to_dict('records')
+                    # Process table data with limits
+                    table_data = self._process_table_data(df, table_format, max_rows, summary_only)

-                    extracted_tables.append({
+                    table_info = {
                         "table_index": len(extracted_tables) + 1,
                         "page": page_num + 1,
-                        "rows": len(df),
+                        "total_rows": len(df),
                         "columns": len(df.columns),
-                        "data": table_data
-                    })
+                    }
+
+                    # Only include data if not summary_only
+                    if not summary_only:
+                        table_info["data"] = table_data
+                    if max_rows and len(df) > max_rows:
+                        table_info["rows_returned"] = max_rows
+                        table_info["rows_truncated"] = len(df) - max_rows
+                    else:
+                        table_info["rows_returned"] = len(df)
+
+                    extracted_tables.append(table_info)

             return {"tables": extracted_tables}

         return await asyncio.get_event_loop().run_in_executor(None, extract_pdfplumber)

-    async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str) -> Dict[str, Any]:
+    async def _extract_with_tabula(self, path: Path, pages: Optional[str], table_format: str,
+                                   max_rows: Optional[int], summary_only: bool) -> Dict[str, Any]:
         """Extract tables using Tabula (Java-based, good for complex layouts)"""
         import tabula

@@ -238,22 +275,26 @@ class TableExtractionMixin(MCPMixin):
         extracted_tables = []
         for i, df in enumerate(tables):
             if not df.empty:
-                if table_format == "json":
-                    table_data = df.to_dict('records')
-                elif table_format == "csv":
-                    table_data = df.to_csv(index=False)
-                elif table_format == "html":
-                    table_data = df.to_html(index=False)
-                else:
-                    table_data = df.to_dict('records')
+                # Process table data with limits
+                table_data = self._process_table_data(df, table_format, max_rows, summary_only)

-                extracted_tables.append({
+                table_info = {
                     "table_index": i + 1,
                     "page": None,  # Tabula doesn't provide page info easily
-                    "rows": len(df),
+                    "total_rows": len(df),
                     "columns": len(df.columns),
-                    "data": table_data
-                })
+                }
+
+                # Only include data if not summary_only
+                if not summary_only:
+                    table_info["data"] = table_data
+                if max_rows and len(df) > max_rows:
+                    table_info["rows_returned"] = max_rows
+                    table_info["rows_truncated"] = len(df) - max_rows
+                else:
+                    table_info["rows_returned"] = len(df)
+
+                extracted_tables.append(table_info)

         return {"tables": extracted_tables}

diff --git a/src/mcp_pdf/server.py b/src/mcp_pdf/server.py
index 919e5c5..50b0ad4 100644
--- a/src/mcp_pdf/server.py
+++ b/src/mcp_pdf/server.py
@@ -105,7 +105,7 @@ class PDFServerOfficial:
         """Get detailed server information including mixins and configuration"""
         return {
             "server_name": "MCP PDF Tools (Official FastMCP Pattern)",
Pattern)", - "version": "2.0.6", + "version": "2.0.7", "architecture": "Official FastMCP Mixin Pattern", "total_mixins": len(self.mixins), "mixins": [ @@ -160,7 +160,7 @@ def main(): from importlib.metadata import version package_version = version("mcp-pdf") except: - package_version = "2.0.6" + package_version = "2.0.7" logger.info(f"🎬 MCP PDF Tools Server v{package_version} (Official Pattern)") diff --git a/uv.lock b/uv.lock index 313455f..8936292 100644 --- a/uv.lock +++ b/uv.lock @@ -1032,7 +1032,7 @@ wheels = [ [[package]] name = "mcp-pdf" -version = "2.0.6" +version = "2.0.7" source = { editable = "." } dependencies = [ { name = "camelot-py", extra = ["cv"] },