mcp-pdf-tools/pyproject.toml
Ryan Malloy dfbf3d1870 🔧 v2.0.7: Fix table extraction token overflow with smart limiting
PROBLEM:
Table extraction from large PDFs was exceeding MCP's 25,000 token limit,
causing "response too large" errors. A 5-page PDF with large tables
generated 59,005 tokens, more than double the allowed limit.

SOLUTION:
Added flexible table data limiting with two new parameters:
- max_rows_per_table: Limit rows returned per table (prevents overflow)
- summary_only: Return only metadata without table data

IMPLEMENTATION:
1. Added new parameters to extract_tables() method signature
2. Created _process_table_data() helper for consistent limiting logic
3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
4. Enhanced table metadata with truncation tracking:
   - total_rows: Full row count from PDF
   - rows_returned: Actual rows in response (after limiting)
   - rows_truncated: Number of rows omitted (if limited)

USAGE EXAMPLES:
# Summary mode - metadata only (smallest response)
extract_tables(pdf_path, pages="1-5", summary_only=True)

# Limited data - first 100 rows per table
extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)

# Full data (default behavior, may overflow on large tables)
extract_tables(pdf_path, pages="1-5")

BENEFITS:
- Prevents MCP token overflow errors
- Maintains backward compatibility (new params are optional)
- Clear guidance through metadata (shows when truncation occurred)
- Flexible - users choose between summary/limited/full modes

FILES MODIFIED:
- src/mcp_pdf/mixins_official/table_extraction.py (all changes)
- src/mcp_pdf/server.py (version bump to 2.0.7)
- pyproject.toml (version bump to 2.0.7)

VERSION: 2.0.7
PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
2025-11-03 18:26:34 -07:00

115 lines
2.7 KiB
TOML

# Core package metadata (PEP 621).
[project]
name = "mcp-pdf"
version = "2.0.7"
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
authors = [
    { name = "Ryan Malloy", email = "ryan@malloys.us" },
]
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.10"
# Search keywords published with the package metadata.
keywords = [
    "mcp",
    "fastmcp",
    "pdf",
    "ocr",
    "text-extraction",
    "table-extraction",
    "pdf-processing",
    "api",
    "integration",
]
# Trove classifiers: maturity, audience, license, supported Python
# versions, and topics.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing :: General",
    "Topic :: Office/Business",
]
# Runtime requirements (PEP 508 strings), installed with the package.
dependencies = [
    "fastmcp>=0.1.0",
    "httpx>=0.25.0",
    "pydantic>=2.0.0",
    "python-dotenv>=1.0.0",
    "PyMuPDF>=1.23.0",
    "pdfplumber>=0.10.0",
    # The "cv" extra includes opencv-python.
    "camelot-py[cv]>=0.11.0",
    "tabula-py>=2.8.0",
    "pytesseract>=0.3.10",
    "pdf2image>=1.16.0",
    "pypdf>=6.0.0",
    "pandas>=2.0.0",
    "Pillow>=10.0.0",
    "markdown>=3.5.0",
]
# Project links published with the package metadata; all point at the
# rsp2k/mcp-pdf GitHub repository.
[project.urls]
Homepage = "https://github.com/rsp2k/mcp-pdf"
Documentation = "https://github.com/rsp2k/mcp-pdf#readme"
Repository = "https://github.com/rsp2k/mcp-pdf.git"
Issues = "https://github.com/rsp2k/mcp-pdf/issues"
Changelog = "https://github.com/rsp2k/mcp-pdf/releases"
# Console commands created at install time. Each maps a command name to
# the `main` callable of a module in the `mcp_pdf` package.
[project.scripts]
mcp-pdf = "mcp_pdf.server:main"
mcp-pdf-legacy = "mcp_pdf.server_legacy:main"
mcp-pdf-modular = "mcp_pdf.server_refactored:main"
# Extras, installable as `mcp-pdf[<name>]`.
[project.optional-dependencies]

# Form creation features (create_form_pdf, advanced form tools).
forms = [
    "reportlab>=4.0.0",
]

# Every optional feature; currently the same requirement set as "forms".
all = [
    "reportlab>=4.0.0",
]

# Tooling for testing, formatting, linting, type checking, packaging,
# and dependency auditing.
dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.0.0",
    "build>=0.10.0",
    "twine>=4.0.0",
    "safety>=3.0.0",
    "pip-audit>=2.0.0",
]
# PEP 517 build configuration: build frontends install `hatchling` and
# call its `hatchling.build` backend to produce the sdist and wheel.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# pytest settings.
[tool.pytest.ini_options]
# Read by the pytest-asyncio plugin (listed in the dev dependencies):
# in "auto" mode async tests do not need an explicit asyncio marker.
asyncio_mode = "auto"
# Always run verbosely with shortened tracebacks.
addopts = "-v --tb=short"
# Collect tests from the tests/ directory only.
testpaths = ["tests"]
# File-name patterns treated as test modules.
python_files = ["test_*.py", "*_test.py"]
# Files shipped in the source distribution.
#
# Hatchling reads its build settings from the `tool.hatch` namespace; a
# table under `tool.hatchling` is silently ignored, so the include list
# must live here to take effect.
[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples",
    "README.md",
    "LICENSE",
    # NOTE(review): Hatchling does not consume MANIFEST.in itself; this
    # entry only ships the file if it exists. Confirm it is still wanted.
    "MANIFEST.in",
]
# PEP 735 dependency groups: development-only requirements that are not
# published as package extras. This "dev" group is separate from the
# "dev" extra under [project.optional-dependencies] and pins newer
# minimum versions.
[dependency-groups]
dev = [
    "pip-audit>=2.9.0",
    "pytest>=8.4.1",
    "pytest-asyncio>=1.1.0",
    "pytest-cov>=6.2.1",
    "reportlab>=4.4.3",
    "safety>=3.2.11",
    "twine>=6.1.0",
]