# PROBLEM:
#   Table extraction from large PDFs was exceeding MCP's 25,000-token limit,
#   causing "response too large" errors. A 5-page PDF with large tables
#   generated 59,005 tokens, more than double the allowed limit.
#
# SOLUTION:
#   Added flexible table data limiting with two new parameters:
#   - max_rows_per_table: Limit rows returned per table (prevents overflow)
#   - summary_only: Return only metadata without table data
#
# IMPLEMENTATION:
#   1. Added new parameters to the extract_tables() method signature
#   2. Created the _process_table_data() helper for consistent limiting logic
#   3. Updated all 3 extraction methods (Camelot, pdfplumber, Tabula)
#   4. Enhanced table metadata with truncation tracking:
#      - total_rows: Full row count from the PDF
#      - rows_returned: Actual rows in the response (after limiting)
#      - rows_truncated: Number of rows omitted (if limited)
#
# USAGE EXAMPLES:
#   # Summary mode - metadata only (smallest response)
#   extract_tables(pdf_path, pages="1-5", summary_only=True)
#
#   # Limited data - first 100 rows per table
#   extract_tables(pdf_path, pages="1-5", max_rows_per_table=100)
#
#   # Full data (default behavior, may overflow on large tables)
#   extract_tables(pdf_path, pages="1-5")
#
# BENEFITS:
#   - Prevents MCP token overflow errors
#   - Maintains backward compatibility (new params are optional)
#   - Clear guidance through metadata (shows when truncation occurred)
#   - Flexible - users choose between summary/limited/full modes
#
# FILES MODIFIED:
#   - src/mcp_pdf/mixins_official/table_extraction.py (all changes)
#   - src/mcp_pdf/server.py (version bump to 2.0.7)
#   - pyproject.toml (version bump to 2.0.7)
#
# VERSION: 2.0.7
# PUBLISHED: https://pypi.org/project/mcp-pdf/2.0.7/
#
# File view: 115 lines, 2.7 KiB, TOML
# Core package metadata.
[project]
name = "mcp-pdf"
version = "2.0.7"
description = "Secure FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, forms, annotations, and more"
authors = [{name = "Ryan Malloy", email = "ryan@malloys.us"}]
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.10"
keywords = [
    "mcp",
    "fastmcp",
    "pdf",
    "ocr",
    "text-extraction",
    "table-extraction",
    "pdf-processing",
    "api",
    "integration",
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing :: General",
    "Topic :: Office/Business",
]
# Runtime requirements installed with the base package.
dependencies = [
    "fastmcp>=0.1.0",
    "httpx>=0.25.0",
    "pydantic>=2.0.0",
    "python-dotenv>=1.0.0",
    "PyMuPDF>=1.23.0",
    "pdfplumber>=0.10.0",
    "camelot-py[cv]>=0.11.0",  # includes opencv-python
    "tabula-py>=2.8.0",
    "pytesseract>=0.3.10",
    "pdf2image>=1.16.0",
    "pypdf>=6.0.0",
    "pandas>=2.0.0",
    "Pillow>=10.0.0",
    "markdown>=3.5.0",
]

# Project links published alongside the package metadata.
[project.urls]
Homepage = "https://github.com/rsp2k/mcp-pdf"
Documentation = "https://github.com/rsp2k/mcp-pdf#readme"
Repository = "https://github.com/rsp2k/mcp-pdf.git"
Issues = "https://github.com/rsp2k/mcp-pdf/issues"
Changelog = "https://github.com/rsp2k/mcp-pdf/releases"

# Console entry points; each maps a command name to a `module:function` target.
[project.scripts]
mcp-pdf = "mcp_pdf.server:main"
mcp-pdf-legacy = "mcp_pdf.server_legacy:main"
mcp-pdf-modular = "mcp_pdf.server_refactored:main"

[project.optional-dependencies]
# Form creation features (create_form_pdf, advanced form tools)
forms = [
    "reportlab>=4.0.0",
]

# All optional features
all = [
    "reportlab>=4.0.0",
]

# Development dependencies
# NOTE(review): a `dev` group is also declared under [dependency-groups] with
# different version floors; confirm which one is authoritative.
dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0.0",
    "ruff>=0.1.0",
    "mypy>=1.0.0",
    "build>=0.10.0",
    "twine>=4.0.0",
    "safety>=3.0.0",
    "pip-audit>=2.0.0",
]

# Build backend declaration: the package is built with Hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# pytest configuration.
[tool.pytest.ini_options]
asyncio_mode = "auto"
addopts = "-v --tb=short"
testpaths = ["tests"]
python_files = ["test_*.py", "*_test.py"]

# Source-distribution contents. Hatchling reads its settings from the
# `tool.hatch` namespace; a `tool.hatchling` table is not consulted, so the
# include list must live under `[tool.hatch.build.targets.sdist]` to apply.
[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples",
    "README.md",
    "LICENSE",
    "MANIFEST.in",
]

# Dependency groups (not published as package extras).
[dependency-groups]
dev = [
    "pip-audit>=2.9.0",
    "pytest>=8.4.1",
    "pytest-asyncio>=1.1.0",
    "pytest-cov>=6.2.1",
    "reportlab>=4.4.3",
    "safety>=3.2.11",
    "twine>=6.1.0",
]