Compare commits
2 Commits
dfc6fe1149
...
478ab41b1f
Author | SHA1 | Date | |
---|---|---|---|
478ab41b1f | |||
c902e81e4d |
39
.env.example
Normal file
39
.env.example
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
# MCP PDF Tools Configuration
|
||||||
|
|
||||||
|
# Tesseract OCR configuration
|
||||||
|
# Path to Tesseract data directory (for language files)
|
||||||
|
# Ubuntu/Debian: /usr/share/tesseract-ocr/5/tessdata
|
||||||
|
# macOS (Homebrew): /opt/homebrew/share/tessdata (Apple Silicon) or /usr/local/share/tessdata (Intel)
|
||||||
|
# Windows: C:\Program Files\Tesseract-OCR\tessdata
|
||||||
|
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
|
||||||
|
|
||||||
|
# Temporary directory for PDF processing
|
||||||
|
# Used for intermediate files during conversion
|
||||||
|
PDF_TEMP_DIR=/tmp/pdf_processing
|
||||||
|
|
||||||
|
# Poppler utilities path (for pdf2image)
|
||||||
|
# Only needed if poppler-utils is not in PATH
|
||||||
|
# Ubuntu/Debian: Usually in PATH
|
||||||
|
# macOS (Homebrew): /opt/homebrew/bin (Apple Silicon) or /usr/local/bin (Intel)
|
||||||
|
# Windows: C:\Program Files\poppler-0.68.0\bin
|
||||||
|
# POPPLER_PATH=/usr/local/bin
|
||||||
|
|
||||||
|
# Java home for Tabula (table extraction)
|
||||||
|
# Only needed if Java is not in PATH
|
||||||
|
# JAVA_HOME=/usr/lib/jvm/java-11-openjdk
|
||||||
|
|
||||||
|
# Debug mode
|
||||||
|
# Set to true for verbose logging
|
||||||
|
DEBUG=false
|
||||||
|
|
||||||
|
# Maximum file size in MB
|
||||||
|
# PDFs larger than this will be rejected
|
||||||
|
MAX_PDF_SIZE_MB=100
|
||||||
|
|
||||||
|
# Default DPI for PDF to image conversion
|
||||||
|
# Higher values = better quality but slower processing
|
||||||
|
DEFAULT_DPI=300
|
||||||
|
|
||||||
|
# Default OCR languages (comma-separated)
|
||||||
|
# Common codes: eng (English), fra (French), deu (German), spa (Spanish)
|
||||||
|
DEFAULT_OCR_LANGUAGES=eng
|
93
.gitignore
vendored
Normal file
93
.gitignore
vendored
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# IDEs
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# PDF test files
|
||||||
|
*.pdf
|
||||||
|
test_pdfs/
|
||||||
|
sample_pdfs/
|
||||||
|
|
||||||
|
# Temporary files
|
||||||
|
tmp/
|
||||||
|
temp/
|
||||||
|
*.tmp
|
||||||
|
|
||||||
|
# OCR output
|
||||||
|
tesseract_output/
|
||||||
|
ocr_results/
|
||||||
|
|
||||||
|
# Log files
|
||||||
|
*.log
|
||||||
|
logs/
|
128
CLAUDE.md
Normal file
128
CLAUDE.md
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
# CLAUDE.md
|
||||||
|
|
||||||
|
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||||
|
|
||||||
|
## Project Overview
|
||||||
|
|
||||||
|
MCP PDF Tools is a FastMCP server that provides comprehensive PDF processing capabilities including text extraction, table extraction, OCR, image extraction, and format conversion. The server is built on the FastMCP framework and provides intelligent method selection with automatic fallbacks.
|
||||||
|
|
||||||
|
## Development Commands
|
||||||
|
|
||||||
|
### Environment Setup
|
||||||
|
```bash
|
||||||
|
# Install with development dependencies
|
||||||
|
uv sync --dev
|
||||||
|
|
||||||
|
# Install system dependencies (Ubuntu/Debian)
|
||||||
|
sudo apt-get install tesseract-ocr tesseract-ocr-eng poppler-utils ghostscript python3-tk default-jre-headless
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
```bash
|
||||||
|
# Run all tests
|
||||||
|
uv run pytest
|
||||||
|
|
||||||
|
# Run with coverage
|
||||||
|
uv run pytest --cov=mcp_pdf_tools
|
||||||
|
|
||||||
|
# Run specific test file
|
||||||
|
uv run pytest tests/test_server.py
|
||||||
|
|
||||||
|
# Run specific test
|
||||||
|
uv run pytest tests/test_server.py::TestTextExtraction::test_extract_text_success
|
||||||
|
```
|
||||||
|
|
||||||
|
### Code Quality
|
||||||
|
```bash
|
||||||
|
# Format code
|
||||||
|
uv run black src/ tests/ examples/
|
||||||
|
|
||||||
|
# Lint code
|
||||||
|
uv run ruff check src/ tests/ examples/
|
||||||
|
|
||||||
|
# Type checking
|
||||||
|
uv run mypy src/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running the Server
|
||||||
|
```bash
|
||||||
|
# Run MCP server directly
|
||||||
|
uv run mcp-pdf-tools
|
||||||
|
|
||||||
|
# Verify installation
|
||||||
|
uv run python examples/verify_installation.py
|
||||||
|
|
||||||
|
# Test with sample PDF
|
||||||
|
uv run python examples/test_pdf_tools.py /path/to/test.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building and Distribution
|
||||||
|
```bash
|
||||||
|
# Build package
|
||||||
|
uv build
|
||||||
|
|
||||||
|
# Upload to PyPI (requires credentials)
|
||||||
|
uv publish
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Core Components
|
||||||
|
|
||||||
|
- **`src/mcp_pdf_tools/server.py`**: Main server implementation with all PDF processing tools
|
||||||
|
- **FastMCP Framework**: Uses FastMCP for MCP protocol implementation
|
||||||
|
- **Multi-library approach**: Integrates PyMuPDF, pdfplumber, pypdf, Camelot, Tabula, and Tesseract
|
||||||
|
|
||||||
|
### Tool Categories
|
||||||
|
|
||||||
|
1. **Text Extraction**: `extract_text` - Intelligent method selection (PyMuPDF, pdfplumber, pypdf)
|
||||||
|
2. **Table Extraction**: `extract_tables` - Auto-fallback through Camelot → pdfplumber → Tabula
|
||||||
|
3. **OCR Processing**: `ocr_pdf` - Tesseract with preprocessing options
|
||||||
|
4. **Document Analysis**: `is_scanned_pdf`, `get_document_structure`, `extract_metadata`
|
||||||
|
5. **Format Conversion**: `pdf_to_markdown` - Clean markdown with optional images
|
||||||
|
6. **Image Processing**: `extract_images` - Size filtering and format conversion
|
||||||
|
|
||||||
|
### Intelligent Fallbacks
|
||||||
|
|
||||||
|
The server implements smart fallback mechanisms:
|
||||||
|
- Text extraction automatically detects scanned PDFs and suggests OCR
|
||||||
|
- Table extraction tries multiple methods until tables are found
|
||||||
|
- All operations include comprehensive error handling with helpful hints
|
||||||
|
|
||||||
|
### Dependencies Management
|
||||||
|
|
||||||
|
Critical system dependencies:
|
||||||
|
- **Tesseract OCR**: Required for `ocr_pdf` functionality
|
||||||
|
- **Java**: Required for Tabula table extraction
|
||||||
|
- **Ghostscript**: Required for Camelot table extraction
|
||||||
|
- **Poppler**: Required for PDF to image conversion
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
Environment variables (optional):
|
||||||
|
- `TESSDATA_PREFIX`: Tesseract language data location
|
||||||
|
- `PDF_TEMP_DIR`: Temporary file processing directory
|
||||||
|
- `DEBUG`: Enable debug logging
|
||||||
|
|
||||||
|
## Development Notes
|
||||||
|
|
||||||
|
### Testing Strategy
|
||||||
|
- Comprehensive unit tests with mocked PDF libraries
|
||||||
|
- Test fixtures for consistent PDF document simulation
|
||||||
|
- Error handling tests for all major failure modes
|
||||||
|
- Server initialization and tool registration validation
|
||||||
|
|
||||||
|
### Tool Implementation Pattern
|
||||||
|
All tools follow this pattern:
|
||||||
|
1. Validate PDF path using `validate_pdf_path()`
|
||||||
|
2. Try primary method with intelligent selection
|
||||||
|
3. Implement fallbacks where applicable
|
||||||
|
4. Return structured results with metadata
|
||||||
|
5. Include timing information and method used
|
||||||
|
6. Provide helpful error messages with troubleshooting hints
|
||||||
|
|
||||||
|
### Docker Support
|
||||||
|
The project includes Docker support with all system dependencies pre-installed, useful for consistent cross-platform development and deployment.
|
||||||
|
|
||||||
|
### MCP Integration
|
||||||
|
Tools are registered using FastMCP decorators and follow MCP protocol standards for tool descriptions and parameter validation.
|
53
Dockerfile
Normal file
53
Dockerfile
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
# MCP PDF Tools Docker Image
|
||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
# PDF libraries
|
||||||
|
poppler-utils \
|
||||||
|
# OCR dependencies
|
||||||
|
tesseract-ocr \
|
||||||
|
tesseract-ocr-eng \
|
||||||
|
tesseract-ocr-fra \
|
||||||
|
tesseract-ocr-deu \
|
||||||
|
tesseract-ocr-spa \
|
||||||
|
# Image processing
|
||||||
|
libmagic1 \
|
||||||
|
# Java for Tabula
|
||||||
|
default-jre-headless \
|
||||||
|
# Build dependencies
|
||||||
|
gcc \
|
||||||
|
g++ \
|
||||||
|
python3-dev \
|
||||||
|
# Ghostscript for Camelot
|
||||||
|
ghostscript \
|
||||||
|
python3-tk \
|
||||||
|
# Clean up
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Set working directory
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy project files
|
||||||
|
COPY pyproject.toml README.md LICENSE MANIFEST.in ./
|
||||||
|
COPY src/ ./src/
|
||||||
|
COPY tests/ ./tests/
|
||||||
|
COPY examples/ ./examples/
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
RUN pip install --no-cache-dir -e .
|
||||||
|
|
||||||
|
# Create directory for PDF processing
|
||||||
|
RUN mkdir -p /tmp/pdf_processing
|
||||||
|
|
||||||
|
# Set environment variables
|
||||||
|
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
|
||||||
|
ENV PDF_TEMP_DIR=/tmp/pdf_processing
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
# Run the MCP server over stdio (no network port is exposed)
|
||||||
|
ENTRYPOINT ["python", "-m", "mcp_pdf_tools.server"]
|
||||||
|
|
||||||
|
# Health check
|
||||||
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||||
|
CMD python examples/verify_installation.py || exit 1
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 RPM
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
6
MANIFEST.in
Normal file
6
MANIFEST.in
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
include README.md
|
||||||
|
include LICENSE
|
||||||
|
include .env.example
|
||||||
|
recursive-include src *.py
|
||||||
|
recursive-include tests *.py
|
||||||
|
recursive-include examples *.py
|
113
QUICKSTART.md
Normal file
113
QUICKSTART.md
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
# Quick Start Guide
|
||||||
|
|
||||||
|
## 1. Installation
|
||||||
|
|
||||||
|
### Option A: Using UV (Recommended for Development)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/rpm/mcp-pdf-tools
|
||||||
|
cd mcp-pdf-tools
|
||||||
|
|
||||||
|
# Install with uv
|
||||||
|
uv sync
|
||||||
|
|
||||||
|
# Verify installation
|
||||||
|
uv run python examples/verify_installation.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option B: Using Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/rpm/mcp-pdf-tools
|
||||||
|
cd mcp-pdf-tools
|
||||||
|
|
||||||
|
# Build and run with Docker
|
||||||
|
docker-compose build
|
||||||
|
docker-compose run --rm mcp-pdf-tools python examples/verify_installation.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option C: From PyPI
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install mcp-pdf-tools
|
||||||
|
```
|
||||||
|
|
||||||
|
## 2. System Dependencies
|
||||||
|
|
||||||
|
### Ubuntu/Debian
|
||||||
|
```bash
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y \
|
||||||
|
tesseract-ocr \
|
||||||
|
tesseract-ocr-eng \
|
||||||
|
poppler-utils \
|
||||||
|
ghostscript \
|
||||||
|
python3-tk \
|
||||||
|
default-jre-headless
|
||||||
|
```
|
||||||
|
|
||||||
|
### macOS
|
||||||
|
```bash
|
||||||
|
brew install tesseract poppler ghostscript
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows
|
||||||
|
- Install Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
|
||||||
|
- Install Poppler: http://blog.alivate.com.au/poppler-windows/
|
||||||
|
- Install Ghostscript: https://www.ghostscript.com/download/gsdnld.html
|
||||||
|
- Install Java: https://www.java.com/download/
|
||||||
|
|
||||||
|
## 3. Claude Desktop Configuration
|
||||||
|
|
||||||
|
Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS), replacing `cwd` with the path to your clone:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"pdf-tools": {
|
||||||
|
"command": "uv",
|
||||||
|
"args": ["run", "mcp-pdf-tools"],
|
||||||
|
"cwd": "/path/to/mcp-pdf-tools"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. Test the Tools
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test with a sample PDF
|
||||||
|
uv run python examples/test_pdf_tools.py /path/to/your/document.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5. Common Issues
|
||||||
|
|
||||||
|
### OCR not working
|
||||||
|
- Check Tesseract is installed: `tesseract --version`
|
||||||
|
- Install language packs: `sudo apt-get install tesseract-ocr-[lang]`
|
||||||
|
|
||||||
|
### Table extraction failing
|
||||||
|
- Check Java is installed: `java -version`
|
||||||
|
- For Camelot issues, ensure Ghostscript is installed
|
||||||
|
|
||||||
|
### Large PDF issues
|
||||||
|
- Process specific pages: `pages=[0, 1, 2]`
|
||||||
|
- Increase the Java heap available to Tabula: `export JAVA_TOOL_OPTIONS="-Xmx2g"`
|
||||||
|
|
||||||
|
## 6. Example Usage in Claude
|
||||||
|
|
||||||
|
Once configured, you can ask Claude:
|
||||||
|
|
||||||
|
- "Extract text from the PDF at /path/to/document.pdf"
|
||||||
|
- "Check if /path/to/scan.pdf is a scanned document"
|
||||||
|
- "Extract all tables from /path/to/report.pdf and format as markdown"
|
||||||
|
- "Convert /path/to/document.pdf to markdown format"
|
||||||
|
- "Extract images from the first 5 pages of /path/to/presentation.pdf"
|
||||||
|
|
||||||
|
## Need Help?
|
||||||
|
|
||||||
|
- Check the full README.md for detailed documentation
|
||||||
|
- Run tests: `uv run pytest`
|
||||||
|
- Enable debug mode: Set `DEBUG=true` in your .env file
|
319
README.md
319
README.md
@ -1,3 +1,318 @@
|
|||||||
# mcp-pdf-tools
|
# MCP PDF Tools
|
||||||
|
|
||||||
MCP PDF Tools - Comprehensive PDF processing server for the Model Context Protocol with intelligent method selection and automatic fallbacks
|
A comprehensive FastMCP server for PDF processing operations. This server provides powerful tools for extracting text, tables, images, and metadata from PDFs, performing OCR on scanned documents, and converting PDFs to various formats.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Text Extraction**: Multiple methods (PyMuPDF, pdfplumber, pypdf) with automatic selection
|
||||||
|
- **Table Extraction**: Support for both bordered and borderless tables using Camelot, Tabula, and pdfplumber
|
||||||
|
- **OCR**: Process scanned PDFs with Tesseract OCR, including preprocessing for better results
|
||||||
|
- **Document Analysis**: Extract structure, metadata, and check if PDFs are scanned
|
||||||
|
- **Image Extraction**: Extract images with size filtering
|
||||||
|
- **Format Conversion**: Convert PDFs to clean Markdown format
|
||||||
|
- **Smart Detection**: Automatically detect the best method for each operation
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### Using uv (recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/rpm/mcp-pdf-tools
|
||||||
|
cd mcp-pdf-tools
|
||||||
|
|
||||||
|
# Install with uv
|
||||||
|
uv sync
|
||||||
|
|
||||||
|
# Install Tesseract OCR (required for OCR functionality)
|
||||||
|
# On Ubuntu/Debian:
|
||||||
|
sudo apt-get install tesseract-ocr tesseract-ocr-eng
|
||||||
|
|
||||||
|
# On macOS:
|
||||||
|
brew install tesseract
|
||||||
|
|
||||||
|
# On Windows:
|
||||||
|
# Download installer from: https://github.com/UB-Mannheim/tesseract/wiki
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using pip
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install mcp-pdf-tools
|
||||||
|
|
||||||
|
# Install system dependencies for OCR
|
||||||
|
# Same as above for Tesseract
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Claude Desktop Integration
|
||||||
|
|
||||||
|
Add to your Claude configuration (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"pdf-tools": {
|
||||||
|
"command": "uv",
|
||||||
|
"args": ["run", "mcp-pdf-tools"],
|
||||||
|
"cwd": "/path/to/mcp-pdf-tools"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Or if installed via pip:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"pdf-tools": {
|
||||||
|
"command": "mcp-pdf-tools"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Claude Code Integration
|
||||||
|
|
||||||
|
For development with Claude Code, add the MCP server from your local development directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
claude mcp add pdf-tools "uvx --from /path/to/mcp-pdf-tools mcp-pdf-tools"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
Create a `.env` file in your project directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Optional: Tesseract configuration
|
||||||
|
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
|
||||||
|
|
||||||
|
# Optional: Temporary file directory
|
||||||
|
PDF_TEMP_DIR=/tmp/pdf_processing
|
||||||
|
|
||||||
|
# Optional: Enable debug logging
|
||||||
|
DEBUG=true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Text Extraction
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Basic text extraction
|
||||||
|
result = await extract_text(
|
||||||
|
pdf_path="/path/to/document.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract specific pages with layout preservation
|
||||||
|
result = await extract_text(
|
||||||
|
pdf_path="/path/to/document.pdf",
|
||||||
|
pages=[0, 1, 2], # First 3 pages
|
||||||
|
preserve_layout=True,
|
||||||
|
method="pdfplumber" # Or "auto", "pymupdf", "pypdf"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Table Extraction
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Extract all tables
|
||||||
|
result = await extract_tables(
|
||||||
|
pdf_path="/path/to/document.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract tables from specific pages in markdown format
|
||||||
|
result = await extract_tables(
|
||||||
|
pdf_path="/path/to/document.pdf",
|
||||||
|
pages=[2, 3],
|
||||||
|
output_format="markdown" # Or "json", "csv"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### OCR for Scanned PDFs
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Basic OCR
|
||||||
|
result = await ocr_pdf(
|
||||||
|
pdf_path="/path/to/scanned.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# OCR with multiple languages and preprocessing
|
||||||
|
result = await ocr_pdf(
|
||||||
|
pdf_path="/path/to/scanned.pdf",
|
||||||
|
languages=["eng", "fra", "deu"],
|
||||||
|
preprocess=True,
|
||||||
|
dpi=300
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Document Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Check if PDF is scanned
|
||||||
|
result = await is_scanned_pdf(
|
||||||
|
pdf_path="/path/to/document.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get document structure and metadata
|
||||||
|
result = await get_document_structure(
|
||||||
|
pdf_path="/path/to/document.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract comprehensive metadata
|
||||||
|
result = await extract_metadata(
|
||||||
|
pdf_path="/path/to/document.pdf"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Format Conversion
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Convert to Markdown
|
||||||
|
result = await pdf_to_markdown(
|
||||||
|
pdf_path="/path/to/document.pdf",
|
||||||
|
include_images=True,
|
||||||
|
include_metadata=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Image Extraction
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Extract images with size filtering
|
||||||
|
result = await extract_images(
|
||||||
|
pdf_path="/path/to/document.pdf",
|
||||||
|
min_width=200,
|
||||||
|
min_height=200,
|
||||||
|
output_format="png" # Or "jpeg"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Available Tools
|
||||||
|
|
||||||
|
| Tool | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `extract_text` | Extract text with multiple methods and layout preservation |
|
||||||
|
| `extract_tables` | Extract tables in various formats (JSON, CSV, Markdown) |
|
||||||
|
| `ocr_pdf` | Perform OCR on scanned PDFs with preprocessing |
|
||||||
|
| `is_scanned_pdf` | Check if a PDF is scanned or text-based |
|
||||||
|
| `get_document_structure` | Extract document structure, outline, and basic metadata |
|
||||||
|
| `extract_metadata` | Extract comprehensive metadata and file statistics |
|
||||||
|
| `pdf_to_markdown` | Convert PDF to clean Markdown format |
|
||||||
|
| `extract_images` | Extract images with filtering options |
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
### Setup Development Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone and enter directory
|
||||||
|
git clone https://github.com/rpm/mcp-pdf-tools
|
||||||
|
cd mcp-pdf-tools
|
||||||
|
|
||||||
|
# Install with development dependencies
|
||||||
|
uv sync --dev
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
uv run pytest
|
||||||
|
|
||||||
|
# Format and lint code
|
||||||
|
uv run black src/ tests/
|
||||||
|
uv run ruff check src/ tests/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run all tests
|
||||||
|
uv run pytest
|
||||||
|
|
||||||
|
# Run with coverage
|
||||||
|
uv run pytest --cov=mcp_pdf_tools
|
||||||
|
|
||||||
|
# Run specific test
|
||||||
|
uv run pytest tests/test_server.py::TestTextExtraction::test_extract_text_success
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building for PyPI
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build the package
|
||||||
|
uv build
|
||||||
|
|
||||||
|
# Upload to PyPI (requires credentials)
|
||||||
|
uv publish
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### OCR Not Working
|
||||||
|
|
||||||
|
1. **Tesseract not installed**: Make sure Tesseract is installed on your system
|
||||||
|
2. **Language data missing**: Install additional language packs:
|
||||||
|
```bash
|
||||||
|
# Ubuntu/Debian
|
||||||
|
sudo apt-get install tesseract-ocr-fra tesseract-ocr-deu
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
brew install tesseract-lang
|
||||||
|
```
|
||||||
|
|
||||||
|
### Table Extraction Issues
|
||||||
|
|
||||||
|
1. **Java not found**: Tabula requires Java. Install Java 8 or higher.
|
||||||
|
2. **Camelot dependencies**: Install system dependencies:
|
||||||
|
```bash
|
||||||
|
# Ubuntu/Debian
|
||||||
|
sudo apt-get install python3-tk ghostscript
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
brew install ghostscript tcl-tk
|
||||||
|
```
|
||||||
|
|
||||||
|
### Memory Issues with Large PDFs
|
||||||
|
|
||||||
|
For very large PDFs, consider:
|
||||||
|
1. Processing specific page ranges instead of the entire document
|
||||||
|
2. Increasing available memory for Python
|
||||||
|
3. Using the streaming capabilities of pdfplumber for text extraction
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
The server uses intelligent fallback mechanisms:
|
||||||
|
|
||||||
|
1. **Text Extraction**: Automatically detects if a PDF is scanned and suggests OCR
|
||||||
|
2. **Table Extraction**: Tries multiple methods (Camelot → pdfplumber → Tabula) until tables are found
|
||||||
|
3. **Error Handling**: Graceful degradation with informative error messages
|
||||||
|
|
||||||
|
## Performance Tips
|
||||||
|
|
||||||
|
- For large PDFs, process in chunks using page ranges
|
||||||
|
- Use `method="pymupdf"` for fastest text extraction
|
||||||
|
- For complex tables, start with `method="camelot"`
|
||||||
|
- Enable preprocessing for better OCR results on poor quality scans
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Contributions are welcome! Please:
|
||||||
|
|
||||||
|
1. Fork the repository
|
||||||
|
2. Create a feature branch
|
||||||
|
3. Add tests for new functionality
|
||||||
|
4. Submit a pull request
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT License - see LICENSE file for details
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
This MCP server leverages several excellent PDF processing libraries:
|
||||||
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) for fast PDF operations
|
||||||
|
- [pdfplumber](https://github.com/jsvine/pdfplumber) for layout-aware extraction
|
||||||
|
- [Camelot](https://github.com/camelot-dev/camelot) for table extraction
|
||||||
|
- [Tabula-py](https://github.com/chezou/tabula-py) for Java-based table extraction
|
||||||
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
||||||
|
20
docker-compose.yml
Normal file
20
docker-compose.yml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
services:
|
||||||
|
mcp-pdf-tools:
|
||||||
|
build: .
|
||||||
|
image: mcp-pdf-tools:latest
|
||||||
|
container_name: mcp-pdf-tools
|
||||||
|
volumes:
|
||||||
|
# Mount a directory for PDF files
|
||||||
|
- ./test_pdfs:/pdfs:ro
|
||||||
|
# Mount temp directory for processing
|
||||||
|
- ./tmp:/tmp/pdf_processing
|
||||||
|
environment:
|
||||||
|
- DEBUG=true
|
||||||
|
- TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
|
||||||
|
- PDF_TEMP_DIR=/tmp/pdf_processing
|
||||||
|
stdin_open: true
|
||||||
|
tty: true
|
||||||
|
# For testing, you can override the entrypoint
|
||||||
|
# entrypoint: /bin/bash
|
101
examples/create_test_pdf.py
Normal file
101
examples/create_test_pdf.py
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Create a test PDF for testing the MCP PDF Tools"""
|
||||||
|
|
||||||
|
from reportlab.lib.pagesizes import letter
|
||||||
|
from reportlab.pdfgen import canvas
|
||||||
|
from reportlab.lib.units import inch
|
||||||
|
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
|
||||||
|
from reportlab.lib.styles import getSampleStyleSheet
|
||||||
|
from reportlab.lib import colors
|
||||||
|
|
||||||
|
def create_test_pdf(filename="test_document.pdf"):
    """Write a small sample PDF for exercising the MCP PDF Tools.

    The generated document contains a title, an introduction and three
    numbered sections (plain text, a bordered product table, and a note about
    document structure). Title, author, subject and keywords are set on the
    document template before it is built.

    Args:
        filename: Path of the PDF file to write.
    """
    sheet = getSampleStyleSheet()
    heading_style = sheet['Heading2']
    body_style = sheet['Normal']

    pdf = SimpleDocTemplate(filename, pagesize=letter)

    # Product table: header row, three items and a totals row
    rows = [
        ['Product', 'Price', 'Quantity', 'Total'],
        ['Widget A', '$10.00', '5', '$50.00'],
        ['Widget B', '$15.00', '3', '$45.00'],
        ['Widget C', '$20.00', '2', '$40.00'],
        ['Total', '', '', '$135.00'],
    ]
    product_table = Table(rows)
    product_table.setStyle(TableStyle([
        # Header row styling
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 14),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        # Body rows and full grid
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))

    # Flowables in page order; every Spacer is a fresh instance.
    flowables = [
        # Title
        Paragraph("MCP PDF Tools Test Document", sheet['Title']),
        Spacer(1, 0.3*inch),
        # Introduction
        Paragraph(
            "This is a test document created to demonstrate the capabilities of the MCP PDF Tools server. "
            "It contains various elements including text, tables, and metadata to test different extraction features.",
            body_style,
        ),
        Spacer(1, 0.2*inch),
        # Section 1
        Paragraph("1. Text Extraction Test", heading_style),
        Spacer(1, 0.1*inch),
        Paragraph(
            "This section contains regular paragraph text that should be easily extractable using any of the "
            "text extraction methods (PyMuPDF, pdfplumber, or pypdf). The text includes various formatting "
            "and should maintain its structure when extracted with layout preservation enabled.",
            body_style,
        ),
        Spacer(1, 0.2*inch),
        # Section 2 - Table
        Paragraph("2. Table Extraction Test", heading_style),
        Spacer(1, 0.1*inch),
        product_table,
        Spacer(1, 0.2*inch),
        # Section 3
        Paragraph("3. Document Structure Test", heading_style),
        Spacer(1, 0.1*inch),
        Paragraph(
            "This document has a clear structure with numbered sections and headings. "
            "The document structure extraction should identify these sections and create "
            "an outline or table of contents.",
            body_style,
        ),
        Spacer(1, 0.2*inch),
    ]

    # Metadata must be set on the template before build() runs
    pdf.title = "MCP PDF Tools Test Document"
    pdf.author = "MCP PDF Tools Tester"
    pdf.subject = "Testing PDF Processing"
    pdf.keywords = ["test", "pdf", "mcp", "extraction"]

    pdf.build(flowables)
    print(f"✅ Created test PDF: {filename}")


if __name__ == "__main__":
    create_test_pdf()
|
157
examples/test_pdf_tools.py
Normal file
157
examples/test_pdf_tools.py
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
"""
|
||||||
|
Example usage of MCP PDF Tools server
|
||||||
|
|
||||||
|
This script demonstrates how to test the PDF tools locally.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add the src directory to the path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from mcp_pdf_tools.server import create_server
|
||||||
|
|
||||||
|
|
||||||
|
async def call_tool(mcp, tool_name: str, **kwargs):
    """Look up a registered tool by name and await its underlying function.

    Args:
        mcp: Server object exposing an awaitable ``get_tools()`` that returns
            a mapping of tool name to tool object.
        tool_name: Key of the tool to invoke.
        **kwargs: Keyword arguments forwarded unchanged to the tool function.

    Returns:
        Whatever the tool's ``fn`` coroutine returns.

    Raises:
        ValueError: If ``tool_name`` is not among the registered tools.
    """
    registry = await mcp.get_tools()
    if tool_name in registry:
        # Bypass the MCP transport and invoke the wrapped coroutine directly.
        return await registry[tool_name].fn(**kwargs)
    raise ValueError(f"Tool '{tool_name}' not found")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_pdf_tools(pdf_path: str):
    """Run each PDF tool against ``pdf_path`` and print a short report.

    The tools are invoked in a fixed order (scan check, metadata, structure,
    text, tables, markdown, images). A result dict containing an ``"error"``
    key is reported as an error line instead of being summarised.

    Args:
        pdf_path: Path of the PDF file handed to every tool.
    """
    server = create_server()
    banner = "=" * 60

    print(f"\n{banner}")
    print(f"Testing PDF Tools on: {pdf_path}")
    print(f"{banner}\n")

    # 1. Scanned-PDF detection
    print("1. Checking if PDF is scanned...")
    scan_result = await call_tool(server, "is_scanned_pdf", pdf_path=pdf_path)
    print(f" Is scanned: {scan_result.get('is_scanned', 'Unknown')}")
    print(f" Recommendation: {scan_result.get('recommendation', 'N/A')}")

    # 2. Metadata
    print("\n2. Extracting metadata...")
    meta = await call_tool(server, "extract_metadata", pdf_path=pdf_path)
    if "error" in meta:
        print(f" Error: {meta['error']}")
    else:
        print(f" Title: {meta['metadata'].get('title', 'N/A')}")
        print(f" Author: {meta['metadata'].get('author', 'N/A')}")
        print(f" Pages: {meta['statistics'].get('page_count', 'N/A')}")
        print(f" File size: {meta['file_info'].get('size_mb', 'N/A')} MB")

    # 3. Document structure
    print("\n3. Getting document structure...")
    structure = await call_tool(server, "get_document_structure", pdf_path=pdf_path)
    if "error" in structure:
        print(f" Error: {structure['error']}")
    else:
        print(f" Outline items: {len(structure.get('outline', []))}")
        font_names = structure.get('fonts', [])
        if font_names:
            print(f" Fonts used: {', '.join(font_names[:3])}...")

    # 4. Text extraction; a missing "is_scanned" key is treated as scanned.
    if scan_result.get('is_scanned', True):
        print("\n4. Skipping text extraction (PDF is scanned)")
    else:
        print("\n4. Extracting text...")
        # First page only
        text_out = await call_tool(server, "extract_text", pdf_path=pdf_path, pages=[0])
        if "error" in text_out:
            print(f" Error: {text_out['error']}")
        else:
            preview = text_out['text'][:200].replace('\n', ' ')
            print(f" Method used: {text_out['method_used']}")
            print(f" Text preview: {preview}...")

    # 5. Tables (first page only)
    print("\n5. Extracting tables...")
    tables_out = await call_tool(server, "extract_tables", pdf_path=pdf_path, pages=[0])
    if "error" in tables_out:
        print(f" Error: {tables_out['error']}")
    else:
        print(f" Tables found: {tables_out['total_tables']}")
        print(f" Method used: {tables_out['method_used']}")
        if tables_out['total_tables'] > 0:
            shape = tables_out['tables'][0]['shape']
            print(f" First table shape: {shape['rows']}x{shape['columns']}")

    # 6. Markdown conversion (first page, images disabled)
    print("\n6. Converting to Markdown...")
    md_out = await call_tool(
        server,
        "pdf_to_markdown",
        pdf_path=pdf_path,
        pages=[0],
        include_images=False,
    )
    if "error" in md_out:
        print(f" Error: {md_out['error']}")
    else:
        snippet = md_out['markdown'][:200].replace('\n', ' ')
        print(f" Markdown preview: {snippet}...")

    # 7. Images (first page only)
    print("\n7. Extracting images...")
    images_out = await call_tool(server, "extract_images", pdf_path=pdf_path, pages=[0])
    if "error" in images_out:
        print(f" Error: {images_out['error']}")
    else:
        print(f" Images found: {images_out['total_images']}")
        if images_out['total_images'] > 0:
            picture = images_out['images'][0]
            print(f" First image size: {picture['width']}x{picture['height']}")

    print(f"\n{banner}")
    print("Testing complete!")
    print(f"{banner}\n")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Validate the command-line argument and run the tool walkthrough.

    Exits with status 1 when no path is given, when the path does not exist,
    when it does not end in ``.pdf`` (case-insensitive), or when the
    walkthrough raises.
    """
    import traceback

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python test_pdf_tools.py <path_to_pdf>")
        print("\nExample:")
        print(" python test_pdf_tools.py /path/to/document.pdf")
        raise SystemExit(1)

    pdf_path = cli_args[0]

    # The file has to be present on disk ...
    if not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}")
        raise SystemExit(1)

    # ... and has to carry a .pdf extension.
    if not pdf_path.lower().endswith('.pdf'):
        print(f"Error: File must be a PDF: {pdf_path}")
        raise SystemExit(1)

    try:
        await test_pdf_tools(pdf_path)
    except Exception as err:
        print(f"\nError during testing: {err}")
        traceback.print_exc()
        raise SystemExit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
39
examples/verify_installation.py
Normal file
39
examples/verify_installation.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Simple test script to verify the MCP server can be initialized
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add the src directory to the path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
async def main():
    """Import the package, build the server and list its registered tools.

    Any exception raised along the way is printed with its traceback and the
    process exits with status 1.
    """
    import traceback

    try:
        from mcp_pdf_tools import create_server, __version__

        print(f"✅ MCP PDF Tools v{__version__} imported successfully!")

        # Building the server is the real smoke test.
        server = create_server()
        print("✅ Server created successfully!")

        # Enumerate whatever tools the server registered.
        registered = await server.get_tools()

        print(f"\n📋 Available tools ({len(registered)}):")
        for name in sorted(registered):
            print(f" - {name}")

        print("\n✅ All systems operational! The MCP server is ready to use.")

    except Exception as err:
        print(f"❌ Error: {err}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
9
mcp-config-example.json
Normal file
9
mcp-config-example.json
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"pdf-tools": {
|
||||||
|
"command": "uv",
|
||||||
|
"args": ["run", "--directory", "/home/rpm/claude/mcp-pdf-tools", "mcp-pdf-tools"],
|
||||||
|
"env": {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
86
pyproject.toml
Normal file
86
pyproject.toml
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
[project]
|
||||||
|
name = "mcp-pdf-tools"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "FastMCP server for comprehensive PDF processing - text extraction, OCR, table extraction, and more"
|
||||||
|
authors = [{name = "RPM", email = "rpm@example.com"}]
|
||||||
|
readme = "README.md"
|
||||||
|
license = {text = "MIT"}
|
||||||
|
requires-python = ">=3.10"
|
||||||
|
keywords = [
|
||||||
|
"mcp",
|
||||||
|
"fastmcp",
|
||||||
|
"pdf",
|
||||||
|
"ocr",
|
||||||
|
"text-extraction",
|
||||||
|
"table-extraction",
|
||||||
|
"pdf-processing",
|
||||||
|
"api",
|
||||||
|
"integration"
|
||||||
|
]
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 4 - Beta",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||||
|
"Topic :: Text Processing :: General",
|
||||||
|
"Topic :: Office/Business",
|
||||||
|
]
|
||||||
|
dependencies = [
    "fastmcp>=0.1.0",
    "httpx>=0.25.0",
    "pydantic>=2.0.0",
    "python-dotenv>=1.0.0",
    "PyMuPDF>=1.23.0",
    "pdfplumber>=0.10.0",
    "camelot-py[cv]>=0.11.0",
    "tabula-py>=2.8.0",
    "pytesseract>=0.3.10",
    "pdf2image>=1.16.0",
    "pypdf>=3.17.0",
    "pandas>=2.0.0",
    # Required by pandas.DataFrame.to_markdown(), which the table extraction
    # tool calls for output_format="markdown".
    "tabulate>=0.9.0",
    "Pillow>=10.0.0",
    "markdown>=3.5.0",
]
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/rpm/mcp-pdf-tools"
|
||||||
|
Documentation = "https://github.com/rpm/mcp-pdf-tools#readme"
|
||||||
|
Repository = "https://github.com/rpm/mcp-pdf-tools.git"
|
||||||
|
Issues = "https://github.com/rpm/mcp-pdf-tools/issues"
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
mcp-pdf-tools = "mcp_pdf_tools.server:main"
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=7.0.0",
|
||||||
|
"pytest-asyncio>=0.21.0",
|
||||||
|
"black>=23.0.0",
|
||||||
|
"ruff>=0.1.0",
|
||||||
|
"mypy>=1.0.0",
|
||||||
|
"build>=0.10.0",
|
||||||
|
"twine>=4.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
# Hatchling reads its build settings from the `tool.hatch` namespace; a
# `tool.hatchling` table is ignored, so the include list would have no effect.
[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples",
    "README.md",
    "LICENSE",
    "MANIFEST.in",
]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"reportlab>=4.4.3",
|
||||||
|
]
|
6
run-mcp-server.sh
Executable file
6
run-mcp-server.sh
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
# MCP PDF Tools Server Launcher
# This script provides an easy way to run the MCP server.
# Any arguments given to this script are forwarded to mcp-pdf-tools.

# Run from the directory containing this script; abort rather than launching
# from an unexpected working directory if the cd fails.
cd "$(dirname "$0")" || exit 1

exec uv run mcp-pdf-tools "$@"
|
7
src/mcp_pdf_tools/__init__.py
Normal file
7
src/mcp_pdf_tools/__init__.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
"""MCP PDF Tools - A comprehensive PDF processing server for MCP"""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
|
|
||||||
|
from .server import create_server
|
||||||
|
|
||||||
|
__all__ = ["create_server", "__version__"]
|
828
src/mcp_pdf_tools/server.py
Normal file
828
src/mcp_pdf_tools/server.py
Normal file
@ -0,0 +1,828 @@
|
|||||||
|
"""
|
||||||
|
MCP PDF Tools Server - Comprehensive PDF processing capabilities
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
import base64
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, List, Optional, Union
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastmcp import FastMCP
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
# PDF processing libraries
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import pdfplumber
|
||||||
|
import camelot
|
||||||
|
import tabula
|
||||||
|
import pytesseract
|
||||||
|
from pdf2image import convert_from_path
|
||||||
|
import pypdf
|
||||||
|
from PIL import Image
|
||||||
|
import pandas as pd
|
||||||
|
import json
|
||||||
|
import markdown
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Initialize FastMCP server
|
||||||
|
mcp = FastMCP("pdf-tools")
|
||||||
|
|
||||||
|
# Configuration models
|
||||||
|
class ExtractionConfig(BaseModel):
    """Configuration for text extraction"""

    # Name of the extraction backend; defaults to "auto".
    method: str = Field(
        "auto",
        description="Extraction method: auto, pymupdf, pdfplumber, pypdf",
    )
    # Optional list of page numbers; defaults to None.
    pages: Optional[List[int]] = Field(
        None,
        description="Specific pages to extract",
    )
    # Layout preservation is off unless requested.
    preserve_layout: bool = Field(
        False,
        description="Preserve text layout",
    )
|
||||||
|
|
||||||
|
class TableExtractionConfig(BaseModel):
    """Configuration for table extraction"""

    # Name of the table extraction backend; defaults to "auto".
    method: str = Field(
        "auto",
        description="Method: auto, camelot, tabula, pdfplumber",
    )
    # Optional list of page numbers; defaults to None.
    pages: Optional[List[int]] = Field(
        None,
        description="Pages to extract tables from",
    )
    # Serialisation used for each extracted table; defaults to "json".
    output_format: str = Field(
        "json",
        description="Output format: json, csv, markdown",
    )
|
||||||
|
|
||||||
|
class OCRConfig(BaseModel):
    """Configuration for OCR processing"""

    # Language codes handed to the OCR engine; defaults to English only.
    languages: List[str] = Field(
        ["eng"],
        description="OCR languages",
    )
    # Image preprocessing is enabled by default.
    preprocess: bool = Field(
        True,
        description="Preprocess image for better OCR",
    )
    # Resolution used when pages are converted to images.
    dpi: int = Field(
        300,
        description="DPI for image conversion",
    )
|
||||||
|
|
||||||
|
# Utility functions
|
||||||
|
async def validate_pdf_path(pdf_path: str) -> Path:
    """Validate that ``pdf_path`` names an existing regular file ending in .pdf.

    Args:
        pdf_path: Filesystem path supplied by the caller.

    Returns:
        The path wrapped in a ``Path`` object.

    Raises:
        ValueError: If the path does not exist, is not a regular file (for
            example a directory whose name ends in ``.pdf``), or does not
            have a ``.pdf`` suffix (compared case-insensitively).
    """
    path = Path(pdf_path)
    if not path.exists():
        raise ValueError(f"File not found: {pdf_path}")
    # Reject directories and other non-files early so the PDF libraries are
    # never handed something they cannot open.
    if not path.is_file():
        raise ValueError(f"Not a file: {pdf_path}")
    if path.suffix.lower() != '.pdf':
        raise ValueError(f"Not a PDF file: {pdf_path}")
    return path
|
||||||
|
|
||||||
|
def detect_scanned_pdf(pdf_path: str) -> bool:
    """Heuristically decide whether a PDF is scanned (image-based).

    Up to the first three pages are inspected with pdfplumber. The document
    counts as text-based as soon as one of them yields more than 50
    non-whitespace-trimmed characters.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        ``False`` if a sampled page contains enough extractable text,
        ``True`` otherwise. A file that cannot be inspected is also reported
        as ``True``; the failure is logged rather than raised.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Only the first few pages are sampled.
            for page in pdf.pages[:3]:
                text = page.extract_text()
                if text and len(text.strip()) > 50:
                    return False
        return True
    except Exception as exc:
        # Best effort: keep the "treat as scanned" fallback, but leave a trace
        # so an unreadable file is distinguishable from a genuinely scanned one.
        logger.warning(
            "Could not inspect %s for text, treating it as scanned: %s",
            pdf_path,
            exc,
        )
        return True
|
||||||
|
|
||||||
|
# Text extraction methods
|
||||||
|
async def extract_with_pymupdf(pdf_path: Path, pages: Optional[List[int]] = None, preserve_layout: bool = False) -> str:
    """Extract text using PyMuPDF.

    Args:
        pdf_path: Location of the PDF file.
        pages: Page indexes to read; a falsy value selects every page.
        preserve_layout: Selects the ``get_text("text")`` call instead of the
            argument-less ``get_text()``.

    Returns:
        The text of the selected pages joined by blank lines.
    """
    # The context manager closes the document even when a page lookup fails.
    with fitz.open(str(pdf_path)) as document:
        selected = pages or range(len(document))
        # NOTE(review): both get_text variants look equivalent; confirm whether
        # preserve_layout is meant to select a different extraction mode.
        chunks = [
            document[index].get_text("text") if preserve_layout else document[index].get_text()
            for index in selected
        ]

    return "\n\n".join(chunks)
|
||||||
|
|
||||||
|
async def extract_with_pdfplumber(pdf_path: Path, pages: Optional[List[int]] = None, preserve_layout: bool = False) -> str:
    """Extract text using pdfplumber.

    Args:
        pdf_path: Location of the PDF file.
        pages: Page indexes to read; a falsy value selects every page.
        preserve_layout: Passed through as pdfplumber's ``layout`` argument.

    Returns:
        The non-empty page texts joined by blank lines.
    """
    with pdfplumber.open(str(pdf_path)) as pdf:
        selected = pages or range(len(pdf.pages))
        extracted = (
            pdf.pages[index].extract_text(layout=preserve_layout)
            for index in selected
        )
        # Pages that yield no text are dropped; materialise while the file is open.
        chunks = [page_text for page_text in extracted if page_text]

    return "\n\n".join(chunks)
|
||||||
|
|
||||||
|
async def extract_with_pypdf(pdf_path: Path, pages: Optional[List[int]] = None, preserve_layout: bool = False) -> str:
    """Extract text using pypdf.

    Args:
        pdf_path: Location of the PDF file.
        pages: Page indexes to read; a falsy value selects every page.
        preserve_layout: Accepted for signature parity with the other
            extractors; this implementation does not use it.

    Returns:
        The non-empty page texts joined by blank lines.
    """
    reader = pypdf.PdfReader(str(pdf_path))
    selected = pages or range(len(reader.pages))

    extracted = (reader.pages[index].extract_text() for index in selected)
    # Pages that yield no text are dropped.
    chunks = [page_text for page_text in extracted if page_text]

    return "\n\n".join(chunks)
|
||||||
|
|
||||||
|
# Main text extraction tool
|
||||||
|
@mcp.tool(name="extract_text", description="Extract text from PDF with intelligent method selection")
async def extract_text(
    pdf_path: str,
    method: str = "auto",
    pages: Optional[List[int]] = None,
    preserve_layout: bool = False
) -> Dict[str, Any]:
    """
    Extract text from PDF using various methods

    Args:
        pdf_path: Path to the PDF file
        method: Extraction method (auto, pymupdf, pdfplumber, pypdf)
        pages: List of page numbers to extract (0-indexed), None for all pages
        preserve_layout: Whether to preserve the original text layout

    Returns:
        Dictionary containing extracted text and metadata. On failure the
        dictionary holds an "error" message instead; a scanned PDF detected
        in "auto" mode is reported that way with "is_scanned" set to True.
    """
    import time
    start_time = time.time()

    # Maps each explicit method name to the coroutine that implements it.
    extractors = {
        "pymupdf": extract_with_pymupdf,
        "pdfplumber": extract_with_pdfplumber,
        "pypdf": extract_with_pypdf,
    }

    try:
        path = await validate_pdf_path(pdf_path)

        # Auto-select method based on PDF characteristics
        if method == "auto":
            if detect_scanned_pdf(str(path)):
                return {
                    "error": "Scanned PDF detected. Please use the OCR tool for this file.",
                    "is_scanned": True
                }
            method = "pymupdf"  # Default to PyMuPDF for text-based PDFs

        extractor = extractors.get(method)
        if extractor is None:
            raise ValueError(f"Unknown extraction method: {method}")
        text = await extractor(path, pages, preserve_layout)

        # Read document metadata; the context manager closes the handle even
        # if a metadata lookup raises.
        with fitz.open(str(path)) as doc:
            metadata = {
                "pages": len(doc),
                "title": doc.metadata.get("title", ""),
                "author": doc.metadata.get("author", ""),
                "subject": doc.metadata.get("subject", ""),
                "creator": doc.metadata.get("creator", ""),
            }

        return {
            "text": text,
            "method_used": method,
            "metadata": metadata,
            "pages_extracted": pages or list(range(metadata["pages"])),
            "extraction_time": round(time.time() - start_time, 2),
            "warnings": []
        }

    except Exception as e:
        logger.error("Text extraction failed: %s", e)
        return {
            "error": f"Text extraction failed: {str(e)}",
            "method_attempted": method
        }
|
||||||
|
|
||||||
|
# Table extraction methods
|
||||||
|
async def extract_tables_camelot(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
    """Extract tables using Camelot.

    Lattice mode (for bordered tables) is tried first; stream mode (for
    borderless tables) is the fallback when lattice finds nothing or fails.

    Args:
        pdf_path: Location of the PDF file.
        pages: 0-indexed page numbers; converted to Camelot's 1-indexed page
            string. A falsy value selects all pages.

    Returns:
        One DataFrame per detected table, or an empty list when neither mode
        produced a table.
    """
    page_str = ','.join(str(p + 1) for p in pages) if pages else 'all'

    for flavor in ('lattice', 'stream'):
        try:
            tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor=flavor)
            frames = [table.df for table in tables]
        except Exception as exc:
            # Best effort: a failing flavor must not abort extraction, but the
            # reason is logged instead of being silently discarded.
            logger.warning("Camelot %s extraction failed for %s: %s", flavor, pdf_path, exc)
            continue
        if frames:
            return frames

    return []
|
||||||
|
|
||||||
|
async def extract_tables_tabula(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
    """Extract tables using Tabula.

    Args:
        pdf_path: Location of the PDF file.
        pages: 0-indexed page numbers; shifted to 1-indexed for Tabula. A
            falsy value selects all pages.

    Returns:
        Whatever ``tabula.read_pdf`` returns with ``multiple_tables=True``,
        or an empty list when the call raises.
    """
    page_list = [p + 1 for p in pages] if pages else 'all'

    try:
        return tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True)
    except Exception as exc:
        # Best effort: report "no tables" but keep a record of the failure.
        logger.warning("Tabula extraction failed for %s: %s", pdf_path, exc)
        return []
|
||||||
|
|
||||||
|
async def extract_tables_pdfplumber(pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]:
    """Extract tables using pdfplumber.

    Args:
        pdf_path: Location of the PDF file.
        pages: Page indexes to scan; a falsy value selects every page.

    Returns:
        One DataFrame per table that has more than one row. The first row of
        each table supplies the column labels.
    """
    with pdfplumber.open(str(pdf_path)) as pdf:
        selected = pages or range(len(pdf.pages))
        frames = [
            pd.DataFrame(rows[1:], columns=rows[0])
            for index in selected
            for rows in pdf.pages[index].extract_tables()
            # Skip empty tables and tables consisting of a header row only.
            if rows and len(rows) > 1
        ]

    return frames
|
||||||
|
|
||||||
|
# Main table extraction tool
|
||||||
|
@mcp.tool(name="extract_tables", description="Extract tables from PDF with automatic method selection")
async def extract_tables(
    pdf_path: str,
    pages: Optional[List[int]] = None,
    method: str = "auto",
    output_format: str = "json"
) -> Dict[str, Any]:
    """
    Extract tables from PDF using various methods

    Args:
        pdf_path: Path to the PDF file
        pages: List of page numbers to extract tables from (0-indexed)
        method: Extraction method (auto, camelot, tabula, pdfplumber)
        output_format: Output format (json, csv, markdown)

    Returns:
        Dictionary containing extracted tables and metadata. On failure
        (invalid path, unknown method, unknown output format, extraction
        error) the dictionary holds an "error" message and the list of
        methods tried so far.
    """
    import time
    start_time = time.time()

    # Bound before the try block so the except handler can always report it,
    # even when path validation itself fails.
    methods_tried: List[str] = []

    extractors = {
        "camelot": extract_tables_camelot,
        "pdfplumber": extract_tables_pdfplumber,
        "tabula": extract_tables_tabula,
    }
    # Order in which "auto" tries the backends.
    auto_order = ("camelot", "pdfplumber", "tabula")

    # One serialiser per supported output format.
    formatters = {
        "json": lambda df: df.to_dict(orient="records"),
        "csv": lambda df: df.to_csv(index=False),
        "markdown": lambda df: df.to_markdown(index=False),
    }

    try:
        path = await validate_pdf_path(pdf_path)

        # Fail loudly on an unsupported format instead of silently returning
        # zero tables.
        if output_format not in formatters:
            raise ValueError(f"Unknown output format: {output_format}")
        serialise = formatters[output_format]

        all_tables = []

        if method == "auto":
            # Auto method: try methods in order until we find tables
            for try_method in auto_order:
                methods_tried.append(try_method)
                tables = await extractors[try_method](path, pages)
                if tables:
                    method = try_method
                    all_tables = tables
                    break
        else:
            # Use specific method
            methods_tried.append(method)
            if method not in extractors:
                raise ValueError(f"Unknown table extraction method: {method}")
            all_tables = await extractors[method](path, pages)

        # Format tables based on output format
        formatted_tables = [
            {
                "table_index": i,
                "data": serialise(df),
                "shape": {"rows": len(df), "columns": len(df.columns)}
            }
            for i, df in enumerate(all_tables)
        ]

        return {
            "tables": formatted_tables,
            "total_tables": len(formatted_tables),
            "method_used": method,
            "methods_tried": methods_tried,
            "pages_searched": pages or "all",
            "extraction_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        logger.error("Table extraction failed: %s", e)
        return {
            "error": f"Table extraction failed: {str(e)}",
            "methods_tried": methods_tried
        }
|
||||||
|
|
||||||
|
# OCR functionality
|
||||||
|
@mcp.tool(name="ocr_pdf", description="Perform OCR on scanned PDFs")
async def ocr_pdf(
    pdf_path: str,
    languages: List[str] = ["eng"],
    preprocess: bool = True,
    dpi: int = 300,
    pages: Optional[List[int]] = None
) -> Dict[str, Any]:
    """
    Perform OCR on a scanned PDF

    Args:
        pdf_path: Path to the PDF file
        languages: List of language codes for OCR (e.g., ["eng", "fra"])
        preprocess: Whether to preprocess images for better OCR
        dpi: DPI for PDF to image conversion
        pages: Specific pages to OCR (0-indexed)

    Returns:
        Dictionary containing OCR text and metadata, or an "error" message
        with a hint when OCR fails.
    """
    import time
    start_time = time.time()

    try:
        from PIL import ImageEnhance

        path = await validate_pdf_path(pdf_path)

        # Work on a copy so the shared default list is never handed back to
        # (and possibly mutated by) the caller.
        languages = list(languages)
        # Tesseract takes multiple languages joined by "+"; build it once.
        lang_str = '+'.join(languages)

        # Convert PDF pages to images
        with tempfile.TemporaryDirectory() as temp_dir:
            if pages:
                images = []
                for page_num in pages:
                    # pdf2image page numbers are 1-indexed.
                    images.extend(
                        convert_from_path(
                            str(path),
                            dpi=dpi,
                            first_page=page_num + 1,
                            last_page=page_num + 1,
                            output_folder=temp_dir
                        )
                    )
            else:
                images = convert_from_path(str(path), dpi=dpi, output_folder=temp_dir)

            # Perform OCR on each page while the temporary image files exist
            ocr_texts = []
            for image in images:
                if preprocess:
                    # Grayscale conversion followed by a contrast boost.
                    image = ImageEnhance.Contrast(image.convert('L')).enhance(2.0)

                ocr_texts.append(pytesseract.image_to_string(image, lang=lang_str))

        # Combine all OCR text
        full_text = "\n\n--- Page Break ---\n\n".join(ocr_texts)

        return {
            "text": full_text,
            "pages_processed": len(images),
            "languages": languages,
            "dpi": dpi,
            "preprocessing_applied": preprocess,
            "extraction_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        logger.error("OCR failed: %s", e)
        return {
            "error": f"OCR failed: {str(e)}",
            "hint": "Make sure Tesseract is installed and language data is available"
        }
|
||||||
|
|
||||||
|
# PDF analysis tools
|
||||||
|
@mcp.tool(name="is_scanned_pdf", description="Check if a PDF is scanned/image-based")
async def is_scanned_pdf(pdf_path: str) -> Dict[str, Any]:
    """Check if a PDF is scanned (image-based) or contains extractable text

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Dictionary with the scanned verdict from ``detect_scanned_pdf``, the
        page count, how many of the first (up to five) pages contain more
        than 50 characters of text, and a tool recommendation; or an "error"
        message when the analysis fails.
    """
    try:
        path = await validate_pdf_path(pdf_path)
        is_scanned = detect_scanned_pdf(str(path))

        # Gather more detail; the context manager closes the document even if
        # reading a page raises.
        with fitz.open(str(path)) as doc:
            page_count = len(doc)

            # Check a few pages for text content
            sample_pages = min(5, page_count)
            text_pages = sum(
                1
                for i in range(sample_pages)
                if len(doc[i].get_text().strip()) > 50
            )

        return {
            "is_scanned": is_scanned,
            "page_count": page_count,
            "sample_pages_checked": sample_pages,
            "pages_with_text": text_pages,
            "recommendation": "Use OCR tool" if is_scanned else "Use text extraction tool"
        }

    except Exception as e:
        logger.error("PDF scan detection failed: %s", e)
        return {"error": f"Failed to analyze PDF: {str(e)}"}
|
||||||
|
|
||||||
|
@mcp.tool(name="get_document_structure", description="Extract document structure including headers, sections, and metadata")
async def get_document_structure(pdf_path: str) -> Dict[str, Any]:
    """
    Extract document structure including headers, sections, and metadata

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Dictionary containing document structure information: metadata, page
        count, outline (table of contents entries), per-page details for the
        first five pages, and the sorted list of font names. On failure the
        dictionary holds an "error" message instead.
    """
    try:
        path = await validate_pdf_path(pdf_path)

        # The context manager closes the document even if a lookup raises.
        with fitz.open(str(path)) as doc:
            info = doc.metadata
            structure = {
                "metadata": {
                    "title": info.get("title", ""),
                    "author": info.get("author", ""),
                    "subject": info.get("subject", ""),
                    "keywords": info.get("keywords", ""),
                    "creator": info.get("creator", ""),
                    "producer": info.get("producer", ""),
                    "creation_date": str(info.get("creationDate", "")),
                    "modification_date": str(info.get("modDate", "")),
                },
                "pages": len(doc),
                # Table of contents / bookmarks
                "outline": [
                    {"level": level, "title": title, "page": page}
                    for level, title, page in doc.get_toc()
                ],
            }

            # Page-level information, sampled from the first 5 pages
            page_info = []
            for i in range(min(5, len(doc))):
                page = doc[i]
                page_info.append({
                    "page_number": i + 1,
                    "width": page.rect.width,
                    "height": page.rect.height,
                    "rotation": page.rotation,
                    "text_length": len(page.get_text()),
                    "image_count": len(page.get_images()),
                    "link_count": len(page.get_links())
                })
            structure["sample_pages"] = page_info

            # Collect the distinct font names across all pages; item 3 of each
            # get_fonts() entry is used as the font name.
            fonts = {font[3] for page in doc for font in page.get_fonts()}
            # Sorted so the result is deterministic across runs.
            structure["fonts"] = sorted(fonts)

        return structure

    except Exception as e:
        logger.error("Document structure extraction failed: %s", e)
        return {"error": f"Failed to extract document structure: {str(e)}"}
|
||||||
|
|
||||||
|
# PDF to Markdown conversion
|
||||||
|
@mcp.tool(name="pdf_to_markdown", description="Convert PDF to clean markdown format")
async def pdf_to_markdown(
    pdf_path: str,
    include_images: bool = True,
    include_metadata: bool = True,
    pages: Optional[List[int]] = None
) -> Dict[str, Any]:
    """
    Convert PDF to markdown format

    The output contains, in order: an optional metadata section, a table of
    contents (when the PDF has bookmarks) and one "## Page N" section per
    converted page. Short all-uppercase text blocks become "###" headings;
    every other text block is emitted as-is.

    Args:
        pdf_path: Path to the PDF file
        include_images: Whether to extract and include images
        include_metadata: Whether to include document metadata
        pages: Specific pages to convert (0-indexed); None or an empty list
            converts every page

    Returns:
        Dictionary with the keys "markdown", "pages_converted",
        "images_extracted", "images" (base64-encoded PNG data with page,
        index, width and height) and "conversion_time" in seconds, or
        {"error": ...} if the conversion fails
    """
    import time
    start_time = time.time()

    doc = None
    try:
        path = await validate_pdf_path(pdf_path)
        doc = fitz.open(str(path))

        markdown_parts = []

        # Add metadata if requested
        if include_metadata:
            metadata = doc.metadata or {}
            if any(metadata.values()):
                markdown_parts.append("# Document Metadata\n")
                for key, value in metadata.items():
                    if value:
                        markdown_parts.append(f"- **{key.title()}**: {value}")
                markdown_parts.append("\n---\n")

        # Extract table of contents
        toc = doc.get_toc()
        if toc:
            markdown_parts.append("# Table of Contents\n")
            for level, title, page in toc:
                indent = "  " * (level - 1)
                # Point at the "## Page N" headings emitted below; "#page-N"
                # is the slug common Markdown renderers derive from them.
                markdown_parts.append(f"{indent}- [{title}](#page-{page})")
            markdown_parts.append("\n---\n")

        # Process pages
        page_range = pages if pages else range(len(doc))
        images_extracted = []

        for page_num in page_range:
            page = doc[page_num]

            # Add page header
            markdown_parts.append(f"\n## Page {page_num + 1}\n")

            # Extract text with basic formatting
            blocks = page.get_text("blocks")

            for block in blocks:
                if block[6] == 0:  # Text block
                    text = block[4].strip()
                    if text:
                        # Heuristic: short, all-uppercase blocks are treated
                        # as headers (no font information is used here).
                        if len(text) < 100 and text.isupper():
                            markdown_parts.append(f"### {text}\n")
                        else:
                            markdown_parts.append(f"{text}\n")

            # Extract images if requested
            if include_images:
                image_list = page.get_images()
                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # PNG output needs GRAY or RGB; convert CMYK (and similar)
                    # instead of silently dropping the image.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_data = pix.tobytes("png")
                    img_b64 = base64.b64encode(img_data).decode()
                    images_extracted.append({
                        "page": page_num + 1,
                        "index": img_index,
                        "data": img_b64,
                        "width": pix.width,
                        "height": pix.height
                    })
                    markdown_parts.append(f"\n![Image {page_num+1}-{img_index}](image-{page_num+1}-{img_index}.png)\n")
                    pix = None  # Release the pixmap promptly

        # Combine markdown
        markdown_content = "\n".join(markdown_parts)

        return {
            "markdown": markdown_content,
            "pages_converted": len(page_range),
            "images_extracted": len(images_extracted),
            "images": images_extracted if include_images else [],
            "conversion_time": round(time.time() - start_time, 2)
        }

    except Exception as e:
        logger.error(f"PDF to Markdown conversion failed: {str(e)}")
        return {"error": f"Conversion failed: {str(e)}"}

    finally:
        # Close the document on both the success and the error path.
        if doc is not None:
            doc.close()
|
||||||
|
|
||||||
|
# Image extraction
|
||||||
|
@mcp.tool(name="extract_images", description="Extract images from PDF")
async def extract_images(
    pdf_path: str,
    pages: Optional[List[int]] = None,
    min_width: int = 100,
    min_height: int = 100,
    output_format: str = "png"
) -> Dict[str, Any]:
    """
    Extract images from PDF

    Args:
        pdf_path: Path to the PDF file
        pages: Specific pages to extract images from (0-indexed); None or an
            empty list searches every page
        min_width: Minimum image width to extract
        min_height: Minimum image height to extract
        output_format: Output format (png, jpeg)

    Returns:
        Dictionary with the keys "images" (base64-encoded data with page,
        index, width, height and format), "total_images", "pages_searched"
        and "filters", or {"error": ...} if extraction fails
    """
    doc = None
    try:
        path = await validate_pdf_path(pdf_path)
        doc = fitz.open(str(path))

        images = []
        page_range = pages if pages else range(len(doc))

        for page_num in page_range:
            page = doc[page_num]
            image_list = page.get_images()

            for img_index, img in enumerate(image_list):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # Check size requirements
                if pix.width >= min_width and pix.height >= min_height:
                    # Convert CMYK (and similar) to RGB instead of silently
                    # skipping the image.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    # JPEG has no alpha channel; Pixmap(source, 0) copies the
                    # pixmap without alpha (a colorspace conversion alone
                    # would keep it).
                    if output_format == "jpeg" and pix.alpha:
                        pix = fitz.Pixmap(pix, 0)

                    img_data = pix.tobytes(output_format)
                    img_b64 = base64.b64encode(img_data).decode()

                    images.append({
                        "page": page_num + 1,
                        "index": img_index,
                        "data": img_b64,
                        "width": pix.width,
                        "height": pix.height,
                        "format": output_format
                    })

                pix = None  # Release the pixmap promptly

        return {
            "images": images,
            "total_images": len(images),
            "pages_searched": len(page_range),
            "filters": {
                "min_width": min_width,
                "min_height": min_height
            }
        }

    except Exception as e:
        logger.error(f"Image extraction failed: {str(e)}")
        return {"error": f"Image extraction failed: {str(e)}"}

    finally:
        # Close the document on both the success and the error path.
        if doc is not None:
            doc.close()
|
||||||
|
|
||||||
|
# Metadata extraction
|
||||||
|
@mcp.tool(name="extract_metadata", description="Extract comprehensive PDF metadata")
async def extract_metadata(pdf_path: str) -> Dict[str, Any]:
    """
    Extract comprehensive metadata from PDF

    Combines file-system stats, the PyMuPDF document metadata, a few document
    statistics and any extra entries that pypdf reports.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Dictionary with the keys "file_info", "metadata", "statistics" and
        "additional_metadata", or {"error": ...} if extraction fails
    """
    try:
        path = await validate_pdf_path(pdf_path)

        # Get file stats
        file_stats = path.stat()

        # PyMuPDF metadata
        doc = fitz.open(str(path))
        try:
            raw_metadata = doc.metadata or {}
            fitz_metadata = {
                "title": raw_metadata.get("title", ""),
                "author": raw_metadata.get("author", ""),
                "subject": raw_metadata.get("subject", ""),
                "keywords": raw_metadata.get("keywords", ""),
                "creator": raw_metadata.get("creator", ""),
                "producer": raw_metadata.get("producer", ""),
                "creation_date": str(raw_metadata.get("creationDate", "")),
                "modification_date": str(raw_metadata.get("modDate", "")),
                "trapped": raw_metadata.get("trapped", ""),
            }

            # Document statistics. Both scans are best-effort: a failure is
            # logged and the flag keeps its default of False.
            has_annotations = False
            has_links = False
            try:
                has_annotations = any(
                    True for page in doc for _ in page.annots()
                )
            except Exception as e:
                logger.warning(f"Annotation scan failed: {str(e)}")

            try:
                has_links = any(page.get_links() for page in doc)
            except Exception as e:
                logger.warning(f"Link scan failed: {str(e)}")

            stats = {
                "page_count": len(doc),
                "file_size_bytes": file_stats.st_size,
                "file_size_mb": round(file_stats.st_size / (1024*1024), 2),
                "is_encrypted": doc.is_encrypted,
                "is_form": doc.is_form_pdf,
                "has_annotations": has_annotations,
                "has_links": has_links,
            }

            # Page dimensions
            if len(doc) > 0:
                first_page = doc[0]
                stats["page_width"] = first_page.rect.width
                stats["page_height"] = first_page.rect.height
                stats["page_rotation"] = first_page.rotation
        finally:
            # Close the document even if something above raised.
            doc.close()

        # PyPDF metadata (sometimes has additional info)
        additional_metadata = {}
        try:
            reader = pypdf.PdfReader(str(path))
            pypdf_metadata = reader.metadata

            if pypdf_metadata:
                # pypdf keys look like "/Title" or "/CreationDate"; map them
                # onto the lowercase names used in fitz_metadata so entries
                # that are already reported are not duplicated.
                key_aliases = {
                    "creationdate": "creation_date",
                    "moddate": "modification_date",
                }
                for key, value in pypdf_metadata.items():
                    key_str = key.strip("/")
                    lowered = key_str.lower()
                    canonical = key_aliases.get(lowered, lowered)
                    if not fitz_metadata.get(canonical):
                        additional_metadata[key_str] = str(value)
        except Exception as e:
            # Best-effort: pypdf problems must not fail the whole call.
            logger.warning(f"pypdf metadata extraction failed: {str(e)}")
            additional_metadata = {}

        return {
            "file_info": {
                "path": str(path),
                "name": path.name,
                "size_bytes": file_stats.st_size,
                "size_mb": round(file_stats.st_size / (1024*1024), 2),
                "created": str(file_stats.st_ctime),
                "modified": str(file_stats.st_mtime),
            },
            "metadata": fitz_metadata,
            "statistics": stats,
            "additional_metadata": additional_metadata
        }

    except Exception as e:
        logger.error(f"Metadata extraction failed: {str(e)}")
        return {"error": f"Metadata extraction failed: {str(e)}"}
|
||||||
|
|
||||||
|
# Main entry point
|
||||||
|
def create_server():
    """Return the module-level MCP server object that the tools are registered on."""
    server_instance = mcp
    return server_instance
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: drive ``run_server`` to completion with ``asyncio.run``."""
    server_coroutine = run_server()
    asyncio.run(server_coroutine)
|
||||||
|
|
||||||
|
async def run_server():
    """Await the MCP server's ``run_stdio_async`` coroutine."""
    stdio_session = mcp.run_stdio_async()
    await stdio_session
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
1
tests/__init__.py
Normal file
1
tests/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
"""Tests package for MCP PDF Tools"""
|
401
tests/test_server.py
Normal file
401
tests/test_server.py
Normal file
@ -0,0 +1,401 @@
|
|||||||
|
"""Test suite for MCP PDF Tools server"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import asyncio
|
||||||
|
from unittest.mock import Mock, patch, MagicMock
|
||||||
|
import base64
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from mcp_pdf_tools.server import (
|
||||||
|
create_server,
|
||||||
|
validate_pdf_path,
|
||||||
|
detect_scanned_pdf,
|
||||||
|
extract_text,
|
||||||
|
extract_tables,
|
||||||
|
ocr_pdf,
|
||||||
|
is_scanned_pdf,
|
||||||
|
get_document_structure,
|
||||||
|
extract_metadata,
|
||||||
|
pdf_to_markdown,
|
||||||
|
extract_images
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def server():
    """Provide the object returned by ``create_server`` to tests."""
    instance = create_server()
    return instance
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_pdf_path(tmp_path):
    """Create an empty ``test.pdf`` inside pytest's temp dir and return its path as a string."""
    placeholder = tmp_path / "test.pdf"
    placeholder.touch()  # The file only needs to exist; it has no content
    return str(placeholder)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_fitz_doc():
    """
    Create a mock PyMuPDF document

    The mock has three pages, fixed metadata and a two-entry table of
    contents. It supports ``len(doc)``, ``doc[i]``, ``doc.pages`` and
    ``for page in doc``.
    """
    doc = MagicMock()
    doc.__len__.return_value = 3
    doc.metadata = {
        "title": "Test PDF",
        "author": "Test Author",
        "subject": "Testing",
        "keywords": "test, pdf",
        "creator": "Test Creator",
        "producer": "Test Producer",
        "creationDate": "2024-01-01",
        "modDate": "2024-01-02"
    }
    doc.is_encrypted = False
    doc.is_form_pdf = False
    doc.get_toc.return_value = [(1, "Chapter 1", 1), (2, "Section 1.1", 2)]

    # Mock pages
    pages = []
    for i in range(3):
        page = MagicMock()
        page.get_text.return_value = f"This is page {i+1} text content."
        page.rect.width = 595
        page.rect.height = 842
        page.rotation = 0
        page.get_images.return_value = []
        page.get_links.return_value = []
        page.get_annotations.return_value = []
        # The server code queries annotations through page.annots().
        page.annots.return_value = []
        page.get_fonts.return_value = [(0, 0, 0, "Arial"), (0, 0, 0, "Times")]
        pages.append(page)

    doc.__getitem__.side_effect = lambda i: pages[i]
    # A MagicMock iterates as empty by default; hand out a fresh iterator on
    # every call so "for page in doc" sees the mocked pages each time.
    doc.__iter__.side_effect = lambda: iter(pages)
    doc.pages = pages

    return doc
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidation:
    """Checks for the ``validate_pdf_path`` helper"""

    @pytest.mark.asyncio
    async def test_validate_pdf_path_valid(self, mock_pdf_path):
        """An existing ``.pdf`` file is accepted and returned as a path object"""
        validated = await validate_pdf_path(mock_pdf_path)
        assert validated.exists()
        assert validated.suffix == ".pdf"

    @pytest.mark.asyncio
    async def test_validate_pdf_path_not_exists(self):
        """A path that does not exist raises ``ValueError``"""
        missing = "/non/existent/file.pdf"
        with pytest.raises(ValueError, match="File not found"):
            await validate_pdf_path(missing)

    @pytest.mark.asyncio
    async def test_validate_pdf_path_not_pdf(self, tmp_path):
        """An existing file without a ``.pdf`` suffix raises ``ValueError``"""
        not_a_pdf = tmp_path / "test.txt"
        not_a_pdf.touch()
        with pytest.raises(ValueError, match="Not a PDF file"):
            await validate_pdf_path(str(not_a_pdf))
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextExtraction:
    """Checks for ``extract_text`` against a mocked PyMuPDF document"""

    @pytest.mark.asyncio
    @patch('fitz.open')
    async def test_extract_text_success(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
        """All three mocked pages are extracted and joined by blank lines"""
        mock_fitz_open.return_value = mock_fitz_doc

        outcome = await extract_text(pdf_path=mock_pdf_path, method="pymupdf")

        assert outcome["text"] == "This is page 1 text content.\n\nThis is page 2 text content.\n\nThis is page 3 text content."
        assert outcome["method_used"] == "pymupdf"
        info = outcome["metadata"]
        assert info["pages"] == 3
        assert info["title"] == "Test PDF"
        assert len(outcome["pages_extracted"]) == 3

    @pytest.mark.asyncio
    @patch('fitz.open')
    async def test_extract_text_specific_pages(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
        """Only the requested (0-indexed) pages appear in the output"""
        mock_fitz_open.return_value = mock_fitz_doc
        wanted_pages = [0, 2]

        outcome = await extract_text(
            pdf_path=mock_pdf_path,
            pages=wanted_pages,
            method="pymupdf"
        )

        extracted = outcome["text"]
        assert "page 1" in extracted
        assert "page 2" not in extracted
        assert "page 3" in extracted
        assert outcome["pages_extracted"] == [0, 2]
|
||||||
|
|
||||||
|
|
||||||
|
class TestTableExtraction:
    """Checks for ``extract_tables`` with the table libraries mocked out"""

    @pytest.mark.asyncio
    @patch('camelot.read_pdf')
    async def test_extract_tables_camelot(self, mock_camelot, mock_pdf_path):
        """A single Camelot table is reported with its 2x2 shape"""
        # One fake Camelot table backed by a 2x2 DataFrame
        frame = pd.DataFrame({
            'Column1': ['A', 'B'],
            'Column2': ['1', '2']
        })
        fake_table = MagicMock()
        fake_table.df = frame
        mock_camelot.return_value = [fake_table]

        outcome = await extract_tables(
            pdf_path=mock_pdf_path,
            method="camelot",
            output_format="json"
        )

        assert outcome["total_tables"] == 1
        assert outcome["method_used"] == "camelot"
        assert len(outcome["tables"]) == 1
        shape = outcome["tables"][0]["shape"]
        assert shape["rows"] == 2
        assert shape["columns"] == 2

    @pytest.mark.asyncio
    @patch('camelot.read_pdf')
    @patch('pdfplumber.open')
    @patch('tabula.read_pdf')
    async def test_extract_tables_auto_fallback(self, mock_tabula, mock_pdfplumber, mock_camelot, mock_pdf_path):
        """With ``method="auto"``, a Camelot failure falls through to pdfplumber"""
        # Camelot raises ...
        mock_camelot.side_effect = Exception("Camelot failed")

        # ... while pdfplumber yields one table on its only page.
        rows = [['Col1', 'Col2'], ['A', '1'], ['B', '2']]
        plumber_page = MagicMock()
        plumber_page.extract_tables.return_value = [rows]
        plumber_pdf = MagicMock()
        plumber_pdf.pages = [plumber_page]
        plumber_pdf.__enter__.return_value = plumber_pdf
        mock_pdfplumber.return_value = plumber_pdf

        outcome = await extract_tables(pdf_path=mock_pdf_path, method="auto")

        assert outcome["total_tables"] == 1
        assert outcome["method_used"] == "pdfplumber"
        tried = outcome["methods_tried"]
        assert "camelot" in tried
        assert "pdfplumber" in tried
|
||||||
|
preprocess=True
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result["preprocessing_applied"] is True
|
||||||
|
mock_image.convert.assert_called_with('L') # Grayscale conversion
|
||||||
|
mock_enhancer.enhance.assert_called_with(2.0) # Contrast enhancement
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentAnalysis:
    """Test document analysis functions"""

    @pytest.mark.asyncio
    @patch('fitz.open')
    @patch('pdfplumber.open')
    async def test_is_scanned_pdf_true(self, mock_pdfplumber, mock_fitz, mock_pdf_path):
        """Test detection of scanned PDF"""
        # Mock pdfplumber for scanned detection
        mock_pdf = MagicMock()
        mock_page = MagicMock()
        mock_page.extract_text.return_value = ""  # No text = scanned
        mock_pdf.pages = [mock_page]
        mock_pdf.__enter__.return_value = mock_pdf
        mock_pdfplumber.return_value = mock_pdf

        # Mock fitz for additional info
        mock_doc = MagicMock()
        mock_doc.__len__.return_value = 1
        mock_doc.__getitem__.return_value.get_text.return_value = ""
        mock_fitz.return_value = mock_doc

        result = await is_scanned_pdf(mock_pdf_path)

        assert result["is_scanned"] is True
        assert result["recommendation"] == "Use OCR tool"

    @pytest.mark.asyncio
    @patch('fitz.open')
    async def test_get_document_structure(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
        """Test document structure extraction"""
        # Font detection iterates the document itself; a MagicMock iterates
        # as empty unless __iter__ is configured, so hand out the pages.
        mock_fitz_doc.__iter__.side_effect = lambda: iter(mock_fitz_doc.pages)
        mock_fitz_open.return_value = mock_fitz_doc

        result = await get_document_structure(mock_pdf_path)

        assert result["metadata"]["title"] == "Test PDF"
        assert result["pages"] == 3
        assert len(result["outline"]) == 2
        assert result["outline"][0]["title"] == "Chapter 1"
        assert len(result["sample_pages"]) == 3
        assert "Arial" in result["fonts"]
        assert "Times" in result["fonts"]

    @pytest.mark.asyncio
    @patch('fitz.open')
    @patch('pypdf.PdfReader')
    async def test_extract_metadata(self, mock_pypdf, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
        """Test comprehensive metadata extraction"""
        mock_fitz_open.return_value = mock_fitz_doc

        # Mock pypdf for additional metadata
        mock_reader = MagicMock()
        mock_reader.metadata = {
            "/CustomField": "Custom Value"
        }
        mock_pypdf.return_value = mock_reader

        # Mock file stats
        with patch('pathlib.Path.stat') as mock_stat:
            mock_stat.return_value = MagicMock(
                # Exactly 1 MiB: size_mb is st_size / (1024*1024) rounded to
                # two decimals, so 1024000 would give 0.98 rather than 1.0.
                st_size=1024 * 1024,
                st_ctime=1704067200,  # 2024-01-01
                st_mtime=1704153600   # 2024-01-02
            )

            result = await extract_metadata(mock_pdf_path)

        assert result["metadata"]["title"] == "Test PDF"
        assert result["file_info"]["size_mb"] == 1.0
        assert result["statistics"]["page_count"] == 3
        assert result["statistics"]["is_encrypted"] is False
        assert result["additional_metadata"]["CustomField"] == "Custom Value"
|
||||||
|
|
||||||
|
|
||||||
|
class TestConversion:
    """Test PDF conversion functions"""

    @pytest.mark.asyncio
    @patch('fitz.open')
    async def test_pdf_to_markdown(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
        """Test PDF to Markdown conversion"""

        def fake_get_text(fmt=""):
            # get_text("blocks") must return block tuples; anything else
            # returns plain page text.
            if fmt == "blocks":
                return [(0, 0, 100, 20, "HEADER TEXT", 0, 0)]
            return "Page 1 content"

        # The conversion reads blocks from every page, so every mocked page
        # needs the block-aware get_text, not just the first one.
        for page_index in range(len(mock_fitz_doc)):
            mock_fitz_doc[page_index].get_text.side_effect = fake_get_text

        mock_fitz_open.return_value = mock_fitz_doc

        result = await pdf_to_markdown(
            pdf_path=mock_pdf_path,
            include_metadata=True
        )

        assert "error" not in result
        assert "# Document Metadata" in result["markdown"]
        assert "Test PDF" in result["markdown"]
        assert "# Table of Contents" in result["markdown"]
        assert "Chapter 1" in result["markdown"]
        assert "### HEADER TEXT" in result["markdown"]
        assert result["pages_converted"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
class TestImageExtraction:
    """Checks for ``extract_images`` using mocked PyMuPDF objects"""

    @pytest.mark.asyncio
    @patch('fitz.open')
    @patch('fitz.Pixmap')
    async def test_extract_images(self, mock_pixmap_class, mock_fitz_open, mock_pdf_path):
        """One 200x200 RGB image on a single page is returned as base64 PNG data"""
        # Single-page document whose page lists one image entry
        image_entry = (1, 0, 100, 100, 8, 'DeviceRGB', '', 'Im1', 'FlateDecode')
        fake_page = MagicMock()
        fake_page.get_images.return_value = [image_entry]
        fake_doc = MagicMock()
        fake_doc.__len__.return_value = 1
        fake_doc.__getitem__.return_value = fake_page
        mock_fitz_open.return_value = fake_doc

        # Pixmap stand-in: 3 components (RGB), no alpha
        raw_bytes = b"fake_image_data"
        fake_pixmap = MagicMock()
        fake_pixmap.width = 200
        fake_pixmap.height = 200
        fake_pixmap.n = 3
        fake_pixmap.alpha = 0
        fake_pixmap.tobytes.return_value = raw_bytes
        mock_pixmap_class.return_value = fake_pixmap

        outcome = await extract_images(
            pdf_path=mock_pdf_path,
            min_width=100,
            min_height=100
        )

        assert outcome["total_images"] == 1
        assert len(outcome["images"]) == 1
        first_image = outcome["images"][0]
        assert first_image["width"] == 200
        assert first_image["height"] == 200
        assert first_image["format"] == "png"
        assert first_image["data"] == base64.b64encode(b"fake_image_data").decode()
|
||||||
|
|
||||||
|
|
||||||
|
class TestServerInitialization:
    """Checks that the server object exists and exposes the expected tools"""

    def test_create_server(self):
        """``create_server`` returns something other than ``None``"""
        assert create_server() is not None

    @pytest.mark.asyncio
    async def test_server_has_all_tools(self, server):
        """Every expected tool name is present among the registered handlers"""
        # NOTE(review): relies on the private ``_tool_handlers`` attribute of
        # the server object - confirm it exists in the MCP library in use.
        registered = [handler.name for handler in server._tool_handlers]

        expected_tools = (
            "extract_text",
            "extract_tables",
            "ocr_pdf",
            "is_scanned_pdf",
            "get_document_structure",
            "extract_metadata",
            "pdf_to_markdown",
            "extract_images",
        )

        for tool in expected_tools:
            assert tool in registered, f"Tool '{tool}' not found in server"
|
||||||
|
|
||||||
|
|
||||||
|
class TestErrorHandling:
    """Checks that failures are reported as ``{"error": ...}`` results"""

    @pytest.mark.asyncio
    async def test_extract_text_invalid_method(self, mock_pdf_path):
        """An unsupported ``method`` value yields an error result"""
        outcome = await extract_text(pdf_path=mock_pdf_path, method="invalid_method")

        assert "error" in outcome
        assert "Unknown extraction method" in outcome["error"]

    @pytest.mark.asyncio
    async def test_extract_text_file_not_found(self):
        """A missing input file yields an error result"""
        missing = "/non/existent/file.pdf"
        outcome = await extract_text(pdf_path=missing)

        assert "error" in outcome
        assert "File not found" in outcome["error"]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
Loading…
x
Reference in New Issue
Block a user