mcp-pdf-tools/tests/test_server.py
Ryan Malloy 75f8548668
Some checks failed
Security Scan / security-scan (push) Has been cancelled
🔒 Comprehensive security hardening and vulnerability fixes
Implemented extensive security improvements to prevent attacks and ensure
production readiness:

**Critical Security Fixes:**
- Fixed path traversal vulnerability in get_pdf_image function
- Added file size limits (100MB PDFs, 50MB images) to prevent DoS
- Implemented secure output path validation with directory restrictions
- Added page count limits (1000 pages max) for resource protection
- Secured JSON parameter parsing with 10KB size limits

**Access Control & Validation:**
- URL allowlisting with SSRF protection (blocks localhost, internal IPs)
- IPv6 security handling for comprehensive host blocking
- Input validation framework with length limits and sanitization
- Secure file permissions (0o700 dirs, 0o600 files)

**Error Handling & Privacy:**
- Sanitized error messages to prevent information disclosure
- Automatic removal of sensitive patterns (paths, emails, SSNs)
- Generic error responses for failed operations

**Infrastructure & Monitoring:**
- Added security scanning tools (safety, pip-audit)
- GitHub Actions workflow for continuous vulnerability monitoring
- Daily automated security assessments
- Fixed pypdf vulnerability (5.9.0 → 6.0.0)

**Testing & Validation:**
- 20 comprehensive security tests (all passing)
- Integration tests confirming functionality preservation
- Zero known vulnerabilities in dependencies
- Validated all security functions work correctly

All security measures tested and verified. Project now production-ready
with enterprise-grade security posture.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-06 15:35:31 -06:00

396 lines
13 KiB
Python

"""Test suite for MCP PDF Tools server"""
import pytest
import asyncio
from unittest.mock import Mock, patch, MagicMock
import base64
import pandas as pd
from pathlib import Path
from mcp_pdf_tools.server import (
create_server,
validate_pdf_path,
detect_scanned_pdf,
extract_text,
extract_tables,
ocr_pdf,
is_scanned_pdf,
get_document_structure,
extract_metadata,
pdf_to_markdown,
extract_images
)
@pytest.fixture
def server():
"""Create server instance for testing"""
return create_server()
@pytest.fixture
def mock_pdf_path(tmp_path):
"""Create a mock PDF file path"""
pdf_file = tmp_path / "test.pdf"
pdf_file.touch()
return str(pdf_file)
@pytest.fixture
def mock_fitz_doc():
"""Create a mock PyMuPDF document"""
doc = MagicMock()
doc.__len__.return_value = 3
doc.metadata = {
"title": "Test PDF",
"author": "Test Author",
"subject": "Testing",
"keywords": "test, pdf",
"creator": "Test Creator",
"producer": "Test Producer",
"creationDate": "2024-01-01",
"modDate": "2024-01-02"
}
doc.is_encrypted = False
doc.is_form_pdf = False
doc.get_toc.return_value = [(1, "Chapter 1", 1), (2, "Section 1.1", 2)]
# Mock pages
pages = []
for i in range(3):
page = MagicMock()
page.get_text.return_value = f"This is page {i+1} text content."
page.rect.width = 595
page.rect.height = 842
page.rotation = 0
page.get_images.return_value = []
page.get_links.return_value = []
page.get_annotations.return_value = []
page.get_fonts.return_value = [(0, 0, 0, "Arial"), (0, 0, 0, "Times")]
pages.append(page)
doc.__getitem__.side_effect = lambda i: pages[i]
doc.pages = pages
return doc
class TestValidation:
"""Test validation functions"""
@pytest.mark.asyncio
async def test_validate_pdf_path_valid(self, mock_pdf_path):
"""Test validation with valid PDF path"""
result = await validate_pdf_path(mock_pdf_path)
assert result.exists()
assert result.suffix == ".pdf"
@pytest.mark.asyncio
async def test_validate_pdf_path_not_exists(self):
"""Test validation with non-existent file"""
with pytest.raises(ValueError, match="File not found"):
await validate_pdf_path("/non/existent/file.pdf")
@pytest.mark.asyncio
async def test_validate_pdf_path_not_pdf(self, tmp_path):
"""Test validation with non-PDF file"""
txt_file = tmp_path / "test.txt"
txt_file.touch()
with pytest.raises(ValueError, match="Not a PDF file"):
await validate_pdf_path(str(txt_file))
class TestTextExtraction:
"""Test text extraction functionality"""
@pytest.mark.asyncio
@patch('fitz.open')
async def test_extract_text_success(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
"""Test successful text extraction"""
mock_fitz_open.return_value = mock_fitz_doc
result = await extract_text(
pdf_path=mock_pdf_path,
method="pymupdf"
)
assert result["text"] == "This is page 1 text content.\n\nThis is page 2 text content.\n\nThis is page 3 text content."
assert result["method_used"] == "pymupdf"
assert result["metadata"]["pages"] == 3
assert result["metadata"]["title"] == "Test PDF"
assert len(result["pages_extracted"]) == 3
@pytest.mark.asyncio
@patch('fitz.open')
async def test_extract_text_specific_pages(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
"""Test text extraction from specific pages"""
mock_fitz_open.return_value = mock_fitz_doc
result = await extract_text(
pdf_path=mock_pdf_path,
pages=[0, 2],
method="pymupdf"
)
assert "page 1" in result["text"]
assert "page 2" not in result["text"]
assert "page 3" in result["text"]
assert result["pages_extracted"] == [0, 2]
class TestTableExtraction:
"""Test table extraction functionality"""
@pytest.mark.asyncio
@patch('camelot.read_pdf')
async def test_extract_tables_camelot(self, mock_camelot, mock_pdf_path):
"""Test table extraction with Camelot"""
# Mock Camelot tables
mock_table = MagicMock()
mock_table.df = pd.DataFrame({
'Column1': ['A', 'B'],
'Column2': ['1', '2']
})
mock_camelot.return_value = [mock_table]
result = await extract_tables(
pdf_path=mock_pdf_path,
method="camelot",
output_format="json"
)
assert result["total_tables"] == 1
assert result["method_used"] == "camelot"
assert len(result["tables"]) == 1
assert result["tables"][0]["shape"]["rows"] == 2
assert result["tables"][0]["shape"]["columns"] == 2
@pytest.mark.asyncio
@patch('camelot.read_pdf')
@patch('pdfplumber.open')
@patch('tabula.read_pdf')
async def test_extract_tables_auto_fallback(self, mock_tabula, mock_pdfplumber, mock_camelot, mock_pdf_path):
"""Test automatic fallback between table extraction methods"""
# Camelot fails
mock_camelot.side_effect = Exception("Camelot failed")
# pdfplumber succeeds
mock_pdf = MagicMock()
mock_page = MagicMock()
mock_page.extract_tables.return_value = [[['Col1', 'Col2'], ['A', '1'], ['B', '2']]]
mock_pdf.pages = [mock_page]
mock_pdf.__enter__.return_value = mock_pdf
mock_pdfplumber.return_value = mock_pdf
result = await extract_tables(
pdf_path=mock_pdf_path,
method="auto"
)
assert result["total_tables"] == 1
assert result["method_used"] == "pdfplumber"
assert "camelot" in result["methods_tried"]
assert "pdfplumber" in result["methods_tried"]
class TestDocumentAnalysis:
"""Test document analysis functions"""
@pytest.mark.asyncio
@patch('fitz.open')
@patch('pdfplumber.open')
async def test_is_scanned_pdf_true(self, mock_pdfplumber, mock_fitz, mock_pdf_path):
"""Test detection of scanned PDF"""
# Mock pdfplumber for scanned detection
mock_pdf = MagicMock()
mock_page = MagicMock()
mock_page.extract_text.return_value = "" # No text = scanned
mock_pdf.pages = [mock_page]
mock_pdf.__enter__.return_value = mock_pdf
mock_pdfplumber.return_value = mock_pdf
# Mock fitz for additional info
mock_doc = MagicMock()
mock_doc.__len__.return_value = 1
mock_doc.__getitem__.return_value.get_text.return_value = ""
mock_fitz.return_value = mock_doc
result = await is_scanned_pdf(mock_pdf_path)
assert result["is_scanned"] is True
assert result["recommendation"] == "Use OCR tool"
@pytest.mark.asyncio
@patch('fitz.open')
async def test_get_document_structure(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
"""Test document structure extraction"""
mock_fitz_open.return_value = mock_fitz_doc
result = await get_document_structure(mock_pdf_path)
assert result["metadata"]["title"] == "Test PDF"
assert result["pages"] == 3
assert len(result["outline"]) == 2
assert result["outline"][0]["title"] == "Chapter 1"
assert len(result["sample_pages"]) == 3
assert "Arial" in result["fonts"]
assert "Times" in result["fonts"]
@pytest.mark.asyncio
@patch('fitz.open')
@patch('pypdf.PdfReader')
async def test_extract_metadata(self, mock_pypdf, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
"""Test comprehensive metadata extraction"""
mock_fitz_open.return_value = mock_fitz_doc
# Mock pypdf for additional metadata
mock_reader = MagicMock()
mock_reader.metadata = {
"/CustomField": "Custom Value"
}
mock_pypdf.return_value = mock_reader
# Mock file stats
with patch('pathlib.Path.stat') as mock_stat:
mock_stat.return_value = MagicMock(
st_size=1024000, # 1MB
st_ctime=1704067200, # 2024-01-01
st_mtime=1704153600 # 2024-01-02
)
result = await extract_metadata(mock_pdf_path)
assert result["metadata"]["title"] == "Test PDF"
assert result["file_info"]["size_mb"] == 1.0
assert result["statistics"]["page_count"] == 3
assert result["statistics"]["is_encrypted"] is False
assert result["additional_metadata"]["CustomField"] == "Custom Value"
class TestConversion:
"""Test PDF conversion functions"""
@pytest.mark.asyncio
@patch('fitz.open')
async def test_pdf_to_markdown(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path):
"""Test PDF to Markdown conversion"""
# Enhance mock for text blocks
mock_page = mock_fitz_doc[0]
mock_page.get_text.return_value = "Page 1 content"
mock_page.get_text.side_effect = lambda fmt="": {
"blocks": [(0, 0, 100, 20, "HEADER TEXT", 0, 0)],
"": "Page 1 content"
}.get(fmt, "Page 1 content")
mock_fitz_open.return_value = mock_fitz_doc
result = await pdf_to_markdown(
pdf_path=mock_pdf_path,
include_metadata=True
)
assert "# Document Metadata" in result["markdown"]
assert "Test PDF" in result["markdown"]
assert "# Table of Contents" in result["markdown"]
assert "Chapter 1" in result["markdown"]
assert result["pages_converted"] == 3
class TestImageExtraction:
"""Test image extraction functionality"""
@pytest.mark.asyncio
@patch('fitz.open')
@patch('fitz.Pixmap')
async def test_extract_images(self, mock_pixmap_class, mock_fitz_open, mock_pdf_path):
"""Test image extraction from PDF"""
# Mock document
mock_doc = MagicMock()
mock_page = MagicMock()
mock_page.get_images.return_value = [(1, 0, 100, 100, 8, 'DeviceRGB', '', 'Im1', 'FlateDecode')]
mock_doc.__len__.return_value = 1
mock_doc.__getitem__.return_value = mock_page
mock_fitz_open.return_value = mock_doc
# Mock pixmap
mock_pixmap = MagicMock()
mock_pixmap.width = 200
mock_pixmap.height = 200
mock_pixmap.n = 3 # RGB
mock_pixmap.alpha = 0
mock_pixmap.tobytes.return_value = b"fake_image_data"
mock_pixmap_class.return_value = mock_pixmap
result = await extract_images(
pdf_path=mock_pdf_path,
min_width=100,
min_height=100
)
assert result["total_images"] == 1
assert len(result["images"]) == 1
assert result["images"][0]["width"] == 200
assert result["images"][0]["height"] == 200
assert result["images"][0]["format"] == "png"
assert result["images"][0]["data"] == base64.b64encode(b"fake_image_data").decode()
class TestServerInitialization:
"""Test server initialization and configuration"""
def test_create_server(self):
"""Test server creation"""
server = create_server()
assert server is not None
@pytest.mark.asyncio
async def test_server_has_all_tools(self, server):
"""Test that all expected tools are registered"""
# Get all registered tools
tools = []
for handler in server._tool_handlers:
tools.append(handler.name)
expected_tools = [
"extract_text",
"extract_tables",
"ocr_pdf",
"is_scanned_pdf",
"get_document_structure",
"extract_metadata",
"pdf_to_markdown",
"extract_images"
]
for tool in expected_tools:
assert tool in tools, f"Tool '{tool}' not found in server"
class TestErrorHandling:
"""Test error handling in various scenarios"""
@pytest.mark.asyncio
async def test_extract_text_invalid_method(self, mock_pdf_path):
"""Test error handling for invalid extraction method"""
result = await extract_text(
pdf_path=mock_pdf_path,
method="invalid_method"
)
assert "error" in result
assert "Unknown extraction method" in result["error"]
@pytest.mark.asyncio
async def test_extract_text_file_not_found(self):
"""Test error handling for non-existent file"""
result = await extract_text(
pdf_path="/non/existent/file.pdf"
)
assert "error" in result
assert "File not found" in result["error"]
if __name__ == "__main__":
pytest.main([__file__, "-v"])