"""Test suite for MCP PDF Tools server""" import pytest import asyncio from unittest.mock import Mock, patch, MagicMock import base64 import pandas as pd from pathlib import Path from mcp_pdf_tools.server import ( create_server, validate_pdf_path, detect_scanned_pdf, extract_text, extract_tables, ocr_pdf, is_scanned_pdf, get_document_structure, extract_metadata, pdf_to_markdown, extract_images ) @pytest.fixture def server(): """Create server instance for testing""" return create_server() @pytest.fixture def mock_pdf_path(tmp_path): """Create a mock PDF file path""" pdf_file = tmp_path / "test.pdf" pdf_file.touch() return str(pdf_file) @pytest.fixture def mock_fitz_doc(): """Create a mock PyMuPDF document""" doc = MagicMock() doc.__len__.return_value = 3 doc.metadata = { "title": "Test PDF", "author": "Test Author", "subject": "Testing", "keywords": "test, pdf", "creator": "Test Creator", "producer": "Test Producer", "creationDate": "2024-01-01", "modDate": "2024-01-02" } doc.is_encrypted = False doc.is_form_pdf = False doc.get_toc.return_value = [(1, "Chapter 1", 1), (2, "Section 1.1", 2)] # Mock pages pages = [] for i in range(3): page = MagicMock() page.get_text.return_value = f"This is page {i+1} text content." page.rect.width = 595 page.rect.height = 842 page.rotation = 0 page.get_images.return_value = [] page.get_links.return_value = [] page.get_annotations.return_value = [] page.get_fonts.return_value = [(0, 0, 0, "Arial"), (0, 0, 0, "Times")] pages.append(page) doc.__getitem__.side_effect = lambda i: pages[i] doc.pages = pages return doc class TestValidation: """Test validation functions""" @pytest.mark.asyncio async def test_validate_pdf_path_valid(self, mock_pdf_path): """Test validation with valid PDF path""" result = await validate_pdf_path(mock_pdf_path) assert result.exists() assert result.suffix == ".pdf" @pytest.mark.asyncio async def test_validate_pdf_path_not_exists(self): """Test validation with non-existent file""" with pytest.raises(ValueError, match="File not found"): await validate_pdf_path("/non/existent/file.pdf") @pytest.mark.asyncio async def test_validate_pdf_path_not_pdf(self, tmp_path): """Test validation with non-PDF file""" txt_file = tmp_path / "test.txt" txt_file.touch() with pytest.raises(ValueError, match="Not a PDF file"): await validate_pdf_path(str(txt_file)) class TestTextExtraction: """Test text extraction functionality""" @pytest.mark.asyncio @patch('fitz.open') async def test_extract_text_success(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path): """Test successful text extraction""" mock_fitz_open.return_value = mock_fitz_doc result = await extract_text( pdf_path=mock_pdf_path, method="pymupdf" ) assert result["text"] == "This is page 1 text content.\n\nThis is page 2 text content.\n\nThis is page 3 text content." assert result["method_used"] == "pymupdf" assert result["metadata"]["pages"] == 3 assert result["metadata"]["title"] == "Test PDF" assert len(result["pages_extracted"]) == 3 @pytest.mark.asyncio @patch('fitz.open') async def test_extract_text_specific_pages(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path): """Test text extraction from specific pages""" mock_fitz_open.return_value = mock_fitz_doc result = await extract_text( pdf_path=mock_pdf_path, pages=[0, 2], method="pymupdf" ) assert "page 1" in result["text"] assert "page 2" not in result["text"] assert "page 3" in result["text"] assert result["pages_extracted"] == [0, 2] class TestTableExtraction: """Test table extraction functionality""" @pytest.mark.asyncio @patch('camelot.read_pdf') async def test_extract_tables_camelot(self, mock_camelot, mock_pdf_path): """Test table extraction with Camelot""" # Mock Camelot tables mock_table = MagicMock() mock_table.df = pd.DataFrame({ 'Column1': ['A', 'B'], 'Column2': ['1', '2'] }) mock_camelot.return_value = [mock_table] result = await extract_tables( pdf_path=mock_pdf_path, method="camelot", output_format="json" ) assert result["total_tables"] == 1 assert result["method_used"] == "camelot" assert len(result["tables"]) == 1 assert result["tables"][0]["shape"]["rows"] == 2 assert result["tables"][0]["shape"]["columns"] == 2 @pytest.mark.asyncio @patch('camelot.read_pdf') @patch('pdfplumber.open') @patch('tabula.read_pdf') async def test_extract_tables_auto_fallback(self, mock_tabula, mock_pdfplumber, mock_camelot, mock_pdf_path): """Test automatic fallback between table extraction methods""" # Camelot fails mock_camelot.side_effect = Exception("Camelot failed") # pdfplumber succeeds mock_pdf = MagicMock() mock_page = MagicMock() mock_page.extract_tables.return_value = [[['Col1', 'Col2'], ['A', '1'], ['B', '2']]] mock_pdf.pages = [mock_page] mock_pdf.__enter__.return_value = mock_pdf mock_pdfplumber.return_value = mock_pdf result = await extract_tables( pdf_path=mock_pdf_path, method="auto" ) assert result["total_tables"] == 1 assert result["method_used"] == "pdfplumber" assert "camelot" in result["methods_tried"] assert "pdfplumber" in result["methods_tried"] preprocess=True ) assert result["preprocessing_applied"] is True mock_image.convert.assert_called_with('L') # Grayscale conversion mock_enhancer.enhance.assert_called_with(2.0) # Contrast enhancement class TestDocumentAnalysis: """Test document analysis functions""" @pytest.mark.asyncio @patch('fitz.open') @patch('pdfplumber.open') async def test_is_scanned_pdf_true(self, mock_pdfplumber, mock_fitz, mock_pdf_path): """Test detection of scanned PDF""" # Mock pdfplumber for scanned detection mock_pdf = MagicMock() mock_page = MagicMock() mock_page.extract_text.return_value = "" # No text = scanned mock_pdf.pages = [mock_page] mock_pdf.__enter__.return_value = mock_pdf mock_pdfplumber.return_value = mock_pdf # Mock fitz for additional info mock_doc = MagicMock() mock_doc.__len__.return_value = 1 mock_doc.__getitem__.return_value.get_text.return_value = "" mock_fitz.return_value = mock_doc result = await is_scanned_pdf(mock_pdf_path) assert result["is_scanned"] is True assert result["recommendation"] == "Use OCR tool" @pytest.mark.asyncio @patch('fitz.open') async def test_get_document_structure(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path): """Test document structure extraction""" mock_fitz_open.return_value = mock_fitz_doc result = await get_document_structure(mock_pdf_path) assert result["metadata"]["title"] == "Test PDF" assert result["pages"] == 3 assert len(result["outline"]) == 2 assert result["outline"][0]["title"] == "Chapter 1" assert len(result["sample_pages"]) == 3 assert "Arial" in result["fonts"] assert "Times" in result["fonts"] @pytest.mark.asyncio @patch('fitz.open') @patch('pypdf.PdfReader') async def test_extract_metadata(self, mock_pypdf, mock_fitz_open, mock_fitz_doc, mock_pdf_path): """Test comprehensive metadata extraction""" mock_fitz_open.return_value = mock_fitz_doc # Mock pypdf for additional metadata mock_reader = MagicMock() mock_reader.metadata = { "/CustomField": "Custom Value" } mock_pypdf.return_value = mock_reader # Mock file stats with patch('pathlib.Path.stat') as mock_stat: mock_stat.return_value = MagicMock( st_size=1024000, # 1MB st_ctime=1704067200, # 2024-01-01 st_mtime=1704153600 # 2024-01-02 ) result = await extract_metadata(mock_pdf_path) assert result["metadata"]["title"] == "Test PDF" assert result["file_info"]["size_mb"] == 1.0 assert result["statistics"]["page_count"] == 3 assert result["statistics"]["is_encrypted"] is False assert result["additional_metadata"]["CustomField"] == "Custom Value" class TestConversion: """Test PDF conversion functions""" @pytest.mark.asyncio @patch('fitz.open') async def test_pdf_to_markdown(self, mock_fitz_open, mock_fitz_doc, mock_pdf_path): """Test PDF to Markdown conversion""" # Enhance mock for text blocks mock_page = mock_fitz_doc[0] mock_page.get_text.return_value = "Page 1 content" mock_page.get_text.side_effect = lambda fmt="": { "blocks": [(0, 0, 100, 20, "HEADER TEXT", 0, 0)], "": "Page 1 content" }.get(fmt, "Page 1 content") mock_fitz_open.return_value = mock_fitz_doc result = await pdf_to_markdown( pdf_path=mock_pdf_path, include_metadata=True ) assert "# Document Metadata" in result["markdown"] assert "Test PDF" in result["markdown"] assert "# Table of Contents" in result["markdown"] assert "Chapter 1" in result["markdown"] assert result["pages_converted"] == 3 class TestImageExtraction: """Test image extraction functionality""" @pytest.mark.asyncio @patch('fitz.open') @patch('fitz.Pixmap') async def test_extract_images(self, mock_pixmap_class, mock_fitz_open, mock_pdf_path): """Test image extraction from PDF""" # Mock document mock_doc = MagicMock() mock_page = MagicMock() mock_page.get_images.return_value = [(1, 0, 100, 100, 8, 'DeviceRGB', '', 'Im1', 'FlateDecode')] mock_doc.__len__.return_value = 1 mock_doc.__getitem__.return_value = mock_page mock_fitz_open.return_value = mock_doc # Mock pixmap mock_pixmap = MagicMock() mock_pixmap.width = 200 mock_pixmap.height = 200 mock_pixmap.n = 3 # RGB mock_pixmap.alpha = 0 mock_pixmap.tobytes.return_value = b"fake_image_data" mock_pixmap_class.return_value = mock_pixmap result = await extract_images( pdf_path=mock_pdf_path, min_width=100, min_height=100 ) assert result["total_images"] == 1 assert len(result["images"]) == 1 assert result["images"][0]["width"] == 200 assert result["images"][0]["height"] == 200 assert result["images"][0]["format"] == "png" assert result["images"][0]["data"] == base64.b64encode(b"fake_image_data").decode() class TestServerInitialization: """Test server initialization and configuration""" def test_create_server(self): """Test server creation""" server = create_server() assert server is not None @pytest.mark.asyncio async def test_server_has_all_tools(self, server): """Test that all expected tools are registered""" # Get all registered tools tools = [] for handler in server._tool_handlers: tools.append(handler.name) expected_tools = [ "extract_text", "extract_tables", "ocr_pdf", "is_scanned_pdf", "get_document_structure", "extract_metadata", "pdf_to_markdown", "extract_images" ] for tool in expected_tools: assert tool in tools, f"Tool '{tool}' not found in server" class TestErrorHandling: """Test error handling in various scenarios""" @pytest.mark.asyncio async def test_extract_text_invalid_method(self, mock_pdf_path): """Test error handling for invalid extraction method""" result = await extract_text( pdf_path=mock_pdf_path, method="invalid_method" ) assert "error" in result assert "Unknown extraction method" in result["error"] @pytest.mark.asyncio async def test_extract_text_file_not_found(self): """Test error handling for non-existent file""" result = await extract_text( pdf_path="/non/existent/file.pdf" ) assert "error" in result assert "File not found" in result["error"] if __name__ == "__main__": pytest.main([__file__, "-v"])