# mcp-pdf-tools/examples/test_pdf_tools.py

"""
Example usage of MCP PDF Tools server

This script demonstrates how to test the PDF tools locally.
"""

import asyncio
import sys
import json
from pathlib import Path

# Add the src directory to the path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from mcp_pdf_tools.server import create_server


async def call_tool(mcp, tool_name: str, **kwargs):
    """Look up a registered tool by name and invoke its underlying function.

    Args:
        mcp: Server object exposing an awaitable ``get_tools()`` that returns
            a mapping of tool name to tool object.
        tool_name: Key of the tool to invoke.
        **kwargs: Keyword arguments forwarded unchanged to the tool function.

    Returns:
        Whatever the tool function returns (awaited first when it is
        awaitable).

    Raises:
        ValueError: If ``tool_name`` is not among the registered tools.
    """
    import inspect

    tools = await mcp.get_tools()
    if tool_name not in tools:
        raise ValueError(f"Tool '{tool_name}' not found")

    # Call the tool's function directly using the fn attribute, bypassing
    # the server's own dispatch.
    result = tools[tool_name].fn(**kwargs)
    # A tool may be a plain function or a coroutine function; only await
    # when there is actually something to await, so both kinds work.
    if inspect.isawaitable(result):
        result = await result
    return result


async def test_pdf_tools(pdf_path: str):
    """Run each PDF tool against ``pdf_path`` and print a short report.

    The tools are looked up on a server built by ``create_server()`` and
    invoked through ``call_tool``. Each result is read as a mapping; a
    result containing an ``"error"`` key is reported as an error instead of
    being summarised. Page-scoped tools are called with ``pages=[0]``.

    Args:
        pdf_path: Path of the PDF file handed to every tool.
    """

    # Create the MCP server
    mcp = create_server()

    print(f"\n{'='*60}")
    print(f"Testing PDF Tools on: {pdf_path}")
    print(f"{'='*60}\n")

    # 1. Check if PDF is scanned
    print("1. Checking if PDF is scanned...")
    scan_result = await call_tool(mcp, "is_scanned_pdf", pdf_path=pdf_path)
    # Handle a failed scan check the same way as every other step, rather
    # than printing "Unknown" and later claiming the PDF is scanned.
    scan_failed = "error" in scan_result
    if not scan_failed:
        print(f"   Is scanned: {scan_result.get('is_scanned', 'Unknown')}")
        print(f"   Recommendation: {scan_result.get('recommendation', 'N/A')}")
    else:
        print(f"   Error: {scan_result['error']}")

    # 2. Extract metadata
    print("\n2. Extracting metadata...")
    metadata_result = await call_tool(mcp, "extract_metadata", pdf_path=pdf_path)
    if "error" not in metadata_result:
        print(f"   Title: {metadata_result['metadata'].get('title', 'N/A')}")
        print(f"   Author: {metadata_result['metadata'].get('author', 'N/A')}")
        print(f"   Pages: {metadata_result['statistics'].get('page_count', 'N/A')}")
        print(f"   File size: {metadata_result['file_info'].get('size_mb', 'N/A')} MB")
    else:
        print(f"   Error: {metadata_result['error']}")

    # 3. Get document structure
    print("\n3. Getting document structure...")
    structure_result = await call_tool(mcp, "get_document_structure", pdf_path=pdf_path)
    if "error" not in structure_result:
        print(f"   Outline items: {len(structure_result.get('outline', []))}")
        fonts = structure_result.get('fonts', [])
        if fonts:
            # Only the first three fonts are shown to keep the line short.
            print(f"   Fonts used: {', '.join(fonts[:3])}...")
    else:
        print(f"   Error: {structure_result['error']}")

    # 4. Extract text (if not scanned)
    # A missing 'is_scanned' value is treated as "scanned", so extraction
    # only runs when the scan check positively reported a text PDF.
    if not scan_result.get('is_scanned', True):
        print("\n4. Extracting text...")
        text_result = await call_tool(mcp, "extract_text",
                                     pdf_path=pdf_path,
                                     pages=[0])  # First page only
        if "error" not in text_result:
            text_preview = text_result['text'][:200].replace('\n', ' ')
            print(f"   Method used: {text_result['method_used']}")
            print(f"   Text preview: {text_preview}...")
        else:
            print(f"   Error: {text_result['error']}")
    elif scan_failed or 'is_scanned' not in scan_result:
        # The scan status was never determined; do not claim it is scanned.
        print("\n4. Skipping text extraction (scan check did not return a result)")
    else:
        print("\n4. Skipping text extraction (PDF is scanned)")

    # 5. Extract tables
    print("\n5. Extracting tables...")
    table_result = await call_tool(mcp, "extract_tables",
                                  pdf_path=pdf_path,
                                  pages=[0])  # First page only
    if "error" not in table_result:
        print(f"   Tables found: {table_result['total_tables']}")
        print(f"   Method used: {table_result['method_used']}")
        if table_result['total_tables'] > 0:
            first_table = table_result['tables'][0]
            print(f"   First table shape: {first_table['shape']['rows']}x{first_table['shape']['columns']}")
    else:
        print(f"   Error: {table_result['error']}")

    # 6. Convert to Markdown (first page)
    print("\n6. Converting to Markdown...")
    markdown_result = await call_tool(mcp, "pdf_to_markdown",
                                     pdf_path=pdf_path,
                                     pages=[0],
                                     include_images=False)
    if "error" not in markdown_result:
        md_preview = markdown_result['markdown'][:200].replace('\n', ' ')
        print(f"   Markdown preview: {md_preview}...")
    else:
        print(f"   Error: {markdown_result['error']}")

    # 7. Extract images
    print("\n7. Extracting images...")
    images_result = await call_tool(mcp, "extract_images",
                                   pdf_path=pdf_path,
                                   pages=[0])
    if "error" not in images_result:
        print(f"   Images found: {images_result['total_images']}")
        if images_result['total_images'] > 0:
            first_image = images_result['images'][0]
            print(f"   First image size: {first_image['width']}x{first_image['height']}")
    else:
        print(f"   Error: {images_result['error']}")

    print(f"\n{'='*60}")
    print("Testing complete!")
    print(f"{'='*60}\n")


async def main():
    """Validate the command-line argument and run the PDF tool checks.

    Reads the PDF path from ``sys.argv[1]`` and passes it to
    ``test_pdf_tools``. Exits with status 1 (via ``sys.exit``) when the
    argument is missing, when the path is not an existing regular file,
    when its name does not end in ``.pdf`` (case-insensitive), or when the
    checks raise an exception (the traceback is printed first).
    """
    if len(sys.argv) < 2:
        print("Usage: python test_pdf_tools.py <path_to_pdf>")
        print("\nExample:")
        print("  python test_pdf_tools.py /path/to/document.pdf")
        sys.exit(1)

    pdf_path = sys.argv[1]

    # Check that the path is an existing regular file; a directory (even one
    # named "something.pdf") must not be handed to the tools.
    if not Path(pdf_path).is_file():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    # Check if it's a PDF
    if not pdf_path.lower().endswith('.pdf'):
        print(f"Error: File must be a PDF: {pdf_path}")
        sys.exit(1)

    try:
        await test_pdf_tools(pdf_path)
    except Exception as e:
        # Top-level boundary of the script: report the failure with a full
        # traceback and signal it through the exit status.
        print(f"\nError during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


# Run the async entry point only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())