forked from MCP/llm-fusion-mcp
llm-fusion-mcp/test_large_file_analysis.py
Ryan Malloy c335ba0e1e Initial commit: LLM Fusion MCP Server
- Unified access to 4 major LLM providers (Gemini, OpenAI, Anthropic, Grok)
- Real-time streaming support across all providers
- Multimodal capabilities (text, images, audio)
- Intelligent document processing with smart chunking
- Production-ready with health monitoring and error handling
- Full OpenAI ecosystem integration (Assistants, DALL-E, Whisper)
- Vector embeddings and semantic similarity
- Session-based API key management
- Built with FastMCP and modern Python tooling

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-05 05:47:51 -06:00

#!/usr/bin/env python3
"""Test the large file analysis tool."""
import os
import sys

sys.path.insert(0, 'src')

from dotenv import load_dotenv

load_dotenv()

# Import the large file analysis function components
from llm_fusion_mcp.server import (
    _extract_file_content, _estimate_token_count,
    _select_optimal_provider_for_size, _smart_chunk_content,
    get_client, PROVIDER_CONFIG
)
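
# For orientation: this script only assumes that PROVIDER_CONFIG maps each
# provider name to a dict carrying at least an "api_key_env" entry, as in the
# illustrative table below. _EXAMPLE_PROVIDER_CONFIG is a local guess for
# readers of the test, not the server's real table; apart from GOOGLE_API_KEY
# (used later in this file), the env-var names are assumptions.
_EXAMPLE_PROVIDER_CONFIG = {
    "gemini": {"api_key_env": "GOOGLE_API_KEY"},
    "openai": {"api_key_env": "OPENAI_API_KEY"},
    "anthropic": {"api_key_env": "ANTHROPIC_API_KEY"},
    "grok": {"api_key_env": "XAI_API_KEY"},
}
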
def test_file_extraction():
    """Test file content extraction."""
    print("📁 Testing File Content Extraction")
    print("=" * 50)

    # Test markdown file
    if os.path.exists("test_large_document.md"):
        content = _extract_file_content("test_large_document.md")
        if content:
            word_count = len(content.split())
            char_count = len(content)
            print(f"✓ Extracted content: {word_count} words, {char_count} characters")

            # Test token estimation
            estimated_tokens = _estimate_token_count(content)
            print(f"✓ Estimated tokens: {estimated_tokens}")
            return content, estimated_tokens
        else:
            print("✗ Failed to extract content")
            return None, 0
    else:
        print("⚠️ Test document not found")
        return None, 0
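
# _estimate_token_count comes from the server module, and its exact heuristic
# is not shown in this test. The stand-in below is the common "about four
# characters per token" rule of thumb, useful for eyeballing the numbers
# printed above -- a sketch, not the server's implementation.
def _rough_token_estimate(text: str) -> int:
    """Approximate the token count as len(text) / 4 (illustrative heuristic)."""
    return max(1, len(text) // 4)
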
def test_provider_selection():
    """Test optimal provider selection."""
    print("\n🎯 Testing Provider Selection")
    print("=" * 50)

    test_sizes = [1000, 50000, 150000, 500000, 1200000]
    for size in test_sizes:
        provider, model = _select_optimal_provider_for_size(size)
        print(f"Size {size:>8} tokens → {provider:<10} / {model}")
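
# _select_optimal_provider_for_size is imported from the server; the sketch
# below shows one plausible way such a selector could work, using the same
# context-window figures printed in this script's closing summary
# (gemini/openai 1M, anthropic 200K, grok 100K). It returns only a provider
# name and picks the smallest window that fits; the real helper also chooses
# a concrete model, which is not reproduced here.
def _pick_provider_for_size_sketch(token_count: int) -> str:
    """Return the first provider whose context window fits the input (sketch)."""
    windows = {"grok": 100_000, "anthropic": 200_000, "openai": 1_000_000, "gemini": 1_000_000}
    for name, limit in windows.items():
        if token_count <= limit:
            return name
    return "gemini"  # largest window; oversized inputs fall back to chunking
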
def test_chunking_strategies():
    """Test different chunking strategies."""
    print("\n✂️ Testing Chunking Strategies")
    print("=" * 50)

    # Create test content
    test_content = """
# Section 1
This is the first section with some content.
It has multiple paragraphs to test semantic chunking.

# Section 2
This is the second section.
It also has multiple paragraphs.

# Section 3
The third section is here.
With more content for testing.
"""

    strategies = ["auto", "semantic", "fixed", "hierarchical"]
    chunk_size = 100  # Small for testing

    for strategy in strategies:
        chunks = _smart_chunk_content(test_content, strategy, chunk_size)
        print(f"{strategy:<12}: {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:2]):  # Show first 2 chunks
            preview = chunk.replace('\n', ' ')[:50] + "..."
            print(f" Chunk {i+1}: {preview}")
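
# _smart_chunk_content and its "auto"/"semantic"/"fixed"/"hierarchical"
# strategies live in the server module. For reference, the sketch below is a
# bare-bones fixed-size splitter that packs paragraphs greedily -- an
# illustration of the general idea behind a "fixed" strategy, not the server's
# algorithm (which also chunks semantically and hierarchically).
def _fixed_chunks_sketch(text: str, max_chars: int) -> list[str]:
    """Greedily pack paragraphs into chunks of roughly max_chars characters.

    Oversized single paragraphs become their own chunk (sketch behaviour).
    """
    chunks, current = [], ""
    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        if current and len(current) + len(para) + 2 > max_chars:
            chunks.append(current)
            current = para
        else:
            current = f"{current}\n\n{para}" if current else para
    if current:
        chunks.append(current)
    return chunks
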
def test_direct_analysis():
    """Test direct file analysis (without MCP wrapper)."""
    print("\n🔍 Testing Direct Large File Analysis")
    print("=" * 50)

    if not os.getenv("GOOGLE_API_KEY"):
        print("⚠️ Skipping analysis test - no Google API key")
        return

    try:
        # Test with our test document
        if os.path.exists("test_large_document.md"):
            content = _extract_file_content("test_large_document.md")
            tokens = _estimate_token_count(content)
            provider, model = _select_optimal_provider_for_size(tokens)

            print(f"📄 File: test_large_document.md")
            print(f"📊 Tokens: {tokens}")
            print(f"🎯 Selected: {provider} / {model}")

            # Test if it would use direct or chunked approach
            context_limits = {
                "gemini": 1000000, "openai": 1000000,
                "anthropic": 200000, "grok": 100000
            }
            provider_limit = context_limits.get(provider, 100000)
            approach = "direct" if tokens <= provider_limit else "chunked"
            print(f"📋 Approach: {approach}")

            if approach == "direct":
                # Test direct analysis
                client = get_client(provider)
                prompt = "Provide a brief summary of this document's main topics"
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": f"{prompt}\n\n{content}"}]
                )
                analysis = response.choices[0].message.content
                print(f"✓ Analysis completed: {len(analysis)} characters")
                print(f"📝 Summary: {analysis[:200]}...")
            else:
                # Test chunking approach
                chunks = _smart_chunk_content(content, "auto", provider_limit // 2)
                print(f"✓ Would create {len(chunks)} chunks for processing")
    except Exception as e:
        print(f"✗ Analysis test failed: {e}")
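
# The "chunked" branch above only reports how many chunks would be created.
# The sketch below outlines how a chunked, map-reduce style summary could be
# driven with the same OpenAI-compatible client calls used in the direct
# branch -- a rough illustration of the flow, not the server's chunked-analysis
# code.
def _chunked_summary_sketch(client, model: str, chunks: list[str]) -> str:
    """Summarize each chunk, then summarize the combined partial summaries (sketch)."""
    partials = []
    for i, chunk in enumerate(chunks):
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user",
                       "content": f"Summarize part {i + 1} of a larger document:\n\n{chunk}"}],
        )
        partials.append(response.choices[0].message.content)
    combined = "\n\n".join(partials)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user",
                   "content": f"Combine these partial summaries into one overview:\n\n{combined}"}],
    )
    return response.choices[0].message.content
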
def test_file_type_support():
    """Test support for different file types."""
    print("\n📋 Testing File Type Support")
    print("=" * 50)

    # Create test files of different types
    test_files = {
        "test.txt": "This is a plain text file for testing.",
        "test.json": '{"name": "test", "type": "json", "data": [1, 2, 3]}',
        "test.py": "def hello():\n print('Hello, world!')\n return True"
    }

    for filename, content in test_files.items():
        try:
            # Write test file
            with open(filename, 'w') as f:
                f.write(content)

            # Test extraction
            extracted = _extract_file_content(filename)
            if extracted:
                tokens = _estimate_token_count(extracted)
                print(f"{filename:<12}: {tokens} tokens")
            else:
                print(f"{filename:<12}: extraction failed")

            # Clean up
            os.remove(filename)
        except Exception as e:
            print(f"{filename:<12}: {e}")
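
# _extract_file_content is the server's extractor; how it dispatches on file
# type is not shown here. A minimal stand-in covering the plain-text formats
# listed in the closing summary (txt, md, py, json, csv, log) might look like
# the sketch below -- purely illustrative, and deliberately ignoring binary or
# structured formats the server may also handle.
def _extract_text_sketch(path: str) -> str | None:
    """Read a known plain-text file type as UTF-8, or return None (sketch)."""
    plain_text_exts = {".txt", ".md", ".py", ".json", ".csv", ".log"}
    if os.path.splitext(path)[1].lower() not in plain_text_exts:
        return None
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except OSError:
        return None
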
if __name__ == "__main__":
    print("🚀 Large File Analysis Testing")
    print("=" * 70)

    test_file_extraction()
    test_provider_selection()
    test_chunking_strategies()
    test_direct_analysis()
    test_file_type_support()

    print("\n" + "=" * 70)
    print("✅ Large file analysis testing completed!")

    # Show configuration summary
    configured_providers = [
        provider for provider, config in PROVIDER_CONFIG.items()
        if os.getenv(config["api_key_env"])
    ]
    print(f"📊 Summary:")
    print(f" Configured providers: {', '.join(configured_providers)}")
    print(f" Max context windows: gemini(1M), openai(1M), anthropic(200K), grok(100K)")
    print(f" Chunking strategies: auto, semantic, fixed, hierarchical")
    print(f" Supported file types: txt, md, py, json, csv, log")