forked from MCP/llm-fusion-mcp
llm-fusion-mcp/test_large_file_analysis.py
Ryan Malloy c335ba0e1e Initial commit: LLM Fusion MCP Server
- Unified access to 4 major LLM providers (Gemini, OpenAI, Anthropic, Grok)
- Real-time streaming support across all providers
- Multimodal capabilities (text, images, audio)
- Intelligent document processing with smart chunking
- Production-ready with health monitoring and error handling
- Full OpenAI ecosystem integration (Assistants, DALL-E, Whisper)
- Vector embeddings and semantic similarity
- Session-based API key management
- Built with FastMCP and modern Python tooling

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-05 05:47:51 -06:00

#!/usr/bin/env python3
"""Test the large file analysis tool."""
import os
import sys

sys.path.insert(0, 'src')

from dotenv import load_dotenv

load_dotenv()

# Import the large file analysis function components
from llm_fusion_mcp.server import (
    _extract_file_content, _estimate_token_count,
    _select_optimal_provider_for_size, _smart_chunk_content,
    get_client, PROVIDER_CONFIG
)
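
# For orientation: this script only assumes that PROVIDER_CONFIG maps each
# provider name to a dict carrying at least an "api_key_env" entry, as in the
# illustrative table below. _EXAMPLE_PROVIDER_CONFIG is a local guess for
# readers of the test, not the server's real table; apart from GOOGLE_API_KEY
# (used later in this file), the env-var names are assumptions.
_EXAMPLE_PROVIDER_CONFIG = {
    "gemini": {"api_key_env": "GOOGLE_API_KEY"},
    "openai": {"api_key_env": "OPENAI_API_KEY"},
    "anthropic": {"api_key_env": "ANTHROPIC_API_KEY"},
    "grok": {"api_key_env": "XAI_API_KEY"},
}
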
def test_file_extraction():
    """Test file content extraction."""
    print("📁 Testing File Content Extraction")
    print("=" * 50)

    # Test markdown file
    if os.path.exists("test_large_document.md"):
        content = _extract_file_content("test_large_document.md")
        if content:
            word_count = len(content.split())
            char_count = len(content)
            print(f"✓ Extracted content: {word_count} words, {char_count} characters")

            # Test token estimation
            estimated_tokens = _estimate_token_count(content)
            print(f"✓ Estimated tokens: {estimated_tokens}")
            return content, estimated_tokens
        else:
            print("✗ Failed to extract content")
            return None, 0
    else:
        print("⚠️ Test document not found")
        return None, 0
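
# _estimate_token_count comes from the server module, and its exact heuristic
# is not shown in this test. The stand-in below is the common "about four
# characters per token" rule of thumb, useful for eyeballing the numbers
# printed above -- a sketch, not the server's implementation.
def _rough_token_estimate(text: str) -> int:
    """Approximate the token count as len(text) / 4 (illustrative heuristic)."""
    return max(1, len(text) // 4)
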
def test_provider_selection():
    """Test optimal provider selection."""
    print("\n🎯 Testing Provider Selection")
    print("=" * 50)

    test_sizes = [1000, 50000, 150000, 500000, 1200000]
    for size in test_sizes:
        provider, model = _select_optimal_provider_for_size(size)
        print(f"Size {size:>8} tokens → {provider:<10} / {model}")
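
# _select_optimal_provider_for_size is imported from the server; the sketch
# below shows one plausible way such a selector could work, using the same
# context-window figures printed in this script's closing summary
# (gemini/openai 1M, anthropic 200K, grok 100K). It returns only a provider
# name and picks the smallest window that fits; the real helper also chooses
# a concrete model, which is not reproduced here.
def _pick_provider_for_size_sketch(token_count: int) -> str:
    """Return the first provider whose context window fits the input (sketch)."""
    windows = {"grok": 100_000, "anthropic": 200_000, "openai": 1_000_000, "gemini": 1_000_000}
    for name, limit in windows.items():
        if token_count <= limit:
            return name
    return "gemini"  # largest window; oversized inputs fall back to chunking
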
def test_chunking_strategies():
    """Test different chunking strategies."""
    print("\n✂️ Testing Chunking Strategies")
    print("=" * 50)

    # Create test content
    test_content = """
# Section 1
This is the first section with some content.
It has multiple paragraphs to test semantic chunking.

# Section 2
This is the second section.
It also has multiple paragraphs.

# Section 3
The third section is here.
With more content for testing.
"""

    strategies = ["auto", "semantic", "fixed", "hierarchical"]
    chunk_size = 100  # Small for testing

    for strategy in strategies:
        chunks = _smart_chunk_content(test_content, strategy, chunk_size)
        print(f"{strategy:<12}: {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:2]):  # Show first 2 chunks
            preview = chunk.replace('\n', ' ')[:50] + "..."
            print(f" Chunk {i+1}: {preview}")
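
# _smart_chunk_content and its "auto"/"semantic"/"fixed"/"hierarchical"
# strategies live in the server module. For reference, the sketch below is a
# bare-bones fixed-size splitter that packs paragraphs greedily -- an
# illustration of the general idea behind a "fixed" strategy, not the server's
# algorithm (which also chunks semantically and hierarchically).
def _fixed_chunks_sketch(text: str, max_chars: int) -> list[str]:
    """Greedily pack paragraphs into chunks of roughly max_chars characters.

    Oversized single paragraphs become their own chunk (sketch behaviour).
    """
    chunks, current = [], ""
    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        if current and len(current) + len(para) + 2 > max_chars:
            chunks.append(current)
            current = para
        else:
            current = f"{current}\n\n{para}" if current else para
    if current:
        chunks.append(current)
    return chunks
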
def test_direct_analysis():
    """Test direct file analysis (without MCP wrapper)."""
    print("\n🔍 Testing Direct Large File Analysis")
    print("=" * 50)

    if not os.getenv("GOOGLE_API_KEY"):
        print("⚠️ Skipping analysis test - no Google API key")
        return

    try:
        # Test with our test document
        if os.path.exists("test_large_document.md"):
            content = _extract_file_content("test_large_document.md")
            tokens = _estimate_token_count(content)
            provider, model = _select_optimal_provider_for_size(tokens)

            print(f"📄 File: test_large_document.md")
            print(f"📊 Tokens: {tokens}")
            print(f"🎯 Selected: {provider} / {model}")

            # Test if it would use direct or chunked approach
            context_limits = {
                "gemini": 1000000, "openai": 1000000,
                "anthropic": 200000, "grok": 100000
            }
            provider_limit = context_limits.get(provider, 100000)
            approach = "direct" if tokens <= provider_limit else "chunked"
            print(f"📋 Approach: {approach}")

            if approach == "direct":
                # Test direct analysis
                client = get_client(provider)
                prompt = "Provide a brief summary of this document's main topics"
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": f"{prompt}\n\n{content}"}]
                )
                analysis = response.choices[0].message.content
                print(f"✓ Analysis completed: {len(analysis)} characters")
                print(f"📝 Summary: {analysis[:200]}...")
            else:
                # Test chunking approach
                chunks = _smart_chunk_content(content, "auto", provider_limit // 2)
                print(f"✓ Would create {len(chunks)} chunks for processing")
    except Exception as e:
        print(f"✗ Analysis test failed: {e}")
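
# The "chunked" branch above only reports how many chunks would be created.
# The sketch below outlines how a chunked, map-reduce style summary could be
# driven with the same OpenAI-compatible client calls used in the direct
# branch -- a rough illustration of the flow, not the server's chunked-analysis
# code.
def _chunked_summary_sketch(client, model: str, chunks: list[str]) -> str:
    """Summarize each chunk, then summarize the combined partial summaries (sketch)."""
    partials = []
    for i, chunk in enumerate(chunks):
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user",
                       "content": f"Summarize part {i + 1} of a larger document:\n\n{chunk}"}],
        )
        partials.append(response.choices[0].message.content)
    combined = "\n\n".join(partials)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user",
                   "content": f"Combine these partial summaries into one overview:\n\n{combined}"}],
    )
    return response.choices[0].message.content
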
def test_file_type_support():
    """Test support for different file types."""
    print("\n📋 Testing File Type Support")
    print("=" * 50)

    # Create test files of different types
    test_files = {
        "test.txt": "This is a plain text file for testing.",
        "test.json": '{"name": "test", "type": "json", "data": [1, 2, 3]}',
        "test.py": "def hello():\n print('Hello, world!')\n return True"
    }

    for filename, content in test_files.items():
        try:
            # Write test file
            with open(filename, 'w') as f:
                f.write(content)

            # Test extraction
            extracted = _extract_file_content(filename)
            if extracted:
                tokens = _estimate_token_count(extracted)
                print(f"{filename:<12}: {tokens} tokens")
            else:
                print(f"{filename:<12}: extraction failed")

            # Clean up
            os.remove(filename)
        except Exception as e:
            print(f"{filename:<12}: {e}")
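
# _extract_file_content is the server's extractor; how it dispatches on file
# type is not shown here. A minimal stand-in covering the plain-text formats
# listed in the closing summary (txt, md, py, json, csv, log) might look like
# the sketch below -- purely illustrative, and deliberately ignoring binary or
# structured formats the server may also handle.
def _extract_text_sketch(path: str) -> str | None:
    """Read a known plain-text file type as UTF-8, or return None (sketch)."""
    plain_text_exts = {".txt", ".md", ".py", ".json", ".csv", ".log"}
    if os.path.splitext(path)[1].lower() not in plain_text_exts:
        return None
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except OSError:
        return None
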
if __name__ == "__main__":
    print("🚀 Large File Analysis Testing")
    print("=" * 70)

    test_file_extraction()
    test_provider_selection()
    test_chunking_strategies()
    test_direct_analysis()
    test_file_type_support()

    print("\n" + "=" * 70)
    print("✅ Large file analysis testing completed!")

    # Show configuration summary
    configured_providers = [
        provider for provider, config in PROVIDER_CONFIG.items()
        if os.getenv(config["api_key_env"])
    ]
    print(f"📊 Summary:")
    print(f" Configured providers: {', '.join(configured_providers)}")
    print(f" Max context windows: gemini(1M), openai(1M), anthropic(200K), grok(100K)")
    print(f" Chunking strategies: auto, semantic, fixed, hierarchical")
    print(f" Supported file types: txt, md, py, json, csv, log")