#!/usr/bin/env python3
"""Test the large file analysis tool.

Run this script directly from the repository root so that the src/ path
insertion below can locate llm_fusion_mcp.
"""

import os
import sys

sys.path.insert(0, 'src')

from dotenv import load_dotenv
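
# load_dotenv() reads provider API keys (e.g. GOOGLE_API_KEY) from a local
# .env file; tests that need a live provider skip themselves when keys are absent.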
load_dotenv()

# Import the large file analysis function components
from llm_fusion_mcp.server import (
    _extract_file_content, _estimate_token_count,
    _select_optimal_provider_for_size, _smart_chunk_content,
    get_client, PROVIDER_CONFIG
)
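# NOTE: the underscore-prefixed names are private helpers of
# llm_fusion_mcp.server, imported directly here for white-box testing;
# their signatures may change without notice.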


def test_file_extraction():
    """Test file content extraction."""
    print("📁 Testing File Content Extraction")
    print("=" * 50)

    # Test markdown file
    if os.path.exists("test_large_document.md"):
        content = _extract_file_content("test_large_document.md")

        if content:
            word_count = len(content.split())
            char_count = len(content)
            print(f"✓ Extracted content: {word_count} words, {char_count} characters")

            # Test token estimation (a heuristic estimate, not exact tokenizer output)
            estimated_tokens = _estimate_token_count(content)
            print(f"✓ Estimated tokens: {estimated_tokens}")

            return content, estimated_tokens
        else:
            print("✗ Failed to extract content")
            return None, 0
    else:
        print("⚠️ Test document not found")
        return None, 0


def test_provider_selection():
    """Test optimal provider selection."""
    print("\n🎯 Testing Provider Selection")
    print("=" * 50)

    test_sizes = [1000, 50000, 150000, 500000, 1200000]

    for size in test_sizes:
        provider, model = _select_optimal_provider_for_size(size)
        print(f"Size {size:>8} tokens → {provider:<10} / {model}")
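
# Expected routing (an assumption, based on the context limits hardcoded in
# test_direct_analysis below): inputs up to ~100K tokens fit any provider,
# while anything past ~200K should land on gemini or openai, whose windows
# are treated as 1M tokens in this script.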


def test_chunking_strategies():
    """Test different chunking strategies."""
    print("\n✂️ Testing Chunking Strategies")
    print("=" * 50)

    # Create test content
    test_content = """
# Section 1
This is the first section with some content.

It has multiple paragraphs to test semantic chunking.

# Section 2
This is the second section.

It also has multiple paragraphs.

# Section 3
The third section is here.

With more content for testing.
"""

    strategies = ["auto", "semantic", "fixed", "hierarchical"]
    chunk_size = 100  # Small for testing

    for strategy in strategies:
        chunks = _smart_chunk_content(test_content, strategy, chunk_size)
        print(f"{strategy:<12}: {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:2]):  # Show first 2 chunks
            preview = chunk.replace('\n', ' ')[:50] + "..."
            print(f"  Chunk {i+1}: {preview}")
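
# Assumed semantics for the strategies above (not verified against the
# server implementation): "fixed" cuts uniform windows, "semantic" prefers
# paragraph boundaries, "hierarchical" splits on headings before paragraphs,
# and "auto" chooses among them based on the content.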


def test_direct_analysis():
    """Test direct file analysis (without MCP wrapper)."""
    print("\n🔍 Testing Direct Large File Analysis")
    print("=" * 50)

    if not os.getenv("GOOGLE_API_KEY"):
        print("⚠️ Skipping analysis test - no Google API key")
        return

    try:
        # Test with our test document
        if os.path.exists("test_large_document.md"):
            content = _extract_file_content("test_large_document.md")
            tokens = _estimate_token_count(content)
            provider, model = _select_optimal_provider_for_size(tokens)

            print("📄 File: test_large_document.md")
            print(f"📊 Tokens: {tokens}")
            print(f"🎯 Selected: {provider} / {model}")

            # Test if it would use direct or chunked approach
            context_limits = {
                "gemini": 1000000, "openai": 1000000,
                "anthropic": 200000, "grok": 100000
            }
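
            # These limits are rough, hardcoded routing figures for this
            # test (they mirror the summary printed at the end of the
            # script), not authoritative per-model context windows.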

            provider_limit = context_limits.get(provider, 100000)
            approach = "direct" if tokens <= provider_limit else "chunked"
            print(f"📋 Approach: {approach}")

            if approach == "direct":
                # Test direct analysis
                client = get_client(provider)
                prompt = "Provide a brief summary of this document's main topics"

                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": f"{prompt}\n\n{content}"}]
                )

                analysis = response.choices[0].message.content
                print(f"✓ Analysis completed: {len(analysis)} characters")
                print(f"📝 Summary: {analysis[:200]}...")
            else:
                # Test chunking approach; halving the limit presumably leaves
                # headroom for the prompt and the model's response
                chunks = _smart_chunk_content(content, "auto", provider_limit // 2)
                print(f"✓ Would create {len(chunks)} chunks for processing")

    except Exception as e:
        print(f"✗ Analysis test failed: {e}")


def test_file_type_support():
    """Test support for different file types."""
    print("\n📋 Testing File Type Support")
    print("=" * 50)

    # Create test files of different types
    test_files = {
        "test.txt": "This is a plain text file for testing.",
        "test.json": '{"name": "test", "type": "json", "data": [1, 2, 3]}',
        "test.py": "def hello():\n    print('Hello, world!')\n    return True"
    }

    for filename, content in test_files.items():
        try:
            # Write test file
            with open(filename, 'w') as f:
                f.write(content)

            # Test extraction
            extracted = _extract_file_content(filename)
            if extracted:
                tokens = _estimate_token_count(extracted)
                print(f"✓ {filename:<12}: {tokens} tokens")
            else:
                print(f"✗ {filename:<12}: extraction failed")

            # Clean up
            os.remove(filename)

        except Exception as e:
            print(f"✗ {filename:<12}: {e}")


if __name__ == "__main__":
    print("🚀 Large File Analysis Testing")
    print("=" * 70)

    test_file_extraction()
    test_provider_selection()
    test_chunking_strategies()
    test_direct_analysis()
    test_file_type_support()

    print("\n" + "=" * 70)
    print("✅ Large file analysis testing completed!")

    # Show configuration summary
    configured_providers = [
        provider for provider, config in PROVIDER_CONFIG.items()
        if os.getenv(config["api_key_env"])
    ]

    print("📊 Summary:")
    print(f"   Configured providers: {', '.join(configured_providers)}")
    print("   Max context windows: gemini(1M), openai(1M), anthropic(200K), grok(100K)")
    print("   Chunking strategies: auto, semantic, fixed, hierarchical")
    print("   Supported file types: txt, md, py, json, csv, log")