#!/usr/bin/env python3
"""Test the large file analysis tool."""

import os
import sys

sys.path.insert(0, 'src')

from dotenv import load_dotenv

load_dotenv()

# Import the large file analysis function components
from llm_fusion_mcp.server import (
    _extract_file_content,
    _estimate_token_count,
    _select_optimal_provider_for_size,
    _smart_chunk_content,
    get_client,
    PROVIDER_CONFIG
)


def test_file_extraction():
    """Test file content extraction."""
    print("šŸ“ Testing File Content Extraction")
    print("=" * 50)

    # Test markdown file
    if os.path.exists("test_large_document.md"):
        content = _extract_file_content("test_large_document.md")
        if content:
            word_count = len(content.split())
            char_count = len(content)
            print(f"āœ“ Extracted content: {word_count} words, {char_count} characters")

            # Test token estimation
            estimated_tokens = _estimate_token_count(content)
            print(f"āœ“ Estimated tokens: {estimated_tokens}")
            return content, estimated_tokens
        else:
            print("āœ— Failed to extract content")
            return None, 0
    else:
        print("āš ļø Test document not found")
        return None, 0


def test_provider_selection():
    """Test optimal provider selection."""
    print("\nšŸŽÆ Testing Provider Selection")
    print("=" * 50)

    test_sizes = [1000, 50000, 150000, 500000, 1200000]
    for size in test_sizes:
        provider, model = _select_optimal_provider_for_size(size)
        print(f"Size {size:>8} tokens → {provider:<10} / {model}")


def test_chunking_strategies():
    """Test different chunking strategies."""
    print("\nāœ‚ļø Testing Chunking Strategies")
    print("=" * 50)

    # Create test content
    test_content = """
# Section 1
This is the first section with some content. It has multiple paragraphs to test semantic chunking.

# Section 2
This is the second section. It also has multiple paragraphs.

# Section 3
The third section is here. With more content for testing.
"""

    strategies = ["auto", "semantic", "fixed", "hierarchical"]
    chunk_size = 100  # Small for testing

    for strategy in strategies:
        chunks = _smart_chunk_content(test_content, strategy, chunk_size)
        print(f"{strategy:<12}: {len(chunks)} chunks")
        for i, chunk in enumerate(chunks[:2]):  # Show first 2 chunks
            preview = chunk.replace('\n', ' ')[:50] + "..."
print(f" Chunk {i+1}: {preview}") def test_direct_analysis(): """Test direct file analysis (without MCP wrapper).""" print("\nšŸ” Testing Direct Large File Analysis") print("=" * 50) if not os.getenv("GOOGLE_API_KEY"): print("āš ļø Skipping analysis test - no Google API key") return try: # Test with our test document if os.path.exists("test_large_document.md"): content = _extract_file_content("test_large_document.md") tokens = _estimate_token_count(content) provider, model = _select_optimal_provider_for_size(tokens) print(f"šŸ“„ File: test_large_document.md") print(f"šŸ“Š Tokens: {tokens}") print(f"šŸŽÆ Selected: {provider} / {model}") # Test if it would use direct or chunked approach context_limits = { "gemini": 1000000, "openai": 1000000, "anthropic": 200000, "grok": 100000 } provider_limit = context_limits.get(provider, 100000) approach = "direct" if tokens <= provider_limit else "chunked" print(f"šŸ“‹ Approach: {approach}") if approach == "direct": # Test direct analysis client = get_client(provider) prompt = "Provide a brief summary of this document's main topics" response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": f"{prompt}\n\n{content}"}] ) analysis = response.choices[0].message.content print(f"āœ“ Analysis completed: {len(analysis)} characters") print(f"šŸ“ Summary: {analysis[:200]}...") else: # Test chunking approach chunks = _smart_chunk_content(content, "auto", provider_limit // 2) print(f"āœ“ Would create {len(chunks)} chunks for processing") except Exception as e: print(f"āœ— Analysis test failed: {e}") def test_file_type_support(): """Test support for different file types.""" print("\nšŸ“‹ Testing File Type Support") print("=" * 50) # Create test files of different types test_files = { "test.txt": "This is a plain text file for testing.", "test.json": '{"name": "test", "type": "json", "data": [1, 2, 3]}', "test.py": "def hello():\n print('Hello, world!')\n return True" } for filename, content in test_files.items(): try: # Write test file with open(filename, 'w') as f: f.write(content) # Test extraction extracted = _extract_file_content(filename) if extracted: tokens = _estimate_token_count(extracted) print(f"āœ“ {filename:<12}: {tokens} tokens") else: print(f"āœ— {filename:<12}: extraction failed") # Clean up os.remove(filename) except Exception as e: print(f"āœ— {filename:<12}: {e}") if __name__ == "__main__": print("šŸš€ Large File Analysis Testing") print("=" * 70) test_file_extraction() test_provider_selection() test_chunking_strategies() test_direct_analysis() test_file_type_support() print("\n" + "=" * 70) print("āœ… Large file analysis testing completed!") # Show configuration summary configured_providers = [ provider for provider, config in PROVIDER_CONFIG.items() if os.getenv(config["api_key_env"]) ] print(f"šŸ“Š Summary:") print(f" Configured providers: {', '.join(configured_providers)}") print(f" Max context windows: gemini(1M), openai(1M), anthropic(200K), grok(100K)") print(f" Chunking strategies: auto, semantic, fixed, hierarchical") print(f" Supported file types: txt, md, py, json, csv, log")