llm-fusion-mcp/test_performance_comparison.py
Ryan Malloy 80f1ecbf7d
🚀 Phase 2 Complete: Universal MCP Tool Orchestrator
Revolutionary architecture that bridges remote LLMs with the entire MCP ecosystem!

## 🌟 Key Features Added:
- Real MCP protocol implementation (STDIO + HTTP servers)
- Hybrid LLM provider system (OpenAI-compatible + Native APIs)
- Unified YAML configuration with environment variable substitution (see the loading sketch after this list)
- Advanced error handling with circuit breakers and provider fallback
- FastAPI HTTP bridge for remote LLM access
- Comprehensive tool & resource discovery system
- Complete test suite with 4 validation levels
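
The environment variable substitution mentioned above is the kind of thing a small helper can handle before the YAML is parsed. The sketch below is a minimal illustration, assuming PyYAML and a `${VAR}` / `${VAR:-default}` placeholder syntax; the function names (`load_config`, `_substitute_env`) are hypothetical and not taken from `config.py`.

```python
# Hypothetical sketch: expand ${VAR} / ${VAR:-default} placeholders in a YAML
# file before parsing it. Not the actual config.py implementation.
import os
import re

import yaml  # assumes PyYAML is installed

_ENV_PATTERN = re.compile(r"\$\{(\w+)(?::-([^}]*))?\}")


def _substitute_env(text: str) -> str:
    """Replace ${VAR} placeholders with environment values (or the default)."""
    return _ENV_PATTERN.sub(
        lambda m: os.environ.get(m.group(1), m.group(2) or ""), text
    )


def load_config(path: str = "config/orchestrator.yaml") -> dict:
    with open(path) as f:
        raw = f.read()
    return yaml.safe_load(_substitute_env(raw))
```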

## 🔧 Architecture Components:
- `src/llm_fusion_mcp/orchestrator.py` - Main orchestrator with hybrid providers
- `src/llm_fusion_mcp/mcp_client.py` - Full MCP protocol implementation
- `src/llm_fusion_mcp/config.py` - Configuration management system
- `src/llm_fusion_mcp/error_handling.py` - Circuit breaker & retry logic (see the sketch after this list)
- `config/orchestrator.yaml` - Unified system configuration
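
As a rough sketch of the circuit-breaker pattern referenced above (not the actual `error_handling.py` API, whose names and thresholds may differ), a breaker can count consecutive failures and temporarily reject calls once a threshold is crossed, allowing a trial call only after a cooldown:

```python
# Minimal circuit-breaker sketch; class and parameter names are illustrative.
import time


class CircuitBreaker:
    def __init__(self, failure_threshold: int = 5, reset_timeout: float = 30.0):
        self.failure_threshold = failure_threshold
        self.reset_timeout = reset_timeout
        self.failures = 0
        self.opened_at = None  # timestamp when the breaker tripped

    def call(self, func, *args, **kwargs):
        # While open, reject calls until the cooldown window has elapsed.
        if self.opened_at is not None:
            if time.time() - self.opened_at < self.reset_timeout:
                raise RuntimeError("Circuit open: provider temporarily disabled")
            self.opened_at = None  # half-open: allow one trial call
        try:
            result = func(*args, **kwargs)
        except Exception:
            self.failures += 1
            if self.failures >= self.failure_threshold:
                self.opened_at = time.time()
            raise
        self.failures = 0
        return result
```

A provider client would wrap each request in `breaker.call(...)`, so a degraded provider trips the breaker and the orchestrator can fall back to another provider instead of retrying indefinitely.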

## 🧪 Testing Infrastructure:
- Complete system integration tests (4/4 passed)
- MCP protocol validation tests
- Provider compatibility analysis
- Performance benchmarking suite

🎉 This creates the FIRST system enabling remote LLMs to access
the entire MCP ecosystem through a unified HTTP API!

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-06 10:01:37 -06:00


#!/usr/bin/env python3
"""
Performance Comparison: OpenAI Interface vs Native Implementation
Compares response times and reliability between OpenAI-compatible
endpoints and native provider implementations.
"""
import asyncio
import os
import statistics
import time
from typing import Dict, List

import google.generativeai as genai
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()


class PerformanceBenchmark:
    def __init__(self):
        self.results = []
        # OpenAI-compatible Gemini client
        self.gemini_openai = OpenAI(
            api_key=os.getenv('GOOGLE_API_KEY'),
            base_url='https://generativelanguage.googleapis.com/v1beta/openai/'
        )
        # Native Gemini client
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

    async def benchmark_openai_interface(self, iterations: int = 5) -> Dict:
        """Benchmark Gemini through OpenAI interface"""
        print(f"🧪 Testing Gemini via OpenAI interface ({iterations} iterations)...")
        times = []
        errors = 0
        for i in range(iterations):
            try:
                start_time = time.time()
                response = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "Say exactly: 'Test response'"}],
                    max_tokens=20
                )
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {response.choices[0].message.content}")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'OpenAI Interface',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_native_interface(self, iterations: int = 5) -> Dict:
        """Benchmark Gemini through native interface"""
        print(f"🧪 Testing Gemini via native interface ({iterations} iterations)...")
        times = []
        errors = 0
        model = genai.GenerativeModel('gemini-2.5-flash')
        for i in range(iterations):
            try:
                start_time = time.time()
                response = model.generate_content(
                    "Say exactly: 'Test response'",
                    generation_config=genai.GenerationConfig(max_output_tokens=20)
                )
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {response.text}")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'Native Interface',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_streaming_openai(self, iterations: int = 3) -> Dict:
        """Benchmark streaming via OpenAI interface"""
        print(f"🧪 Testing Gemini streaming via OpenAI interface ({iterations} iterations)...")
        times = []
        errors = 0
        for i in range(iterations):
            try:
                start_time = time.time()
                stream = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "Count from 1 to 5"}],
                    stream=True,
                    max_tokens=50
                )
                chunks = 0
                for chunk in stream:
                    chunks += 1
                    if chunks >= 5:  # Limit chunks
                        break
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {chunks} chunks")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'OpenAI Streaming',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_streaming_native(self, iterations: int = 3) -> Dict:
        """Benchmark streaming via native interface"""
        print(f"🧪 Testing Gemini streaming via native interface ({iterations} iterations)...")
        times = []
        errors = 0
        model = genai.GenerativeModel('gemini-2.5-flash')
        for i in range(iterations):
            try:
                start_time = time.time()
                response = model.generate_content(
                    "Count from 1 to 5",
                    stream=True,
                    generation_config=genai.GenerationConfig(max_output_tokens=50)
                )
                chunks = 0
                for chunk in response:
                    chunks += 1
                    if chunks >= 5:  # Limit chunks
                        break
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {chunks} chunks")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'Native Streaming',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_function_calling_openai(self, iterations: int = 3) -> Dict:
        """Benchmark function calling via OpenAI interface"""
        print(f"🧪 Testing Gemini function calling via OpenAI interface ({iterations} iterations)...")
        times = []
        errors = 0
        successful_calls = 0
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather information for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "city": {"type": "string", "description": "City name"},
                            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}
                        },
                        "required": ["city"]
                    }
                }
            }
        ]
        for i in range(iterations):
            try:
                start_time = time.time()
                response = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "What's the weather like in Tokyo?"}],
                    tools=tools,
                    max_tokens=100
                )
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                # Check if function was called
                if (hasattr(response.choices[0].message, 'tool_calls') and
                        response.choices[0].message.tool_calls):
                    successful_calls += 1
                    tool_call = response.choices[0].message.tool_calls[0]
                    print(f" Iteration {i+1}: {response_time:.3f}s - Called: {tool_call.function.name}({tool_call.function.arguments})")
                else:
                    print(f" Iteration {i+1}: {response_time:.3f}s - No function call")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'OpenAI Function Calling',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'successful_calls': successful_calls,
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_function_calling_native(self, iterations: int = 3) -> Dict:
        """Benchmark function calling via native interface"""
        print(f"🧪 Testing Gemini function calling via native interface ({iterations} iterations)...")
        times = []
        errors = 0
        successful_calls = 0

        # Define function for native interface
        def get_weather(city: str, unit: str = "celsius"):
            """Get weather information for a city"""
            return f"Weather in {city}: 22°{unit[0].upper()}, sunny"

        model = genai.GenerativeModel(
            'gemini-2.5-flash',
            tools=[get_weather]
        )
        for i in range(iterations):
            try:
                start_time = time.time()
                chat = model.start_chat()
                response = chat.send_message("What's the weather like in Tokyo?")
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                # Check if function was called
                if hasattr(response, 'candidates') and response.candidates:
                    candidate = response.candidates[0]
                    if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                        function_calls = [part for part in candidate.content.parts if hasattr(part, 'function_call')]
                        if function_calls:
                            successful_calls += 1
                            func_call = function_calls[0].function_call
                            print(f" Iteration {i+1}: {response_time:.3f}s - Called: {func_call.name}")
                        else:
                            print(f" Iteration {i+1}: {response_time:.3f}s - No function call")
                else:
                    print(f" Iteration {i+1}: {response_time:.3f}s - Response: {response.text[:50]}...")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'Native Function Calling',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'successful_calls': successful_calls,
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    def print_comparison_report(self, results: List[Dict]):
        """Print formatted comparison report"""
        print("\n" + "="*70)
        print("📊 Performance Comparison Report")
        print("="*70)
        print(f"\n{'Method':<20} {'Avg Time':<10} {'Min':<8} {'Max':<8} {'Success':<8} {'Errors'}")
        print("-" * 70)
        for result in results:
            print(f"{result['method']:<20} "
                  f"{result['avg_time']:.3f}s{'':<4} "
                  f"{result['min_time']:.3f}s{'':<2} "
                  f"{result['max_time']:.3f}s{'':<2} "
                  f"{result['successful']:<8} "
                  f"{result['errors']}")
        print(f"\n💡 Key Findings:")
        print("-" * 20)
        # Compare OpenAI vs Native
        openai_basic = next((r for r in results if r['method'] == 'OpenAI Interface'), None)
        native_basic = next((r for r in results if r['method'] == 'Native Interface'), None)
        if openai_basic and native_basic:
            diff = openai_basic['avg_time'] - native_basic['avg_time']
            if abs(diff) < 0.1:
                print(f"✅ Similar performance: OpenAI and Native within 0.1s")
            elif diff > 0:
                print(f"⚡ Native faster by {diff:.3f}s ({((diff/openai_basic['avg_time'])*100):.1f}%)")
            else:
                print(f"⚡ OpenAI faster by {abs(diff):.3f}s ({((abs(diff)/native_basic['avg_time'])*100):.1f}%)")
        # Function calling comparison - Critical for MCP!
        openai_func = next((r for r in results if r['method'] == 'OpenAI Function Calling'), None)
        native_func = next((r for r in results if r['method'] == 'Native Function Calling'), None)
        if openai_func and native_func:
            func_diff = openai_func['avg_time'] - native_func['avg_time']
            openai_success_rate = (openai_func.get('successful_calls', 0) / openai_func['iterations']) * 100
            native_success_rate = (native_func.get('successful_calls', 0) / native_func['iterations']) * 100
            print(f"🛠️ Function calling success rates:")
            print(f" OpenAI interface: {openai_success_rate:.0f}% ({openai_func.get('successful_calls', 0)}/{openai_func['iterations']})")
            print(f" Native interface: {native_success_rate:.0f}% ({native_func.get('successful_calls', 0)}/{native_func['iterations']})")
            if abs(func_diff) < 0.1:
                print(f"🛠️ Function calling performance similar")
            elif func_diff > 0:
                print(f"🛠️ Native function calling faster by {func_diff:.3f}s")
            else:
                print(f"🛠️ OpenAI function calling faster by {abs(func_diff):.3f}s")
        # Check reliability
        total_errors = sum(r['errors'] for r in results)
        total_tests = sum(r['iterations'] for r in results)
        reliability = ((total_tests - total_errors) / total_tests) * 100
        print(f"🎯 Overall reliability: {reliability:.1f}% ({total_tests - total_errors}/{total_tests} successful)")
        # Streaming comparison
        openai_stream = next((r for r in results if r['method'] == 'OpenAI Streaming'), None)
        native_stream = next((r for r in results if r['method'] == 'Native Streaming'), None)
        if openai_stream and native_stream:
            stream_diff = openai_stream['avg_time'] - native_stream['avg_time']
            if abs(stream_diff) < 0.1:
                print(f"🌊 Streaming performance similar")
            elif stream_diff > 0:
                print(f"🌊 Native streaming faster by {stream_diff:.3f}s")
            else:
                print(f"🌊 OpenAI streaming faster by {abs(stream_diff):.3f}s")
        print(f"\n🏗️ Architecture Recommendation:")
        print("-" * 35)
        if reliability >= 95 and total_errors == 0:
            print("✅ Both interfaces highly reliable - choose based on simplicity")
            print(" → OpenAI interface recommended for unified architecture")
        elif openai_basic and openai_basic['errors'] == 0:
            print("✅ OpenAI interface stable - good choice for hybrid architecture")
        elif native_basic and native_basic['errors'] == 0:
            print("⚡ Native interface more reliable - consider native-first approach")
        else:
            print("⚠️ Mixed reliability - implement robust error handling")


async def main():
    """Run comprehensive performance benchmark"""
    benchmark = PerformanceBenchmark()
    if not os.getenv('GOOGLE_API_KEY'):
        print("❌ GOOGLE_API_KEY not found. Please set API key to run benchmarks.")
        return
    print("🚀 Starting Performance Benchmark")
    print("Comparing OpenAI interface vs Native implementation for Gemini")
    results = []
    # Basic text generation
    results.append(await benchmark.benchmark_openai_interface())
    await asyncio.sleep(1)  # Rate limiting
    results.append(await benchmark.benchmark_native_interface())
    await asyncio.sleep(1)
    # Streaming
    results.append(await benchmark.benchmark_streaming_openai())
    await asyncio.sleep(1)
    results.append(await benchmark.benchmark_streaming_native())
    await asyncio.sleep(1)
    # Function calling - Critical for MCP integration!
    results.append(await benchmark.benchmark_function_calling_openai())
    await asyncio.sleep(1)
    results.append(await benchmark.benchmark_function_calling_native())
    # Generate report
    benchmark.print_comparison_report(results)
    print(f"\n💾 Performance data available for further analysis")


if __name__ == "__main__":
    asyncio.run(main())