llm-fusion-mcp/test_performance_comparison.py
Ryan Malloy 80f1ecbf7d
🚀 Phase 2 Complete: Universal MCP Tool Orchestrator
Revolutionary architecture that bridges remote LLMs with the entire MCP ecosystem!

## 🌟 Key Features Added:
- Real MCP protocol implementation (STDIO + HTTP servers)
- Hybrid LLM provider system (OpenAI-compatible + Native APIs)
- Unified YAML configuration with environment variable substitution (see the loading sketch after this list)
- Advanced error handling with circuit breakers and provider fallback
- FastAPI HTTP bridge for remote LLM access
- Comprehensive tool & resource discovery system
- Complete test suite with 4 validation levels
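
The environment variable substitution mentioned above is the kind of thing a small helper can handle before the YAML is parsed. The sketch below is a minimal illustration, assuming PyYAML and a `${VAR}` / `${VAR:-default}` placeholder syntax; the function names (`load_config`, `_substitute_env`) are hypothetical and not taken from `config.py`.

```python
# Hypothetical sketch: expand ${VAR} / ${VAR:-default} placeholders in a YAML
# file before parsing it. Not the actual config.py implementation.
import os
import re

import yaml  # assumes PyYAML is installed

_ENV_PATTERN = re.compile(r"\$\{(\w+)(?::-([^}]*))?\}")


def _substitute_env(text: str) -> str:
    """Replace ${VAR} placeholders with environment values (or the default)."""
    return _ENV_PATTERN.sub(
        lambda m: os.environ.get(m.group(1), m.group(2) or ""), text
    )


def load_config(path: str = "config/orchestrator.yaml") -> dict:
    with open(path) as f:
        raw = f.read()
    return yaml.safe_load(_substitute_env(raw))
```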

## 🔧 Architecture Components:
- `src/llm_fusion_mcp/orchestrator.py` - Main orchestrator with hybrid providers
- `src/llm_fusion_mcp/mcp_client.py` - Full MCP protocol implementation
- `src/llm_fusion_mcp/config.py` - Configuration management system
- `src/llm_fusion_mcp/error_handling.py` - Circuit breaker & retry logic (see the sketch after this list)
- `config/orchestrator.yaml` - Unified system configuration
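
As a rough sketch of the circuit-breaker pattern referenced above (not the actual `error_handling.py` API, whose names and thresholds may differ), a breaker can count consecutive failures and temporarily reject calls once a threshold is crossed, allowing a trial call only after a cooldown:

```python
# Minimal circuit-breaker sketch; class and parameter names are illustrative.
import time


class CircuitBreaker:
    def __init__(self, failure_threshold: int = 5, reset_timeout: float = 30.0):
        self.failure_threshold = failure_threshold
        self.reset_timeout = reset_timeout
        self.failures = 0
        self.opened_at = None  # timestamp when the breaker tripped

    def call(self, func, *args, **kwargs):
        # While open, reject calls until the cooldown window has elapsed.
        if self.opened_at is not None:
            if time.time() - self.opened_at < self.reset_timeout:
                raise RuntimeError("Circuit open: provider temporarily disabled")
            self.opened_at = None  # half-open: allow one trial call
        try:
            result = func(*args, **kwargs)
        except Exception:
            self.failures += 1
            if self.failures >= self.failure_threshold:
                self.opened_at = time.time()
            raise
        self.failures = 0
        return result
```

A provider client would wrap each request in `breaker.call(...)`, so a degraded provider trips the breaker and the orchestrator can fall back to another provider instead of retrying indefinitely.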

## 🧪 Testing Infrastructure:
- Complete system integration tests (4/4 passed)
- MCP protocol validation tests
- Provider compatibility analysis
- Performance benchmarking suite

🎉 This creates the FIRST system enabling remote LLMs to access
the entire MCP ecosystem through a unified HTTP API!

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-06 10:01:37 -06:00


#!/usr/bin/env python3
"""
Performance Comparison: OpenAI Interface vs Native Implementation
Compares response times and reliability between OpenAI-compatible
endpoints and native provider implementations.
"""
import asyncio
import os
import statistics
import time
from typing import Dict, List

import google.generativeai as genai
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()


class PerformanceBenchmark:
    def __init__(self):
        self.results = []
        # OpenAI-compatible Gemini client
        self.gemini_openai = OpenAI(
            api_key=os.getenv('GOOGLE_API_KEY'),
            base_url='https://generativelanguage.googleapis.com/v1beta/openai/'
        )
        # Native Gemini client
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

    async def benchmark_openai_interface(self, iterations: int = 5) -> Dict:
        """Benchmark Gemini through OpenAI interface"""
        print(f"🧪 Testing Gemini via OpenAI interface ({iterations} iterations)...")
        times = []
        errors = 0
        for i in range(iterations):
            try:
                start_time = time.time()
                response = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "Say exactly: 'Test response'"}],
                    max_tokens=20
                )
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {response.choices[0].message.content}")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'OpenAI Interface',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_native_interface(self, iterations: int = 5) -> Dict:
        """Benchmark Gemini through native interface"""
        print(f"🧪 Testing Gemini via native interface ({iterations} iterations)...")
        times = []
        errors = 0
        model = genai.GenerativeModel('gemini-2.5-flash')
        for i in range(iterations):
            try:
                start_time = time.time()
                response = model.generate_content(
                    "Say exactly: 'Test response'",
                    generation_config=genai.GenerationConfig(max_output_tokens=20)
                )
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {response.text}")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'Native Interface',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_streaming_openai(self, iterations: int = 3) -> Dict:
        """Benchmark streaming via OpenAI interface"""
        print(f"🧪 Testing Gemini streaming via OpenAI interface ({iterations} iterations)...")
        times = []
        errors = 0
        for i in range(iterations):
            try:
                start_time = time.time()
                stream = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "Count from 1 to 5"}],
                    stream=True,
                    max_tokens=50
                )
                chunks = 0
                for chunk in stream:
                    chunks += 1
                    if chunks >= 5:  # Limit chunks
                        break
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {chunks} chunks")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'OpenAI Streaming',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_streaming_native(self, iterations: int = 3) -> Dict:
        """Benchmark streaming via native interface"""
        print(f"🧪 Testing Gemini streaming via native interface ({iterations} iterations)...")
        times = []
        errors = 0
        model = genai.GenerativeModel('gemini-2.5-flash')
        for i in range(iterations):
            try:
                start_time = time.time()
                response = model.generate_content(
                    "Count from 1 to 5",
                    stream=True,
                    generation_config=genai.GenerationConfig(max_output_tokens=50)
                )
                chunks = 0
                for chunk in response:
                    chunks += 1
                    if chunks >= 5:  # Limit chunks
                        break
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f" Iteration {i+1}: {response_time:.3f}s - {chunks} chunks")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'Native Streaming',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_function_calling_openai(self, iterations: int = 3) -> Dict:
        """Benchmark function calling via OpenAI interface"""
        print(f"🧪 Testing Gemini function calling via OpenAI interface ({iterations} iterations)...")
        times = []
        errors = 0
        successful_calls = 0
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather information for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "city": {"type": "string", "description": "City name"},
                            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}
                        },
                        "required": ["city"]
                    }
                }
            }
        ]
        for i in range(iterations):
            try:
                start_time = time.time()
                response = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "What's the weather like in Tokyo?"}],
                    tools=tools,
                    max_tokens=100
                )
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                # Check if function was called
                if (hasattr(response.choices[0].message, 'tool_calls') and
                        response.choices[0].message.tool_calls):
                    successful_calls += 1
                    tool_call = response.choices[0].message.tool_calls[0]
                    print(f" Iteration {i+1}: {response_time:.3f}s - Called: {tool_call.function.name}({tool_call.function.arguments})")
                else:
                    print(f" Iteration {i+1}: {response_time:.3f}s - No function call")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'OpenAI Function Calling',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'successful_calls': successful_calls,
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_function_calling_native(self, iterations: int = 3) -> Dict:
        """Benchmark function calling via native interface"""
        print(f"🧪 Testing Gemini function calling via native interface ({iterations} iterations)...")
        times = []
        errors = 0
        successful_calls = 0

        # Define function for native interface
        def get_weather(city: str, unit: str = "celsius"):
            """Get weather information for a city"""
            return f"Weather in {city}: 22°{unit[0].upper()}, sunny"

        model = genai.GenerativeModel(
            'gemini-2.5-flash',
            tools=[get_weather]
        )
        for i in range(iterations):
            try:
                start_time = time.time()
                chat = model.start_chat()
                response = chat.send_message("What's the weather like in Tokyo?")
                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                # Check if function was called
                if hasattr(response, 'candidates') and response.candidates:
                    candidate = response.candidates[0]
                    if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                        function_calls = [part for part in candidate.content.parts if hasattr(part, 'function_call')]
                        if function_calls:
                            successful_calls += 1
                            func_call = function_calls[0].function_call
                            print(f" Iteration {i+1}: {response_time:.3f}s - Called: {func_call.name}")
                        else:
                            print(f" Iteration {i+1}: {response_time:.3f}s - No function call")
                else:
                    print(f" Iteration {i+1}: {response_time:.3f}s - Response: {response.text[:50]}...")
            except Exception as e:
                errors += 1
                print(f" Iteration {i+1}: ERROR - {e}")
        return {
            'method': 'Native Function Calling',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'successful_calls': successful_calls,
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    def print_comparison_report(self, results: List[Dict]):
        """Print formatted comparison report"""
        print("\n" + "="*70)
        print("📊 Performance Comparison Report")
        print("="*70)
        print(f"\n{'Method':<20} {'Avg Time':<10} {'Min':<8} {'Max':<8} {'Success':<8} {'Errors'}")
        print("-" * 70)
        for result in results:
            print(f"{result['method']:<20} "
                  f"{result['avg_time']:.3f}s{'':<4} "
                  f"{result['min_time']:.3f}s{'':<2} "
                  f"{result['max_time']:.3f}s{'':<2} "
                  f"{result['successful']:<8} "
                  f"{result['errors']}")
        print(f"\n💡 Key Findings:")
        print("-" * 20)
        # Compare OpenAI vs Native
        openai_basic = next((r for r in results if r['method'] == 'OpenAI Interface'), None)
        native_basic = next((r for r in results if r['method'] == 'Native Interface'), None)
        if openai_basic and native_basic:
            diff = openai_basic['avg_time'] - native_basic['avg_time']
            if abs(diff) < 0.1:
                print(f"✅ Similar performance: OpenAI and Native within 0.1s")
            elif diff > 0:
                print(f"⚡ Native faster by {diff:.3f}s ({((diff/openai_basic['avg_time'])*100):.1f}%)")
            else:
                print(f"⚡ OpenAI faster by {abs(diff):.3f}s ({((abs(diff)/native_basic['avg_time'])*100):.1f}%)")
        # Function calling comparison - Critical for MCP!
        openai_func = next((r for r in results if r['method'] == 'OpenAI Function Calling'), None)
        native_func = next((r for r in results if r['method'] == 'Native Function Calling'), None)
        if openai_func and native_func:
            func_diff = openai_func['avg_time'] - native_func['avg_time']
            openai_success_rate = (openai_func.get('successful_calls', 0) / openai_func['iterations']) * 100
            native_success_rate = (native_func.get('successful_calls', 0) / native_func['iterations']) * 100
            print(f"🛠️ Function calling success rates:")
            print(f" OpenAI interface: {openai_success_rate:.0f}% ({openai_func.get('successful_calls', 0)}/{openai_func['iterations']})")
            print(f" Native interface: {native_success_rate:.0f}% ({native_func.get('successful_calls', 0)}/{native_func['iterations']})")
            if abs(func_diff) < 0.1:
                print(f"🛠️ Function calling performance similar")
            elif func_diff > 0:
                print(f"🛠️ Native function calling faster by {func_diff:.3f}s")
            else:
                print(f"🛠️ OpenAI function calling faster by {abs(func_diff):.3f}s")
        # Check reliability
        total_errors = sum(r['errors'] for r in results)
        total_tests = sum(r['iterations'] for r in results)
        reliability = ((total_tests - total_errors) / total_tests) * 100
        print(f"🎯 Overall reliability: {reliability:.1f}% ({total_tests - total_errors}/{total_tests} successful)")
        # Streaming comparison
        openai_stream = next((r for r in results if r['method'] == 'OpenAI Streaming'), None)
        native_stream = next((r for r in results if r['method'] == 'Native Streaming'), None)
        if openai_stream and native_stream:
            stream_diff = openai_stream['avg_time'] - native_stream['avg_time']
            if abs(stream_diff) < 0.1:
                print(f"🌊 Streaming performance similar")
            elif stream_diff > 0:
                print(f"🌊 Native streaming faster by {stream_diff:.3f}s")
            else:
                print(f"🌊 OpenAI streaming faster by {abs(stream_diff):.3f}s")
        print(f"\n🏗️ Architecture Recommendation:")
        print("-" * 35)
        if reliability >= 95 and total_errors == 0:
            print("✅ Both interfaces highly reliable - choose based on simplicity")
            print(" → OpenAI interface recommended for unified architecture")
        elif openai_basic and openai_basic['errors'] == 0:
            print("✅ OpenAI interface stable - good choice for hybrid architecture")
        elif native_basic and native_basic['errors'] == 0:
            print("⚡ Native interface more reliable - consider native-first approach")
        else:
            print("⚠️ Mixed reliability - implement robust error handling")


async def main():
    """Run comprehensive performance benchmark"""
    benchmark = PerformanceBenchmark()
    if not os.getenv('GOOGLE_API_KEY'):
        print("❌ GOOGLE_API_KEY not found. Please set API key to run benchmarks.")
        return
    print("🚀 Starting Performance Benchmark")
    print("Comparing OpenAI interface vs Native implementation for Gemini")
    results = []
    # Basic text generation
    results.append(await benchmark.benchmark_openai_interface())
    await asyncio.sleep(1)  # Rate limiting
    results.append(await benchmark.benchmark_native_interface())
    await asyncio.sleep(1)
    # Streaming
    results.append(await benchmark.benchmark_streaming_openai())
    await asyncio.sleep(1)
    results.append(await benchmark.benchmark_streaming_native())
    await asyncio.sleep(1)
    # Function calling - Critical for MCP integration!
    results.append(await benchmark.benchmark_function_calling_openai())
    await asyncio.sleep(1)
    results.append(await benchmark.benchmark_function_calling_native())
    # Generate report
    benchmark.print_comparison_report(results)
    print(f"\n💾 Performance data available for further analysis")


if __name__ == "__main__":
    asyncio.run(main())