#!/usr/bin/env python3
"""
Performance Comparison: OpenAI Interface vs Native Implementation

Compares response times and reliability between OpenAI-compatible endpoints
and native provider implementations.
"""

import asyncio
import os
import statistics
import time
from typing import Dict, List

import google.generativeai as genai
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()


class PerformanceBenchmark:
    """Benchmarks Gemini via the OpenAI-compatible endpoint and the native SDK.

    Note: the benchmark methods are declared async so main() can interleave them
    with asyncio.sleep() pauses, but the underlying SDK calls are synchronous and
    block while each request runs.
    """

    def __init__(self):
        self.results = []

        # OpenAI-compatible Gemini client
        self.gemini_openai = OpenAI(
            api_key=os.getenv('GOOGLE_API_KEY'),
            base_url='https://generativelanguage.googleapis.com/v1beta/openai/'
        )

        # Native Gemini client
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

    async def benchmark_openai_interface(self, iterations: int = 5) -> Dict:
        """Benchmark Gemini through the OpenAI interface"""
        print(f"🧪 Testing Gemini via OpenAI interface ({iterations} iterations)...")

        times = []
        errors = 0

        for i in range(iterations):
            try:
                start_time = time.time()
                response = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "Say exactly: 'Test response'"}],
                    max_tokens=20
                )
                end_time = time.time()

                response_time = end_time - start_time
                times.append(response_time)
                print(f"  Iteration {i+1}: {response_time:.3f}s - {response.choices[0].message.content}")
            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: ERROR - {e}")

        return {
            'method': 'OpenAI Interface',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_native_interface(self, iterations: int = 5) -> Dict:
        """Benchmark Gemini through the native interface"""
        print(f"🧪 Testing Gemini via native interface ({iterations} iterations)...")

        times = []
        errors = 0
        model = genai.GenerativeModel('gemini-2.5-flash')

        for i in range(iterations):
            try:
                start_time = time.time()
                response = model.generate_content(
                    "Say exactly: 'Test response'",
                    generation_config=genai.GenerationConfig(max_output_tokens=20)
                )
                end_time = time.time()

                response_time = end_time - start_time
                times.append(response_time)
                print(f"  Iteration {i+1}: {response_time:.3f}s - {response.text}")
            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: ERROR - {e}")

        return {
            'method': 'Native Interface',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_streaming_openai(self, iterations: int = 3) -> Dict:
        """Benchmark streaming via the OpenAI interface"""
        print(f"🧪 Testing Gemini streaming via OpenAI interface ({iterations} iterations)...")

        times = []
        errors = 0

        for i in range(iterations):
            try:
                start_time = time.time()
                stream = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "Count from 1 to 5"}],
                    stream=True,
                    max_tokens=50
                )

                chunks = 0
                for chunk in stream:
                    chunks += 1
                    if chunks >= 5:  # Limit chunks
                        break

                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f"  Iteration {i+1}: {response_time:.3f}s - {chunks} chunks")
            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: ERROR - {e}")

        return {
            'method': 'OpenAI Streaming',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_streaming_native(self, iterations: int = 3) -> Dict:
        """Benchmark streaming via the native interface"""
        print(f"🧪 Testing Gemini streaming via native interface ({iterations} iterations)...")

        times = []
        errors = 0
        model = genai.GenerativeModel('gemini-2.5-flash')

        for i in range(iterations):
            try:
                start_time = time.time()
                response = model.generate_content(
                    "Count from 1 to 5",
                    stream=True,
                    generation_config=genai.GenerationConfig(max_output_tokens=50)
                )

                chunks = 0
                for chunk in response:
                    chunks += 1
                    if chunks >= 5:  # Limit chunks
                        break

                end_time = time.time()
                response_time = end_time - start_time
                times.append(response_time)
                print(f"  Iteration {i+1}: {response_time:.3f}s - {chunks} chunks")
            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: ERROR - {e}")

        return {
            'method': 'Native Streaming',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_function_calling_openai(self, iterations: int = 3) -> Dict:
        """Benchmark function calling via the OpenAI interface"""
        print(f"🧪 Testing Gemini function calling via OpenAI interface ({iterations} iterations)...")

        times = []
        errors = 0
        successful_calls = 0

        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather information for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "city": {"type": "string", "description": "City name"},
                            "unit": {
                                "type": "string",
                                "enum": ["celsius", "fahrenheit"],
                                "description": "Temperature unit"
                            }
                        },
                        "required": ["city"]
                    }
                }
            }
        ]

        for i in range(iterations):
            try:
                start_time = time.time()
                response = self.gemini_openai.chat.completions.create(
                    model="gemini-2.5-flash",
                    messages=[{"role": "user", "content": "What's the weather like in Tokyo?"}],
                    tools=tools,
                    max_tokens=100
                )
                end_time = time.time()

                response_time = end_time - start_time
                times.append(response_time)

                # Check if the model requested a tool call
                if (hasattr(response.choices[0].message, 'tool_calls') and
                        response.choices[0].message.tool_calls):
                    successful_calls += 1
                    tool_call = response.choices[0].message.tool_calls[0]
                    print(f"  Iteration {i+1}: {response_time:.3f}s - "
                          f"Called: {tool_call.function.name}({tool_call.function.arguments})")
                else:
                    print(f"  Iteration {i+1}: {response_time:.3f}s - No function call")
            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: ERROR - {e}")

        return {
            'method': 'OpenAI Function Calling',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'successful_calls': successful_calls,
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    async def benchmark_function_calling_native(self, iterations: int = 3) -> Dict:
        """Benchmark function calling via the native interface"""
        print(f"🧪 Testing Gemini function calling via native interface ({iterations} iterations)...")

        times = []
        errors = 0
        successful_calls = 0

        # Define function for native interface
        def get_weather(city: str, unit: str = "celsius"):
            """Get weather information for a city"""
            return f"Weather in {city}: 22°{unit[0].upper()}, sunny"

        model = genai.GenerativeModel(
            'gemini-2.5-flash',
            tools=[get_weather]
        )

        for i in range(iterations):
            try:
                start_time = time.time()
                chat = model.start_chat()
                response = chat.send_message("What's the weather like in Tokyo?")
                end_time = time.time()

                response_time = end_time - start_time
                times.append(response_time)

                # Check if a function was called. Every Part exposes a function_call
                # field, so look for a populated function name instead of relying on
                # hasattr(), which is always true.
                if hasattr(response, 'candidates') and response.candidates:
                    candidate = response.candidates[0]
                    if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                        function_calls = [
                            part for part in candidate.content.parts
                            if getattr(part, 'function_call', None) and part.function_call.name
                        ]
                        if function_calls:
                            successful_calls += 1
                            func_call = function_calls[0].function_call
                            print(f"  Iteration {i+1}: {response_time:.3f}s - Called: {func_call.name}")
                        else:
                            print(f"  Iteration {i+1}: {response_time:.3f}s - No function call")
                    else:
                        print(f"  Iteration {i+1}: {response_time:.3f}s - Response: {response.text[:50]}...")
            except Exception as e:
                errors += 1
                print(f"  Iteration {i+1}: ERROR - {e}")

        return {
            'method': 'Native Function Calling',
            'provider': 'Gemini',
            'iterations': iterations,
            'successful': len(times),
            'successful_calls': successful_calls,
            'errors': errors,
            'avg_time': statistics.mean(times) if times else 0,
            'min_time': min(times) if times else 0,
            'max_time': max(times) if times else 0,
            'std_dev': statistics.stdev(times) if len(times) > 1 else 0
        }

    def print_comparison_report(self, results: List[Dict]):
        """Print formatted comparison report"""
        print("\n" + "=" * 70)
        print("📊 Performance Comparison Report")
        print("=" * 70)

        print(f"\n{'Method':<25} {'Avg Time':<10} {'Min':<8} {'Max':<8} {'Success':<8} {'Errors'}")
        print("-" * 70)

        for result in results:
            print(f"{result['method']:<25} "
                  f"{result['avg_time']:.3f}s{'':<4} "
                  f"{result['min_time']:.3f}s{'':<2} "
                  f"{result['max_time']:.3f}s{'':<2} "
                  f"{result['successful']:<8} "
                  f"{result['errors']}")

        print("\n💡 Key Findings:")
        print("-" * 20)

        # Compare OpenAI vs Native
        openai_basic = next((r for r in results if r['method'] == 'OpenAI Interface'), None)
        native_basic = next((r for r in results if r['method'] == 'Native Interface'), None)

        if openai_basic and native_basic:
            diff = openai_basic['avg_time'] - native_basic['avg_time']
            if abs(diff) < 0.1:
                print("✅ Similar performance: OpenAI and Native within 0.1s")
            elif diff > 0:
                print(f"⚡ Native faster by {diff:.3f}s ({((diff / openai_basic['avg_time']) * 100):.1f}%)")
            else:
                print(f"⚡ OpenAI faster by {abs(diff):.3f}s ({((abs(diff) / native_basic['avg_time']) * 100):.1f}%)")

        # Function calling comparison - Critical for MCP!
        openai_func = next((r for r in results if r['method'] == 'OpenAI Function Calling'), None)
        native_func = next((r for r in results if r['method'] == 'Native Function Calling'), None)

        if openai_func and native_func:
            func_diff = openai_func['avg_time'] - native_func['avg_time']
            openai_success_rate = (openai_func.get('successful_calls', 0) / openai_func['iterations']) * 100
            native_success_rate = (native_func.get('successful_calls', 0) / native_func['iterations']) * 100

            print("🛠️ Function calling success rates:")
            print(f"   OpenAI interface: {openai_success_rate:.0f}% "
                  f"({openai_func.get('successful_calls', 0)}/{openai_func['iterations']})")
            print(f"   Native interface: {native_success_rate:.0f}% "
                  f"({native_func.get('successful_calls', 0)}/{native_func['iterations']})")

            if abs(func_diff) < 0.1:
                print("🛠️ Function calling performance similar")
            elif func_diff > 0:
                print(f"🛠️ Native function calling faster by {func_diff:.3f}s")
            else:
                print(f"🛠️ OpenAI function calling faster by {abs(func_diff):.3f}s")

        # Check reliability
        total_errors = sum(r['errors'] for r in results)
        total_tests = sum(r['iterations'] for r in results)
        reliability = ((total_tests - total_errors) / total_tests) * 100
        print(f"🎯 Overall reliability: {reliability:.1f}% ({total_tests - total_errors}/{total_tests} successful)")

        # Streaming comparison
        openai_stream = next((r for r in results if r['method'] == 'OpenAI Streaming'), None)
        native_stream = next((r for r in results if r['method'] == 'Native Streaming'), None)

        if openai_stream and native_stream:
            stream_diff = openai_stream['avg_time'] - native_stream['avg_time']
            if abs(stream_diff) < 0.1:
                print("🌊 Streaming performance similar")
            elif stream_diff > 0:
                print(f"🌊 Native streaming faster by {stream_diff:.3f}s")
            else:
                print(f"🌊 OpenAI streaming faster by {abs(stream_diff):.3f}s")

        print("\n🏗️ Architecture Recommendation:")
        print("-" * 35)

        if reliability >= 95 and total_errors == 0:
            print("✅ Both interfaces highly reliable - choose based on simplicity")
            print("   → OpenAI interface recommended for unified architecture")
        elif openai_basic and openai_basic['errors'] == 0:
            print("✅ OpenAI interface stable - good choice for hybrid architecture")
        elif native_basic and native_basic['errors'] == 0:
            print("⚡ Native interface more reliable - consider native-first approach")
        else:
            print("⚠️ Mixed reliability - implement robust error handling")


async def main():
    """Run comprehensive performance benchmark"""
    # Check the key before constructing clients; OpenAI() raises if no API key is available.
    if not os.getenv('GOOGLE_API_KEY'):
        print("❌ GOOGLE_API_KEY not found. Please set API key to run benchmarks.")
        return

    benchmark = PerformanceBenchmark()

    print("🚀 Starting Performance Benchmark")
    print("Comparing OpenAI interface vs Native implementation for Gemini")

    results = []

    # Basic text generation
    results.append(await benchmark.benchmark_openai_interface())
    await asyncio.sleep(1)  # Rate limiting
    results.append(await benchmark.benchmark_native_interface())
    await asyncio.sleep(1)

    # Streaming
    results.append(await benchmark.benchmark_streaming_openai())
    await asyncio.sleep(1)
    results.append(await benchmark.benchmark_streaming_native())
    await asyncio.sleep(1)

    # Function calling - Critical for MCP integration!
    results.append(await benchmark.benchmark_function_calling_openai())
    await asyncio.sleep(1)
    results.append(await benchmark.benchmark_function_calling_native())

    # Generate report
    benchmark.print_comparison_report(results)

    print("\n💾 Performance data available for further analysis")


if __name__ == "__main__":
    asyncio.run(main())
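
# Note: the script only prints its report; nothing is persisted despite the final
# "💾" message. A minimal sketch for keeping the raw numbers around (hypothetical
# file name, not part of the original script) would be to dump `results` in main()
# right after benchmark.print_comparison_report(results):
#
#     import json
#     with open("benchmark_results.json", "w") as f:
#         json.dump(results, f, indent=2)  # each result is a flat dict of str/int/float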