#!/bin/bash
# LLM Fusion MCP - Health Check & Monitoring Script
#
# Commands (first argument, default "check"):
#   check     run a one-time health check
#   monitor   run the checks in an endless loop
#   help      print usage
#
# Environment overrides: HEALTH_ENDPOINT, CHECK_INTERVAL, TIMEOUT.

# NOTE: pipefail is deliberately not enabled; some checks pipe into
# `grep -q`, which may stop reading before the producer has finished.
set -e

# Configuration (the three documented settings honour the environment)
readonly SERVICE_NAME="llm-fusion-mcp"
readonly HEALTH_ENDPOINT="${HEALTH_ENDPOINT:-http://localhost:8000/health}"
readonly TIMEOUT="${TIMEOUT:-10}"
readonly CHECK_INTERVAL="${CHECK_INTERVAL:-30}"

# Colors (real escape bytes, so messages can be printed with a plain %s)
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly BLUE=$'\033[0;34m'
readonly NC=$'\033[0m'

#######################################
# Logging helpers. Each prints one line to stdout with a coloured prefix.
# The message is printed verbatim (no backslash-escape interpretation).
# Arguments: $1 - message text
#######################################
print_status() {
  printf '%s[%s]%s %s\n' "$BLUE" "$(date '+%Y-%m-%d %H:%M:%S')" "$NC" "$1"
}

print_success() {
  printf '%s[SUCCESS]%s %s\n' "$GREEN" "$NC" "$1"
}

print_warning() {
  printf '%s[WARNING]%s %s\n' "$YELLOW" "$NC" "$1"
}

print_error() {
  printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$1"
}

#######################################
# Health check function.
# Globals:   TIMEOUT (read)
# Arguments: $1 - URL to probe
# Returns:   0 if the endpoint answers with HTTP 200, 1 otherwise
#######################################
check_health() {
  local endpoint=$1
  local http_code

  # The body is discarded so only the status code is captured; any curl
  # failure (timeout, refused connection, curl missing) counts as "000".
  http_code=$(curl -s -o /dev/null -w '%{http_code}' \
    --max-time "$TIMEOUT" "$endpoint" 2>/dev/null) || http_code="000"

  [[ "$http_code" == "200" ]]
}

#######################################
# Docker container check.
# Globals:   SERVICE_NAME (read)
# Returns:   0 if a running container matches the name filter, 1 otherwise
#######################################
check_container() {
  local running
  running=$(docker ps -q \
    --filter "name=${SERVICE_NAME}" --filter "status=running") || true
  [[ -n "$running" ]]
}

#######################################
# System resource check.
# Globals:   SERVICE_NAME (read)
# Outputs:   "CPU: <n>%, Memory: <usage>" on stdout when stats are available;
#            a high-CPU warning goes to stderr so that callers capturing
#            stdout receive only the summary line.
# Returns:   0 always
#######################################
check_resources() {
  local container_id stats cpu_usage memory_usage

  # `docker stats` expects one ID per argument, so only the first match
  # of the name filter is inspected.
  container_id=$(docker ps -q --filter "name=${SERVICE_NAME}" | head -n 1)
  [[ -n "$container_id" ]] || return 0

  stats=$(docker stats --no-stream \
    --format "table {{.CPUPerc}}\t{{.MemUsage}}" "$container_id" 2>/dev/null \
    | tail -n 1)
  [[ -n "$stats" ]] || return 0

  # First field is the CPU percentage, second the memory in use.
  read -r cpu_usage memory_usage _ <<<"$stats"
  cpu_usage=${cpu_usage//%/}

  # Skip anything that is not a number (e.g. only the table header came back).
  [[ "$cpu_usage" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 0

  echo "CPU: ${cpu_usage}%, Memory: ${memory_usage}"

  # Alert if CPU > 80% (awk handles the fractional comparison)
  if awk -v cpu="$cpu_usage" 'BEGIN { exit !(cpu + 0 > 80) }'; then
    print_warning "High CPU usage: ${cpu_usage}%" >&2
  fi
  return 0
}

#######################################
# Provider connectivity check.
# Queries "${HEALTH_ENDPOINT}/providers" and counts the entries reported
# as configured. The JSON is matched textually, tolerating whitespace
# around the colon.
# Globals:   HEALTH_ENDPOINT, TIMEOUT (read)
# Outputs:   provider count, or a warning, on stdout
# Returns:   0 always
#######################################
check_providers() {
  local response provider_count
  local -r ws='[[:space:]]*'

  response=$(curl -s --max-time "$TIMEOUT" \
    "${HEALTH_ENDPOINT}/providers" 2>/dev/null || echo "{}")

  if printf '%s\n' "$response" | grep -Eq "\"success\"${ws}:${ws}true"; then
    # wc -l pads its output on some platforms; strip the padding.
    provider_count=$(printf '%s\n' "$response" \
      | grep -Eo "\"configured\"${ws}:${ws}true" | wc -l | tr -d '[:space:]')
    echo "Active providers: $provider_count"
  else
    print_warning "Provider health check failed"
  fi
  return 0
}

#######################################
# Disk space check for the filesystem holding the current directory.
# Warns when more than 85% is used.
# Outputs:   status or warning line on stdout
# Returns:   0 always
#######################################
check_disk() {
  local disk_usage

  # -P forces the single-line POSIX layout, so field 5 is always capacity.
  disk_usage=$(df -P . 2>/dev/null | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }')

  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    print_warning "Could not determine disk usage"
  elif (( disk_usage > 85 )); then
    print_warning "⚠️ Low disk space: ${disk_usage}% used"
  else
    print_status "💾 Disk usage: ${disk_usage}%"
  fi
  return 0
}

#######################################
# Main monitoring function. Loops forever, printing a report every
# CHECK_INTERVAL seconds. If the container is not running it attempts a
# `docker-compose restart` and retries after 10 seconds.
# Globals:   SERVICE_NAME, HEALTH_ENDPOINT, CHECK_INTERVAL (read)
#######################################
run_monitor() {
  local resource_info log_size

  print_status "Starting LLM Fusion MCP health monitoring..."

  while true; do
    echo ""
    print_status "=== Health Check Report ==="

    # Container status
    if ! check_container; then
      print_error "❌ Container is not running"
      print_status "Attempting to restart..."
      docker-compose restart "$SERVICE_NAME" \
        || print_error "Failed to restart container"
      sleep 10
      continue
    fi
    print_success "✅ Container is running"

    # Resource usage
    resource_info=$(check_resources)
    if [[ -n "$resource_info" ]]; then
      print_status "📊 Resource usage: $resource_info"
    fi

    # Health endpoint check
    if check_health "$HEALTH_ENDPOINT"; then
      print_success "✅ Health endpoint responding"
    else
      print_error "❌ Health endpoint not responding"
    fi

    # Provider check
    print_status "🔍 Checking AI providers..."
    check_providers

    # Disk space check
    check_disk

    # Log file size check
    if [[ -d "./logs" ]]; then
      # The pipeline's status is cut's, so the fallback must be applied to
      # the (possibly empty) result rather than with `||`.
      log_size=$(du -sh ./logs 2>/dev/null | cut -f1)
      print_status "📝 Log directory size: ${log_size:-N/A}"
    fi

    print_status "Next check in ${CHECK_INTERVAL} seconds..."
    sleep "$CHECK_INTERVAL"
  done
}

#######################################
# One-time health check.
# Globals:   HEALTH_ENDPOINT (read)
# Returns:   0 if the container is running and the health endpoint answers,
#            1 as soon as either of those checks fails
#######################################
run_check() {
  local resource_info

  print_status "Running one-time health check..."

  # Container check
  if ! check_container; then
    print_error "❌ Container Status: Not Running"
    return 1
  fi
  print_success "✅ Container Status: Running"

  # Health endpoint
  if ! check_health "$HEALTH_ENDPOINT"; then
    print_error "❌ Health Endpoint: Failed"
    return 1
  fi
  print_success "✅ Health Endpoint: OK"

  # Resource usage
  resource_info=$(check_resources)
  if [[ -n "$resource_info" ]]; then
    print_status "📊 Resource Usage: $resource_info"
  fi

  # Provider check (informational only; it does not fail the run)
  check_providers

  print_success "🎉 All checks passed!"
  return 0
}

#######################################
# Usage information.
# Outputs:   help text on stdout
#######################################
show_usage() {
  cat <<EOF
LLM Fusion MCP Health Check Script

Usage: $0 [COMMAND]

Commands:
 check Run one-time health check
 monitor Start continuous monitoring
 help Show this help message

Environment Variables:
 HEALTH_ENDPOINT Health check URL (default: http://localhost:8000/health)
 CHECK_INTERVAL Monitoring interval in seconds (default: 30)
 TIMEOUT HTTP timeout in seconds (default: 10)
EOF
}

#######################################
# Main script logic: dispatch on the first argument (default: check).
# Exits 1 on an unknown command; a failing run_check ends the script
# with status 1 through `set -e`.
#######################################
main() {
  case "${1:-check}" in
    "monitor")
      run_monitor
      ;;
    "check")
      run_check
      ;;
    "help" | "-h" | "--help")
      show_usage
      ;;
    *)
      print_error "Unknown command: $1"
      show_usage
      exit 1
      ;;
  esac
}

main "$@"