mcptesta/examples/templates/stress_template.yaml
Ryan Malloy bea4a2e5d3 Initial release: MCPTesta v1.0.0 🧪
Community-driven testing excellence for the MCP ecosystem

MCPTesta is a comprehensive testing framework for FastMCP servers that brings
scientific rigor and enterprise-grade capabilities to MCP protocol testing.

🎯 Core Features:
• Comprehensive FastMCP server testing with advanced protocol support
• Parallel execution with intelligent dependency resolution
• Flexible CLI and YAML configuration system
• Rich reporting: console, HTML, JSON, and JUnit formats
• Advanced MCP protocol features: notifications, cancellation, progress tracking
• Production-ready Docker environment with caddy-docker-proxy integration

🧪 Advanced Testing Capabilities:
• Multi-transport support (stdio, SSE, WebSocket)
• Authentication testing (Bearer tokens, OAuth flows)
• Stress testing and performance validation
• Memory profiling and leak detection
• CI/CD integration with comprehensive reporting

🎨 Professional Assets:
• Complete logo package with lab experiment theme
• Comprehensive documentation with Diátaxis framework
• Community-focused branding and messaging
• Multi-platform favicon and social media assets

📚 Documentation:
• Getting started tutorials and comprehensive guides
• Complete CLI and YAML reference documentation
• Architecture explanations and testing strategies
• Team collaboration and security compliance guides

🚀 Ready for:
• Community contributions and external development
• Enterprise deployment and production use
• Integration with existing FastMCP workflows
• Extension and customization for specific needs

Built with modern Python practices using uv, FastMCP, and Starlight documentation.
Designed for developers who demand scientific precision in their testing tools.

Repository: https://git.supported.systems/mcp/mcptesta
Documentation: https://mcptesta.l.supported.systems

# MCPTesta Stress Testing Configuration Template
#
# Specialized template for comprehensive stress testing and performance validation.
# Designed to push FastMCP servers to their limits and identify bottlenecks.
#
# Stress Testing Categories:
# - Load testing with various patterns
# - Performance benchmarking
# - Resource exhaustion testing
# - Concurrency and parallelism limits
# - Memory and CPU pressure testing
# - Network stress and bandwidth testing

# Configuration optimized for stress testing
config:
  parallel_workers: 16              # High concurrency for stress testing
  output_directory: "./stress_test_results"
  output_format: "all"
  global_timeout: 1800              # 30 minutes for long-running stress tests
  max_concurrent_operations: 100

  # Stress-testing-specific features
  enable_stress_testing: true
  enable_memory_profiling: true
  enable_performance_profiling: true
  enable_resource_monitoring: true

  features:
    test_notifications: true
    test_cancellation: true
    test_progress: true
    test_sampling: true

  # Aggressive retry policy for stress conditions
  retry_policy:
    max_retries: 1                  # Minimal retries to avoid masking stress failures
    backoff_factor: 1.0
    retry_on_errors: ["ConnectionError"]

  # Performance monitoring configuration
  monitoring:
    enable_real_time_metrics: true
    metrics_collection_interval: 1  # Collect metrics every second
    performance_thresholds:
      max_latency_ms: 5000          # Allow higher latency under stress
      max_memory_mb: 2048
      max_cpu_percent: 95
    resource_sampling_rate: 0.1     # Sample 10% of operations for detailed metrics
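
# Note: the thresholds above are deliberately loose so that stress runs are not
# aborted early. How the framework reacts when a threshold is exceeded (flagging
# vs. failing the run) is not specified here, so treat them as reporting limits
# rather than hard guarantees. A tighter, purely illustrative variant for
# SLA-style validation (hypothetical values, not derived from the baseline suite)
# might look like:
#
#   monitoring:
#     performance_thresholds:
#       max_latency_ms: 500
#       max_memory_mb: 1024
#       max_cpu_percent: 80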

# Multiple server instances for distributed load testing
servers:
  - name: "stress_target_1"
    command: "${STRESS_SERVER_1_CMD:python -m my_fastmcp_server --performance-mode --instance 1}"
    transport: "stdio"
    timeout: 60
    enabled: true
    env_vars:
      PERFORMANCE_MODE: "true"
      MAX_CONNECTIONS: "1000"
      BUFFER_SIZE: "65536"
      GC_THRESHOLD: "high"

  - name: "stress_target_2"
    command: "${STRESS_SERVER_2_CMD:python -m my_fastmcp_server --performance-mode --instance 2}"
    transport: "stdio"
    timeout: 60
    enabled: true
    env_vars:
      PERFORMANCE_MODE: "true"
      INSTANCE_ID: "2"

  - name: "stress_target_3"
    command: "${STRESS_SERVER_3_CMD:python -m my_fastmcp_server --performance-mode --instance 3}"
    transport: "stdio"
    timeout: 60
    enabled: false                  # Enable for multi-instance testing
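
# The ${NAME:default} placeholders above resolve NAME from the "variables" section
# at the end of this file (or, presumably, from the environment), falling back to
# the text after the colon when NAME is unset. To run a distributed three-instance
# test, set "enabled: true" on stress_target_3 and point STRESS_SERVER_3_CMD at your
# server command, for example (a hypothetical variant mirroring the defaults above):
#
#   STRESS_SERVER_3_CMD: "python -m my_fastmcp_server --performance-mode --max-connections 2000 --instance 3"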

# Comprehensive stress testing suites
test_suites:
  - name: "Baseline Performance Measurement"
    description: "Establish performance baseline before stress testing"
    enabled: true
    tags: ["baseline", "performance"]
    parallel: false                 # Sequential for accurate baseline
    timeout: 600                    # Covers the sequential tests below (120 + 300 + 90 seconds)

    tests:
      - name: "single_operation_latency"
        description: "Measure single operation latency"
        test_type: "tool_call"
        target: "echo"
        parameters:
          message: "baseline_test"
        retry_count: 1000           # Multiple samples for statistical significance
        timeout: 120
        tags: ["latency", "baseline"]

      - name: "throughput_measurement"
        description: "Measure maximum throughput"
        test_type: "tool_call"
        target: "echo"
        parameters:
          message: "throughput_test"
        retry_count: 10000
        enable_progress: true
        timeout: 300
        tags: ["throughput", "baseline"]

      - name: "resource_usage_baseline"
        description: "Measure baseline resource usage"
        test_type: "tool_call"
        target: "resource_monitor"
        parameters:
          duration: 60
          metrics: ["cpu", "memory", "io", "network"]
        timeout: 90
        tags: ["resources", "baseline"]
- name: "Load Pattern Testing"
description: "Test various load patterns and traffic shapes"
enabled: true
tags: ["load", "patterns"]
parallel: true
timeout: 900
tests:
- name: "constant_load_test"
description: "Sustained constant load testing"
test_type: "tool_call"
target: "echo"
parameters:
message: "constant_load_${ITERATION}"
retry_count: 50000 # 50k operations
timeout: 600
tags: ["constant", "sustained"]
- name: "spike_load_test"
description: "Sudden traffic spike testing"
test_type: "tool_call"
target: "spike_handler"
parameters:
spike_factor: 10
spike_duration: 30
baseline_rps: 100
enable_progress: true
timeout: 120
tags: ["spike", "burst"]
- name: "ramp_up_test"
description: "Gradual load ramp-up testing"
test_type: "tool_call"
target: "ramp_processor"
parameters:
start_rps: 1
end_rps: 1000
ramp_duration: 300
hold_duration: 60
enable_progress: true
timeout: 480
tags: ["ramp", "gradual"]
- name: "oscillating_load_test"
description: "Oscillating load pattern testing"
test_type: "tool_call"
target: "oscillator"
parameters:
min_rps: 10
max_rps: 500
period_seconds: 60
cycles: 10
enable_progress: true
timeout: 720
tags: ["oscillating", "variable"]
- name: "Concurrency Stress Testing"
description: "High concurrency and parallelism stress testing"
enabled: true
tags: ["concurrency", "parallel"]
parallel: true
timeout: 600
tests:
- name: "maximum_concurrent_connections"
description: "Test maximum concurrent connection limits"
test_type: "tool_call"
target: "connection_holder"
parameters:
hold_duration: 120
connection_type: "persistent"
retry_count: 1000 # Attempt 1000 concurrent connections
timeout: 180
tags: ["connections", "limits"]
- name: "thread_pool_exhaustion"
description: "Test thread pool exhaustion and recovery"
test_type: "tool_call"
target: "thread_consumer"
parameters:
threads_to_consume: 500
hold_duration: 60
timeout: 120
tags: ["threads", "exhaustion"]
- name: "async_operation_flood"
description: "Flood server with async operations"
test_type: "tool_call"
target: "async_processor"
parameters:
async_operations: 10000
operation_type: "concurrent"
enable_progress: true
timeout: 300
tags: ["async", "flood"]
- name: "request_queue_overflow"
description: "Test request queue overflow handling"
test_type: "tool_call"
target: "queue_filler"
parameters:
queue_size_target: 100000
overflow_strategy: "backpressure"
timeout: 180
tags: ["queue", "overflow"]
- name: "Memory Stress Testing"
description: "Memory-intensive operations and pressure testing"
enabled: true
tags: ["memory", "stress"]
parallel: true
timeout: 800
tests:
- name: "large_payload_processing"
description: "Process increasingly large payloads"
test_type: "tool_call"
target: "payload_processor"
parameters:
payload_sizes: ["1MB", "10MB", "100MB", "500MB"]
processing_type: "memory_intensive"
enable_progress: true
timeout: 600
tags: ["payload", "large"]
- name: "memory_leak_detection"
description: "Long-running test to detect memory leaks"
test_type: "tool_call"
target: "memory_allocator"
parameters:
allocation_pattern: "incremental"
test_duration: 1800 # 30 minutes
leak_detection: true
enable_progress: true
timeout: 2000
tags: ["leaks", "long_running"]
- name: "garbage_collection_pressure"
description: "Create GC pressure and measure impact"
test_type: "tool_call"
target: "gc_stress_tester"
parameters:
allocation_rate: "high"
object_lifetime: "mixed"
gc_frequency_target: 100
timeout: 300
tags: ["gc", "pressure"]
- name: "out_of_memory_recovery"
description: "Test OOM recovery mechanisms"
test_type: "tool_call"
target: "oom_simulator"
parameters:
memory_limit: "512MB"
allocation_strategy: "aggressive"
recovery_validation: true
expected_error: "out of memory"
timeout: 120
tags: ["oom", "recovery"]
- name: "CPU Intensive Stress Testing"
description: "CPU-bound operations and computational stress"
enabled: true
tags: ["cpu", "computational"]
parallel: true
timeout: 600
tests:
- name: "cpu_bound_operations"
description: "CPU-intensive computational tasks"
test_type: "tool_call"
target: "cpu_intensive_task"
parameters:
operation_type: "prime_calculation"
complexity: "high"
iterations: 1000000
retry_count: 10 # Multiple CPU-bound tasks
timeout: 300
tags: ["cpu_bound", "computation"]
- name: "algorithm_complexity_test"
description: "Test algorithmic complexity under load"
test_type: "tool_call"
target: "algorithm_tester"
parameters:
algorithms: ["sorting", "searching", "graph_traversal"]
input_sizes: [1000, 10000, 100000]
complexity_analysis: true
enable_progress: true
timeout: 400
tags: ["algorithms", "complexity"]
- name: "multi_core_utilization"
description: "Test multi-core CPU utilization"
test_type: "tool_call"
target: "parallel_processor"
parameters:
cores_to_utilize: "all"
workload_distribution: "balanced"
cpu_affinity: "round_robin"
timeout: 240
tags: ["multicore", "utilization"]
- name: "I/O Stress Testing"
description: "Intensive I/O operations and bandwidth testing"
enabled: true
tags: ["io", "bandwidth"]
parallel: true
timeout: 700
tests:
- name: "disk_io_stress"
description: "Intensive disk I/O operations"
test_type: "tool_call"
target: "disk_io_tester"
parameters:
io_pattern: "random_write"
file_size: "1GB"
block_size: "4KB"
concurrent_operations: 100
enable_progress: true
timeout: 600
tags: ["disk", "io"]
- name: "network_bandwidth_test"
description: "Network bandwidth saturation testing"
test_type: "tool_call"
target: "bandwidth_tester"
parameters:
data_volume: "10GB"
connection_count: 50
transfer_pattern: "bulk"
enable_progress: true
timeout: 400
tags: ["network", "bandwidth"]
- name: "file_descriptor_exhaustion"
description: "Test file descriptor limit handling"
test_type: "tool_call"
target: "fd_consumer"
parameters:
target_fd_count: 10000
fd_type: "mixed"
cleanup_strategy: "gradual"
timeout: 180
tags: ["file_descriptors", "limits"]
- name: "Error Handling Under Stress"
description: "Error handling and recovery under stress conditions"
enabled: true
tags: ["errors", "recovery", "stress"]
parallel: true
timeout: 400
tests:
- name: "error_flood_test"
description: "Flood server with error-inducing requests"
test_type: "tool_call"
target: "error_generator"
parameters:
error_types: ["invalid_params", "timeout", "resource_unavailable"]
error_rate: 0.5 # 50% error rate
total_operations: 10000
timeout: 300
tags: ["errors", "flood"]
- name: "cascading_failure_stress"
description: "Test cascading failure handling under stress"
test_type: "tool_call"
target: "cascade_simulator"
parameters:
initial_failure_rate: 0.1
cascade_probability: 0.3
recovery_time: 30
timeout: 240
tags: ["cascading", "failures"]
- name: "timeout_storm_test"
description: "Multiple simultaneous timeout scenarios"
test_type: "tool_call"
target: "timeout_generator"
parameters:
timeout_patterns: ["random", "burst", "gradual"]
concurrent_timeouts: 100
timeout: 180
tags: ["timeouts", "storm"]
- name: "Resource Exhaustion Testing"
description: "Systematic resource exhaustion and recovery testing"
enabled: true
tags: ["resources", "exhaustion"]
parallel: true
timeout: 900
tests:
- name: "connection_pool_exhaustion"
description: "Exhaust connection pool resources"
test_type: "tool_call"
target: "connection_exhaustor"
parameters:
pool_size: 100
hold_duration: 300
exhaustion_strategy: "gradual"
timeout: 400
tags: ["connections", "pool"]
- name: "buffer_overflow_test"
description: "Test buffer overflow handling"
test_type: "tool_call"
target: "buffer_tester"
parameters:
buffer_sizes: ["64KB", "1MB", "10MB"]
overflow_data: "random"
safety_mechanisms: true
timeout: 180
tags: ["buffers", "overflow"]
- name: "cache_thrashing_test"
description: "Induce cache thrashing and measure impact"
test_type: "tool_call"
target: "cache_thrasher"
parameters:
cache_size: "100MB"
working_set: "1GB"
access_pattern: "random"
timeout: 300
tags: ["cache", "thrashing"]
- name: "Long Duration Stability Testing"
description: "Extended duration stability and endurance testing"
enabled: true
tags: ["stability", "endurance", "soak"]
parallel: false # Sequential for stability testing
timeout: 7200 # 2 hours
tests:
- name: "soak_test_24h"
description: "24-hour soak test simulation"
test_type: "tool_call"
target: "soak_tester"
parameters:
duration: 3600 # 1 hour for demo (would be 86400 for full 24h)
operations_per_minute: 60
stability_monitoring: true
enable_progress: true
timeout: 3900
tags: ["soak", "24h", "stability"]
- name: "resource_leak_detection"
description: "Long-running resource leak detection"
test_type: "tool_call"
target: "leak_detector"
parameters:
monitoring_duration: 1800 # 30 minutes
leak_types: ["memory", "connections", "file_handles"]
detection_threshold: 0.05 # 5% growth threshold
enable_progress: true
timeout: 2000
tags: ["leaks", "monitoring"]

# Stress testing specific variables
variables:
  # Server configurations optimized for stress testing
  STRESS_SERVER_1_CMD: "python -m my_fastmcp_server --performance-mode --max-connections 1000 --instance 1"
  STRESS_SERVER_2_CMD: "python -m my_fastmcp_server --performance-mode --max-connections 1000 --instance 2"
  STRESS_SERVER_3_CMD: "python -m my_fastmcp_server --performance-mode --max-connections 1000 --instance 3"

  # Load testing parameters
  MAX_RPS: "10000"
  STRESS_DURATION: "1800"           # 30 minutes
  RAMP_DURATION: "300"              # 5 minutes

  # Resource limits for testing
  MAX_MEMORY_MB: "2048"
  MAX_CPU_PERCENT: "95"
  MAX_CONNECTIONS: "1000"
  MAX_FILE_DESCRIPTORS: "10000"

  # Payload sizes for testing
  SMALL_PAYLOAD: "1KB"
  MEDIUM_PAYLOAD: "1MB"
  LARGE_PAYLOAD: "100MB"
  XLARGE_PAYLOAD: "500MB"

  # Test iteration counters
  ITERATION: "0"
  BATCH_ID: "stress_batch_1"
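
# ITERATION is referenced by test parameters above (message: "constant_load_${ITERATION}"
# in the constant load test). BATCH_ID and the payload-size and resource-limit variables
# are not used by the suites in this template; they are left here as convenient knobs for
# your own tests, assuming ${...} substitution works in test parameters the same way it
# does in the server commands.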

# Stress Testing Execution Guide:
#
# 1. Baseline Establishment:
# - Always run baseline tests first
# - Document performance metrics before stress testing
# - Establish SLA thresholds
#
# 2. Progressive Load Testing:
# - Start with lower loads and increase gradually
# - Monitor resource utilization continuously
# - Identify breaking points and bottlenecks
#
# 3. Resource Monitoring:
# - Enable all profiling and monitoring features
# - Watch for memory leaks, CPU spikes, I/O bottlenecks
# - Monitor system metrics beyond application metrics
#
# 4. Failure Analysis:
# - Document failure modes and recovery patterns
# - Test error handling under stress conditions
# - Validate graceful degradation mechanisms
#
# 5. Long Duration Testing:
# - Run soak tests to detect stability issues
# - Monitor for gradual resource leaks
# - Validate system behavior over extended periods
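#
# A progressive run order that follows steps 1-5 above, reusing the invocations
# shown in the examples below (a sketch; adjust tags and worker counts to your
# environment):
#
#   mcptesta yaml stress_config.yaml --tag baseline
#   mcptesta yaml stress_config.yaml --tag load --tag patterns
#   mcptesta yaml stress_config.yaml --tag memory --enable-memory-profiling
#   mcptesta yaml stress_config.yaml --tag stability --tag endurance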
#
# Execution Examples:
#
# Full stress test suite:
# mcptesta yaml stress_config.yaml --parallel 16 --timeout 7200
#
# Memory-focused stress testing:
# mcptesta yaml stress_config.yaml --tag memory --enable-memory-profiling
#
# Load pattern testing only:
# mcptesta yaml stress_config.yaml --tag load --tag patterns
#
# Long duration stability testing:
# mcptesta yaml stress_config.yaml --tag stability --tag endurance
#
# CPU stress testing:
# mcptesta yaml stress_config.yaml --tag cpu --tag computational --parallel 8
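#
# Other subsets follow the same pattern; for instance, I/O and bandwidth testing only
# (assuming tag filtering behaves as in the examples above):
# mcptesta yaml stress_config.yaml --tag io --tag bandwidth --parallel 8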