Add comprehensive production deployment documentation

- Add Docker Volume Drivers section with NFS, CIFS/SMB, ZFS, Block Storage configs - Add Docker Swarm multi-node deployment with HA and cloud provider examples - Include enterprise storage integrations and monitoring/scaling configurations - Complete production-ready infrastructure documentation
2025-09-09 14:44:42 -06:00 · 2025-09-09 14:44:42 -06:00 · 1b5b71acad
commit 1b5b71acad
parent f704dc5975
1 changed files with 293 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -228,6 +228,299 @@ go test ./tests/integration/ -v -workers=10 -jobs=50
 2. **Local + Sync**: Job files copied to local storage before rendering
 3. **Shaman Storage**: Content-addressable system with automatic deduplication
 ### 🗄️ Docker Volume Drivers for Production Storage
 Flamenco's render farm requires shared storage accessible by all workers. Here's how to configure different Docker volume drivers for production deployments:
 #### NFS (Network File System)
 ```yaml
 # docker-compose.yml
 # Named volume backed by an NFS export; the manager and the workers all mount
 # it at /shared-storage.
 volumes:
  shared_storage:
    driver: local
    driver_opts:
      type: nfs
      # "intr" removed: the Linux NFS client has ignored it since kernel 2.6.25
      o: addr=192.168.1.100,rw,nolock,hard
      device: ":/export/flamenco-shared"
 services:
  flamenco-manager:
    volumes:
      - shared_storage:/shared-storage
  flamenco-worker:
    volumes:
      - shared_storage:/shared-storage
 ```
 #### CIFS/SMB (Windows File Shares)
 ```yaml
 # docker-compose.yml
 # Named volume backed by a CIFS/SMB share.
 # The share password is read from the CIFS_PASSWORD environment variable (or an
 # untracked .env file) so it is never committed; Compose aborts with the
 # message below when the variable is missing.
 volumes:
  shared_storage:
    driver: local
    driver_opts:
      type: cifs
      o: "username=flamenco,password=${CIFS_PASSWORD:?CIFS_PASSWORD is not set},uid=1000,gid=1000,iocharset=utf8"
      device: "//192.168.1.100/flamenco-share"
 services:
  flamenco-manager:
    volumes:
      - shared_storage:/shared-storage
 ```
 #### ZFS (High-Performance Storage)
 ```yaml
 # docker-compose.yml
 # The local driver takes mount-style type/o/device options and rejects
 # "zfs-name", so the ZFS dataset is exposed by bind-mounting its mountpoint.
 # Assumes dataset tank/flamenco-shared is mounted at the ZFS default path
 # /tank/flamenco-shared on this host; adjust device if mountpoint differs.
 volumes:
  shared_storage:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /tank/flamenco-shared
 services:
  flamenco-manager:
    volumes:
      - shared_storage:/shared-storage
 ```
 #### Block Storage (Cloud/SAN)
 ```yaml
 # NFSv4 mount of a managed file share; an AWS EFS endpoint is shown here.
 # NOTE(review): GCP Filestore or a SAN/NAS NFS gateway would presumably use the
 # same form with a different addr/device — confirm against the provider's docs.
 volumes:
  shared_storage:
    driver: local
    driver_opts:
      type: nfs4
      # rsize/wsize request 1 MiB (1048576-byte) read and write transfers
      o: addr=fs-12345678.efs.us-west-2.amazonaws.com,rsize=1048576,wsize=1048576
      # ":/" mounts the root of the remote export
      device: ":/"
 ```
 #### Third-Party Volume Plugins
 ```yaml
 # Vendor volume plugins (NetApp, Pure Storage, etc.)
 volumes:
  shared_storage:
    driver: 'netapp:latest'
    driver_opts:
      performance: 'high'
      size: '500GB'
      snapshot_policy: 'hourly'
 ```
 #### Performance Considerations
 - **NFS**: Best for mixed OS environments, excellent performance with NFSv4
 - **CIFS/SMB**: Ideal for Windows-heavy environments, good Windows compatibility  
 - **ZFS**: Superior performance and data integrity on a single Linux host; ZFS is a local filesystem, so export the dataset over NFS or SMB when workers on other nodes need access
 - **Block Storage**: Cloud-native scaling, excellent for multi-region deployments
 - **Enterprise Storage**: Hardware-accelerated performance for high-throughput rendering
 ### 🐝 Docker Swarm Multi-Node Deployment
 Scale Flamenco across multiple servers, datacenters, and cloud regions with Docker Swarm orchestration:
 #### Swarm Initialization
 ```bash
 # On the manager node: create the swarm and advertise its address
 docker swarm init --advertise-addr=192.168.1.10
 # On each worker server: join the swarm through the manager's address
 docker swarm join --token=SWMTKN-1-xxx 192.168.1.10:2377
 # Tag nodes with the role/gpu labels used for service placement
 docker node update --label-add role=manager node-1
 docker node update --label-add role=worker node-2
 docker node update --label-add gpu=nvidia node-3
 ```
 #### Production Swarm Stack
 ```yaml
 # flamenco-swarm.yml
 # Swarm stack: one manager, one CPU worker per role=worker node, and two GPU
 # workers on gpu=nvidia nodes, all sharing an NFS-backed volume.
 version: '3.8'
 services:
  flamenco-manager:
    image: flamenco:production
    ports:
      - "8080:8080"
    networks:
      - flamenco-net
    volumes:
      - shared_storage:/shared-storage
      - manager_data:/app/data
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.labels.role == manager
      restart_policy:
        condition: on-failure
        max_attempts: 3
    environment:
      - FLAMENCO_LISTEN=:8080
      # SQLite file lives on the manager_data volume mounted at /app/data
      - FLAMENCO_DATABASE_URL=sqlite:///app/data/flamenco.db
  flamenco-worker:
    image: flamenco:production
    networks:
      - flamenco-net
    volumes:
      - shared_storage:/shared-storage
      - worker_data:/app/data
    deploy:
      mode: global  # One worker per node
      placement:
        constraints:
          - node.labels.role == worker
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 2G
      restart_policy:
        condition: on-failure
        max_attempts: 3
    environment:
      # Reaches the manager by service name over the flamenco-net overlay
      - MANAGER_URL=http://flamenco-manager:8080
      - WORKER_SLEEP_SCHEDULE=22:00-06:00
  # GPU-enabled workers for CUDA/OpenCL rendering
  flamenco-gpu-worker:
    image: flamenco:production-gpu
    networks:
      - flamenco-net
    volumes:
      - shared_storage:/shared-storage
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.labels.gpu == nvidia
      resources:
        reservations:
          generic_resources:
            - discrete_resource_spec:
                kind: 'NVIDIA-GPU'
                value: 1
    environment:
      - MANAGER_URL=http://flamenco-manager:8080
      # "all" is a value of NVIDIA_VISIBLE_DEVICES (NVIDIA container runtime);
      # CUDA_VISIBLE_DEVICES only accepts device indices/UUIDs, so "all" there
      # would hide every GPU from CUDA
      - NVIDIA_VISIBLE_DEVICES=all
 networks:
  flamenco-net:
    driver: overlay
    attachable: true
 volumes:
  shared_storage:
    driver: local
    driver_opts:
      type: nfs4
      o: addr=storage.company.com,rsize=1048576,wsize=1048576
      device: ":/flamenco-shared"
  # Node-local volumes: each node that runs a task gets its own copy
  manager_data:
    driver: local
  worker_data:
    driver: local
 ```
 #### Multi-Datacenter Deployment
 ```bash
 # Deploy the stack
 docker stack deploy -c flamenco-swarm.yml flamenco
 # CPU workers run in global mode (one task per node labeled role=worker), so
 # `docker service scale` does not apply; add capacity by labeling more nodes
 docker node update --label-add role=worker node-4
 # Scale the replicated GPU worker service
 docker service scale flamenco_flamenco-gpu-worker=4
 # Roll out a new manager image (the single replica restarts during the update)
 docker service update --image flamenco:v3.8 flamenco_flamenco-manager
 # Monitor cluster health
 docker service ls
 docker service ps flamenco_flamenco-worker --no-trunc
 ```
 #### Advanced Swarm Configuration
 ```yaml
 # High-availability manager setup
 # NOTE(review): in the production stack each manager replica would get its own
 # node-local manager_data volume and SQLite file — confirm Flamenco Manager
 # supports running several instances before using replicas > 1
  flamenco-manager:
    deploy:
      replicas: 3  # HA setup
      placement:
        # At most one manager task per node, so replicas land on distinct nodes
        max_replicas_per_node: 1
        constraints:
          - node.labels.role == manager
      update_config:
        # Update one task at a time, waiting 30s between tasks; stop-first
        # stops the old task before starting its replacement
        parallelism: 1
        delay: 30s
        order: stop-first
      rollback_config:
        parallelism: 1
        delay: 30s
 # Load balancer for multi-manager setup
 # Requires nodes labeled role=edge; the Swarm Initialization commands do not
 # create that label
  flamenco-lb:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.labels.role == edge
 ```
 #### Cloud Provider Integration
 ```yaml
 # Pin workers to Swarm nodes labeled cloud=aws and zone=us-west-2a
 # (node-label placement only; this does not use AWS ECS or Fargate)
  flamenco-worker:
    deploy:
      placement:
        constraints:
          - node.labels.cloud == aws
          - node.labels.zone == us-west-2a
    environment:
      - AWS_REGION=us-west-2
      - S3_BUCKET=company-flamenco-assets
 # Multi-cloud deployment: one worker service per provider, each constrained to
 # nodes carrying the matching provider label
  flamenco-worker-aws:
    deploy:
      placement:
        constraints:
          - node.labels.provider == aws
  flamenco-worker-gcp:
    deploy:
      placement:
        constraints:
          - node.labels.provider == gcp
 ```
 #### Monitoring & Scaling
 ```bash
 # Manual scaling (Swarm has no built-in autoscaler; run this from a script
 # that watches queue depth). Only replicated services accept --replicas; the
 # global-mode flamenco-worker service grows as role=worker nodes are added
 docker service update --replicas 8 flamenco_flamenco-gpu-worker
 # Health monitoring
 docker service logs -f flamenco_flamenco-manager
 # Resource usage of the containers on the current node (run on each node)
 docker stats $(docker ps -q)
 # Update shared storage across all nodes
 docker service update --mount-add type=volume,src=new_storage,dst=/shared-storage flamenco_flamenco-worker
 ```
 #### Benefits of Swarm Deployment
 - **Geographic Distribution**: Workers across multiple datacenters/clouds
 - **High Availability**: Manager failover and redundancy
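 - **Dynamic Scaling**: Scale worker services up or down on demand; Swarm has no built-in autoscaler, so scaling on render-queue depth needs an external script or tool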
 - **Zero-Downtime Updates**: Rolling updates across the fleet
 - **Resource Management**: CPU/memory/GPU resource allocation
 - **Service Discovery**: Automatic load balancing and networking
 ### Docker Production Deployment
 ```yaml
 version: '3.8'