flamenco/internal/manager/persistence/jobs_blocklist.go
Sybren A. Stüvel fd31a85bcd Manager: add blocking of workers when they fail certain tasks too much
When a worker fails too many tasks, of the same task type, on the same job,
it'll get blocked from doing those.
2022-06-17 15:49:16 +02:00

93 lines
3.0 KiB
Go

package persistence
// SPDX-License-Identifier: GPL-3.0-or-later
import (
"context"
"math"
"time"
"gorm.io/gorm/clause"
)
// JobBlock keeps track of which Worker is not allowed to run which task type on which job.
//
// The JobID, WorkerID, and TaskType fields all take part in the same
// "job_worker_tasktype" unique index, so each job/worker/task type combination
// can be stored at most once.
type JobBlock struct {
// Don't include the standard Gorm UpdatedAt or DeletedAt fields, as they're useless here.
// Entries will never be updated, and should never be soft-deleted but just purged from existence.
ID uint
CreatedAt time.Time
// JobID is the ID of the job on which the worker is blocked.
JobID uint `gorm:"default:0;uniqueIndex:job_worker_tasktype"`
// Job is the association behind JobID. The constraint requests that block
// entries are deleted along with the job they refer to.
Job *Job `gorm:"foreignkey:JobID;references:ID;constraint:OnDelete:CASCADE"`
// WorkerID is the ID of the worker that is blocked.
WorkerID uint `gorm:"default:0;uniqueIndex:job_worker_tasktype"`
// Worker is the association behind WorkerID. The constraint requests that
// block entries are deleted along with the worker they refer to.
Worker *Worker `gorm:"foreignkey:WorkerID;references:ID;constraint:OnDelete:CASCADE"`
// TaskType is the type of task the worker is no longer allowed to run on the job.
TaskType string `gorm:"uniqueIndex:job_worker_tasktype"`
}
// AddWorkerToJobBlocklist prevents this Worker from getting any task of this
// type, on this job, from the task scheduler.
//
// The entry is inserted with an "on conflict do nothing" clause, so an insert
// that conflicts with an existing row is skipped instead of reported as an
// error. Any other database error is returned as-is.
func (db *DB) AddWorkerToJobBlocklist(ctx context.Context, job *Job, worker *Worker, taskType string) error {
	skipOnConflict := clause.OnConflict{DoNothing: true}
	block := &JobBlock{Job: job, Worker: worker, TaskType: taskType}

	return db.gormDB.WithContext(ctx).
		Clauses(skipOnConflict).
		Create(block).
		Error
}
// WorkersLeftToRun reports which workers can still run tasks of the given type
// on the given job. The result is a set keyed by worker UUID; every key that is
// present maps to true.
//
// NOTE: only the job blocklist is taken into account here. The task failure
// list, which blocks individual workers from individual tasks, is NOT
// considered.
func (db *DB) WorkersLeftToRun(ctx context.Context, job *Job, taskType string) (map[string]bool, error) {
	// Subquery: the IDs of the workers blocked on this job + task type combo.
	blockedIDs := db.gormDB.
		Table("workers as blocked_workers").
		Select("blocked_workers.id").
		Joins("inner join job_blocks JB on blocked_workers.id = JB.worker_id").
		Where("JB.job_id = ?", job.ID).
		Where("JB.task_type = ?", taskType)

	// Main query: the UUID of every worker whose ID is not in the subquery.
	var allowed []*Worker
	err := db.gormDB.WithContext(ctx).
		Model(&Worker{}).
		Select("uuid").
		Where("id not in (?)", blockedIDs).
		Scan(&allowed).
		Error
	if err != nil {
		return nil, err
	}

	// Convert the query result into the set of UUIDs.
	left := make(map[string]bool, len(allowed))
	for i := range allowed {
		left[allowed[i].UUID] = true
	}
	return left, nil
}
// CountTaskFailuresOfWorker counts how often this worker failed a task of the
// given type on the given job.
//
// The count is returned together with the error of the database query, if any.
// It panics when the count does not fit in an int.
func (db *DB) CountTaskFailuresOfWorker(ctx context.Context, job *Job, worker *Worker, taskType string) (int, error) {
	var count int64
	err := db.gormDB.WithContext(ctx).
		Model(&TaskFailure{}).
		Joins("inner join tasks T on task_failures.task_id = T.id").
		Where("task_failures.worker_id = ?", worker.ID).
		Where("T.job_id = ?", job.ID).
		Where("T.type = ?", taskType).
		Count(&count).
		Error

	// The query counts into an int64; only convert when the value fits in an int.
	if count <= math.MaxInt {
		return int(count), err
	}
	panic("overflow error in number of failures")
}