Manager: add blocking of workers when they fail certain tasks too often
When a worker fails too many tasks of the same task type on the same job, it gets blocked from running that task type on that job.
Parent: 56abc825a6
Commit: fd31a85bcd
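
For orientation before the diff: below is a condensed, illustrative sketch of the decision flow this commit adds to the Manager's task-failure handling. The identifiers mirror those in the diff, but the stand-in types, the `Persistence` interface, and the `maybeBlocklist` function are simplified assumptions for the sketch; the real implementation (`maybeBlocklistWorker` further down) also logs, wraps errors, and additionally subtracts workers that already failed this particular task.

// Package sketch is an illustrative condensation of the blocklisting flow
// added in this commit; it is not part of the Flamenco source tree.
package sketch

import "context"

// Stand-ins for the real persistence types used in the diff below.
type Job struct{ UUID string }
type Worker struct{ UUID string }
type Task struct {
	UUID string
	Type string
	Job  *Job
}

// Persistence lists only the methods this commit adds to the PersistenceService interface.
type Persistence interface {
	CountTaskFailuresOfWorker(ctx context.Context, job *Job, worker *Worker, taskType string) (int, error)
	AddWorkerToJobBlocklist(ctx context.Context, job *Job, worker *Worker, taskType string) error
	WorkersLeftToRun(ctx context.Context, job *Job, taskType string) (map[string]bool, error)
}

// maybeBlocklist reports whether the job should hard-fail because, after
// block-listing this worker, no workers are left for this task type.
func maybeBlocklist(ctx context.Context, db Persistence, worker *Worker, task *Task, threshold int) (bool, error) {
	numFailures, err := db.CountTaskFailuresOfWorker(ctx, task.Job, worker, task.Type)
	if err != nil {
		return false, err
	}
	// The failure being handled right now is not persisted yet, so count it too.
	numFailures++

	if numFailures < threshold {
		// Not enough failures on this job + task type to block the worker.
		return false, nil
	}

	// Block the worker from this task type on this job.
	if err := db.AddWorkerToJobBlocklist(ctx, task.Job, worker, task.Type); err != nil {
		return false, err
	}

	// If no capable workers remain, the caller fails the entire job
	// (failJobAfterCatastroficTaskFailure in the diff below).
	workersLeft, err := db.WorkersLeftToRun(ctx, task.Job, task.Type)
	if err != nil {
		return false, err
	}
	return len(workersLeft) == 0, nil
}
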
@@ -53,6 +53,13 @@ type PersistenceService interface {
	// ClearFailureListOfJob clears, en masse for all tasks of this job, the list of workers that failed those tasks.
	ClearFailureListOfJob(context.Context, *persistence.Job) error

	// AddWorkerToJobBlocklist prevents this Worker from getting any task of this type, on this job, from the task scheduler.
	AddWorkerToJobBlocklist(ctx context.Context, job *persistence.Job, worker *persistence.Worker, taskType string) error
	// WorkersLeftToRun returns a set of worker UUIDs that can run tasks of the given type on the given job.
	WorkersLeftToRun(ctx context.Context, job *persistence.Job, taskType string) (map[string]bool, error)
	// CountTaskFailuresOfWorker returns the number of task failures of this worker, on this particular job and task type.
	CountTaskFailuresOfWorker(ctx context.Context, job *persistence.Job, worker *persistence.Worker, taskType string) (int, error)

	// Database queries.
	QueryJobs(ctx context.Context, query api.JobsQuery) ([]*persistence.Job, error)
	QueryJobTaskSummaries(ctx context.Context, jobUUID string) ([]*persistence.Task, error)

@@ -40,6 +40,20 @@ func (m *MockPersistenceService) EXPECT() *MockPersistenceServiceMockRecorder {
	return m.recorder
}

// AddWorkerToJobBlocklist mocks base method.
func (m *MockPersistenceService) AddWorkerToJobBlocklist(arg0 context.Context, arg1 *persistence.Job, arg2 *persistence.Worker, arg3 string) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "AddWorkerToJobBlocklist", arg0, arg1, arg2, arg3)
	ret0, _ := ret[0].(error)
	return ret0
}

// AddWorkerToJobBlocklist indicates an expected call of AddWorkerToJobBlocklist.
func (mr *MockPersistenceServiceMockRecorder) AddWorkerToJobBlocklist(arg0, arg1, arg2, arg3 interface{}) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddWorkerToJobBlocklist", reflect.TypeOf((*MockPersistenceService)(nil).AddWorkerToJobBlocklist), arg0, arg1, arg2, arg3)
}

// AddWorkerToTaskFailedList mocks base method.
func (m *MockPersistenceService) AddWorkerToTaskFailedList(arg0 context.Context, arg1 *persistence.Task, arg2 *persistence.Worker) (int, error) {
	m.ctrl.T.Helper()

@@ -83,6 +97,21 @@ func (mr *MockPersistenceServiceMockRecorder) ClearFailureListOfTask(arg0, arg1
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ClearFailureListOfTask", reflect.TypeOf((*MockPersistenceService)(nil).ClearFailureListOfTask), arg0, arg1)
}

// CountTaskFailuresOfWorker mocks base method.
func (m *MockPersistenceService) CountTaskFailuresOfWorker(arg0 context.Context, arg1 *persistence.Job, arg2 *persistence.Worker, arg3 string) (int, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "CountTaskFailuresOfWorker", arg0, arg1, arg2, arg3)
	ret0, _ := ret[0].(int)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// CountTaskFailuresOfWorker indicates an expected call of CountTaskFailuresOfWorker.
func (mr *MockPersistenceServiceMockRecorder) CountTaskFailuresOfWorker(arg0, arg1, arg2, arg3 interface{}) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CountTaskFailuresOfWorker", reflect.TypeOf((*MockPersistenceService)(nil).CountTaskFailuresOfWorker), arg0, arg1, arg2, arg3)
}

// CreateWorker mocks base method.
func (m *MockPersistenceService) CreateWorker(arg0 context.Context, arg1 *persistence.Worker) error {
	m.ctrl.T.Helper()

@@ -315,6 +344,21 @@ func (mr *MockPersistenceServiceMockRecorder) WorkerSeen(arg0, arg1 interface{})
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WorkerSeen", reflect.TypeOf((*MockPersistenceService)(nil).WorkerSeen), arg0, arg1)
}

// WorkersLeftToRun mocks base method.
func (m *MockPersistenceService) WorkersLeftToRun(arg0 context.Context, arg1 *persistence.Job, arg2 string) (map[string]bool, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "WorkersLeftToRun", arg0, arg1, arg2)
	ret0, _ := ret[0].(map[string]bool)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// WorkersLeftToRun indicates an expected call of WorkersLeftToRun.
func (mr *MockPersistenceServiceMockRecorder) WorkersLeftToRun(arg0, arg1, arg2 interface{}) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WorkersLeftToRun", reflect.TypeOf((*MockPersistenceService)(nil).WorkersLeftToRun), arg0, arg1, arg2)
}

// MockChangeBroadcaster is a mock of ChangeBroadcaster interface.
type MockChangeBroadcaster struct {
	ctrl *gomock.Controller

@@ -159,7 +159,17 @@ func (f *Flamenco) onTaskFailed(
	if err != nil {
		return fmt.Errorf("adding worker to failure list of task: %w", err)
	}
-	// f.maybeBlocklistWorker(ctx, logger, worker, task)

	logger = logger.With().Str("taskType", task.Type).Logger()
	shouldHardFail, err := f.maybeBlocklistWorker(ctx, logger, worker, task)
	if err != nil {
		return fmt.Errorf("block-listing worker: %w", err)
	}
	if shouldHardFail {
		// Hard failure because of blocklisting should simply fail the entire job.
		// There are no more workers left to finish it.
		return f.failJobAfterCatastroficTaskFailure(ctx, logger, worker, task)
	}

	// Determine whether this is soft or hard failure.
	threshold := f.config.Get().TaskFailAfterSoftFailCount

@@ -173,6 +183,120 @@ func (f *Flamenco) onTaskFailed(
	return f.hardFailTask(ctx, logger, worker, task, numFailed)
}

// maybeBlocklistWorker potentially block-lists the Worker, and checks whether
// there are any workers left to run tasks of this type.
//
// Returns "must hard-fail". That is, returns `false` if there are still workers
// left to run tasks of this type, on this job.
//
// If the worker is NOT block-listed at this moment, always returns `false`.
//
// Returns `true` if ALL workers that can execute this task type are blocked
// from working on this job.
func (f *Flamenco) maybeBlocklistWorker(
	ctx context.Context,
	logger zerolog.Logger,
	worker *persistence.Worker,
	task *persistence.Task,
) (bool, error) {
	numFailures, err := f.persist.CountTaskFailuresOfWorker(ctx, task.Job, worker, task.Type)
	if err != nil {
		return false, fmt.Errorf("counting failures of worker on job %q, task type %q: %w", task.Job.UUID, task.Type, err)
	}
	// The received task update hasn't been persisted in the database yet,
	// so we should count that too.
	numFailures++

	threshold := f.config.Get().BlocklistThreshold
	if numFailures < threshold {
		logger.Info().Int("numFailedTasks", numFailures).Msg("not enough failed tasks to blocklist worker")
		// TODO: This might need special handling, as this worker will be blocked
		// from retrying this particular task. It could have been the last worker to
		// be allowed this task type; if that is the case, the job is now stuck.
		return false, nil
	}

	// Blocklist the Worker.
	if err := f.blocklistWorker(ctx, logger, worker, task); err != nil {
		return false, err
	}

	// Return hard-failure if there are no workers left for this task type.
	numWorkers, err := f.numWorkersCapableOfRunningTask(ctx, task)
	return numWorkers == 0, err
}

func (f *Flamenco) blocklistWorker(
	ctx context.Context,
	logger zerolog.Logger,
	worker *persistence.Worker,
	task *persistence.Task,
) error {
	logger.Warn().
		Str("job", task.Job.UUID).
		Msg("block-listing worker")
	err := f.persist.AddWorkerToJobBlocklist(ctx, task.Job, worker, task.Type)
	if err != nil {
		return fmt.Errorf("adding worker to block list: %w", err)
	}

	// TODO: requeue all tasks of this job & task type that were hard-failed by this worker.
	return nil
}

func (f *Flamenco) numWorkersCapableOfRunningTask(ctx context.Context, task *persistence.Task) (int, error) {
	// See which workers are left to run tasks of this type, on this job.
	workersLeft, err := f.persist.WorkersLeftToRun(ctx, task.Job, task.Type)
	if err != nil {
		return 0, fmt.Errorf("fetching workers available to run tasks of type %q on job %q: %w",
			task.Type, task.Job.UUID, err)
	}

	// Remove (from the list of available workers) those who failed this task before.
	failers, err := f.persist.FetchTaskFailureList(ctx, task)
	if err != nil {
		return 0, fmt.Errorf("fetching failure list of task %q: %w", task.UUID, err)
	}
	for _, failure := range failers {
		delete(workersLeft, failure.UUID)
	}

	return len(workersLeft), nil
}

// failJobAfterCatastroficTaskFailure fails the entire job.
// This function is meant to be called when a task failure causes the worker to
// be blocked, leaving no more workers to run tasks of this type.
func (f *Flamenco) failJobAfterCatastroficTaskFailure(
	ctx context.Context,
	logger zerolog.Logger,
	worker *persistence.Worker,
	task *persistence.Task,
) error {
	taskLog := fmt.Sprintf(
		"Task failed by worker %s, Manager will fail the entire job as there are no more workers left for tasks of type %q.",
		worker.Identifier(), task.Type,
	)
	if err := f.logStorage.WriteTimestamped(logger, task.Job.UUID, task.UUID, taskLog); err != nil {
		logger.Error().Err(err).Msg("error writing failure notice to task log")
	}

	if err := f.stateMachine.TaskStatusChange(ctx, task, api.TaskStatusFailed); err != nil {
		logger.Error().
			Err(err).
			Str("newStatus", string(api.TaskStatusFailed)).
			Msg("error changing task status")
	}

	newJobStatus := api.JobStatusFailed
	logger.Info().
		Str("job", task.Job.UUID).
		Str("newJobStatus", string(newJobStatus)).
		Msg("no more workers left to run tasks of this type, failing the entire job")
	reason := fmt.Sprintf("no more workers left to run tasks of type %q", task.Type)
	return f.stateMachine.JobStatusChange(ctx, task.Job, newJobStatus, reason)
}

func (f *Flamenco) hardFailTask(
	ctx context.Context,
	logger zerolog.Logger,

@@ -464,24 +464,30 @@ func TestTaskUpdateFailed(t *testing.T) {
		WorkerID: &worker.ID,
		Job: &mockJob,
		Activity: "pre-update activity",
		Type: "misc",
	}

	conf := config.Conf{
		Base: config.Base{
			TaskFailAfterSoftFailCount: 3,
			BlocklistThreshold: 65535, // This test doesn't cover blocklisting.
		},
	}
	mf.config.EXPECT().Get().Return(&conf).AnyTimes()

	const numSubTests = 2
	// Expect the task to be fetched for each sub-test:
-	mf.persistence.EXPECT().FetchTask(gomock.Any(), taskID).Return(&mockTask, nil).Times(2)
	mf.persistence.EXPECT().FetchTask(gomock.Any(), taskID).Return(&mockTask, nil).Times(numSubTests)

	// Expect a 'touch' of the task for each sub-test:
-	mf.persistence.EXPECT().TaskTouchedByWorker(gomock.Any(), &mockTask).Times(2)
-	mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker).Times(2)
	mf.persistence.EXPECT().TaskTouchedByWorker(gomock.Any(), &mockTask).Times(numSubTests)
	mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker).Times(numSubTests)

	// Mimic that this is always the first failure of this worker/job/tasktype combo:
	mf.persistence.EXPECT().CountTaskFailuresOfWorker(gomock.Any(), &mockJob, &worker, "misc").Return(0, nil).Times(numSubTests)

	{
-		// Expect the Worker to be added to the list of workers.
		// Expect the Worker to be added to the list of failed workers.
		// This returns 1, which is less than the failure threshold -> soft failure expected.
		mf.persistence.EXPECT().AddWorkerToTaskFailedList(gomock.Any(), &mockTask, &worker).Return(1, nil)

@@ -515,6 +521,146 @@ func TestTaskUpdateFailed(t *testing.T) {
	}
}

func TestBlockingAfterFailure(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	mf := newMockedFlamenco(mockCtrl)
	worker := testWorker()

	// Construct the JSON request object.
	taskUpdate := api.TaskUpdateJSONRequestBody{
		TaskStatus: ptr(api.TaskStatusFailed),
	}

	// Construct the task that's supposed to be updated.
	taskID := "181eab68-1123-4790-93b1-94309a899411"
	jobID := "e4719398-7cfa-4877-9bab-97c2d6c158b5"
	mockJob := persistence.Job{UUID: jobID}
	mockTask := persistence.Task{
		UUID: taskID,
		Worker: &worker,
		WorkerID: &worker.ID,
		Job: &mockJob,
		Activity: "pre-update activity",
		Type: "misc",
	}

	conf := config.Conf{
		Base: config.Base{
			TaskFailAfterSoftFailCount: 3,
			BlocklistThreshold: 3,
		},
	}
	mf.config.EXPECT().Get().Return(&conf).AnyTimes()

	const numSubTests = 3
	// Expect the task to be fetched for each sub-test:
	mf.persistence.EXPECT().FetchTask(gomock.Any(), taskID).Return(&mockTask, nil).Times(numSubTests)

	// Expect a 'touch' of the task for each sub-test:
	mf.persistence.EXPECT().TaskTouchedByWorker(gomock.Any(), &mockTask).Times(numSubTests)
	mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker).Times(numSubTests)

	// Mimic that this is the 3rd failure of this worker/job/tasktype combo, and thus should trigger a block.
	// Returns 2 because there have been 2 previous failures.
	mf.persistence.EXPECT().
		CountTaskFailuresOfWorker(gomock.Any(), &mockJob, &worker, "misc").
		Return(2, nil).
		Times(numSubTests)

	// Expect the worker to be blocked.
	mf.persistence.EXPECT().
		AddWorkerToJobBlocklist(gomock.Any(), &mockJob, &worker, "misc").
		Times(numSubTests)

	{
		// Mimic that there is another worker available to work on this task, so the job should continue happily.
		mf.persistence.EXPECT().WorkersLeftToRun(gomock.Any(), &mockJob, "misc").
			Return(map[string]bool{"60453eec-5a26-43e9-9da2-d00506d492cc": true}, nil)
		mf.persistence.EXPECT().FetchTaskFailureList(gomock.Any(), &mockTask).
			Return([]*persistence.Worker{ /* It shouldn't matter whether the failing worker is here or not. */ }, nil)

		// Expect the Worker to be added to the list of failed workers for this task.
		// This returns 1, which is less than the failure threshold -> soft failure.
		mf.persistence.EXPECT().AddWorkerToTaskFailedList(gomock.Any(), &mockTask, &worker).Return(1, nil)

		// Expect soft failure of the task.
		mf.stateMachine.EXPECT().TaskStatusChange(gomock.Any(), &mockTask, api.TaskStatusSoftFailed)
		mf.logStorage.EXPECT().WriteTimestamped(gomock.Any(), jobID, taskID,
			"Task failed by 1 worker, Manager will mark it as soft failure. 2 more failures will cause hard failure.")

		// Do the call.
		echoCtx := mf.prepareMockedJSONRequest(taskUpdate)
		requestWorkerStore(echoCtx, &worker)
		err := mf.flamenco.TaskUpdate(echoCtx, taskID)
		assert.NoError(t, err)
		assertResponseEmpty(t, echoCtx)
	}

	{
		// Test without any workers left to run these tasks on this job due to blocklisting. This should fail the entire job.
		mf.persistence.EXPECT().WorkersLeftToRun(gomock.Any(), &mockJob, "misc").
			Return(map[string]bool{}, nil)
		mf.persistence.EXPECT().FetchTaskFailureList(gomock.Any(), &mockTask).
			Return([]*persistence.Worker{ /* It shouldn't matter whether the failing worker is here or not. */ }, nil)

		// Expect the Worker to be added to the list of failed workers for this task.
		// This returns 1, which is less than the failure threshold -> soft failure if it were only based on this metric.
		mf.persistence.EXPECT().AddWorkerToTaskFailedList(gomock.Any(), &mockTask, &worker).Return(1, nil)

		// Expect hard failure of the task, because there are no workers left to perform it.
		mf.stateMachine.EXPECT().TaskStatusChange(gomock.Any(), &mockTask, api.TaskStatusFailed)
		mf.logStorage.EXPECT().WriteTimestamped(gomock.Any(), jobID, taskID,
			"Task failed by worker дрон (e7632d62-c3b8-4af0-9e78-01752928952c), Manager will fail the entire job "+
				"as there are no more workers left for tasks of type \"misc\".")

		// Expect failure of the job.
		mf.stateMachine.EXPECT().
			JobStatusChange(gomock.Any(), &mockJob, api.JobStatusFailed, "no more workers left to run tasks of type \"misc\"")

		// Do the call.
		echoCtx := mf.prepareMockedJSONRequest(taskUpdate)
		requestWorkerStore(echoCtx, &worker)
		err := mf.flamenco.TaskUpdate(echoCtx, taskID)
		assert.NoError(t, err)
		assertResponseEmpty(t, echoCtx)
	}

	{
		// Test where the one remaining worker is not blocklisted, but did already fail this particular task.
		// This also makes the task impossible to run, and should just fail the entire job.
		theOtherFailingWorker := persistence.Worker{
			UUID: "ce312357-29cd-4389-81ab-4d43e30945f8",
		}
		mf.persistence.EXPECT().WorkersLeftToRun(gomock.Any(), &mockJob, "misc").
			Return(map[string]bool{theOtherFailingWorker.UUID: true}, nil)
		mf.persistence.EXPECT().FetchTaskFailureList(gomock.Any(), &mockTask).
			Return([]*persistence.Worker{&theOtherFailingWorker}, nil)

		// Expect the Worker to be added to the list of failed workers for this task.
		// This returns 1, which is less than the failure threshold -> soft failure if it were only based on this metric.
		mf.persistence.EXPECT().AddWorkerToTaskFailedList(gomock.Any(), &mockTask, &worker).Return(1, nil)

		// Expect hard failure of the task, because there are no workers left to perform it.
		mf.stateMachine.EXPECT().TaskStatusChange(gomock.Any(), &mockTask, api.TaskStatusFailed)
		mf.logStorage.EXPECT().WriteTimestamped(gomock.Any(), jobID, taskID,
			"Task failed by worker дрон (e7632d62-c3b8-4af0-9e78-01752928952c), Manager will fail the entire job "+
				"as there are no more workers left for tasks of type \"misc\".")

		// Expect failure of the job.
		mf.stateMachine.EXPECT().
			JobStatusChange(gomock.Any(), &mockJob, api.JobStatusFailed, "no more workers left to run tasks of type \"misc\"")

		// Do the call.
		echoCtx := mf.prepareMockedJSONRequest(taskUpdate)
		requestWorkerStore(echoCtx, &worker)
		err := mf.flamenco.TaskUpdate(echoCtx, taskID)
		assert.NoError(t, err)
		assertResponseEmpty(t, echoCtx)
	}
}

func TestMayWorkerRun(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

@@ -92,7 +92,7 @@ type Base struct {

	/* This many failures (on a given job+task type combination) will ban a worker
	 * from that task type on that job. */
-	// BlocklistThreshold int `yaml:"blocklist_threshold"`
	BlocklistThreshold int `yaml:"blocklist_threshold"`

	// When this many workers have tried the task and failed, it will be hard-failed
	// (even when there are workers left that could technically retry the task).

@@ -39,7 +39,7 @@ var defaultConfig = Conf{
	// // be accurate enough for this type of cleanup.
	// TaskCleanupMaxAge: 14 * 24 * time.Hour,

-	// BlocklistThreshold: 3,
	BlocklistThreshold: 3,
	TaskFailAfterSoftFailCount: 3,

	// WorkerCleanupStatus: []string{string(api.WorkerStatusOffline)},

@@ -20,6 +20,8 @@ func TestDefaultSettings(t *testing.T) {
	assert.Equal(t, false, config.Variables["ffmpeg"].IsTwoWay)
	assert.Equal(t, "ffmpeg", config.Variables["ffmpeg"].Values[0].Value)
	assert.Equal(t, VariablePlatformLinux, config.Variables["ffmpeg"].Values[0].Platform)

	assert.Greater(t, config.BlocklistThreshold, 0)
}

func TestVariableValidation(t *testing.T) {

@@ -7,7 +7,13 @@ import (
)

func (db *DB) migrate() error {
-	err := db.gormDB.AutoMigrate(&Job{}, &Task{}, &TaskFailure{}, &Worker{})
	err := db.gormDB.AutoMigrate(
		&Job{},
		&JobBlock{},
		&Task{},
		&TaskFailure{},
		&Worker{},
	)
	if err != nil {
		return fmt.Errorf("failed to automigrate database: %v", err)
	}

internal/manager/persistence/jobs_blocklist.go (new file, 92 lines)
@@ -0,0 +1,92 @@
package persistence

// SPDX-License-Identifier: GPL-3.0-or-later

import (
	"context"
	"math"
	"time"

	"gorm.io/gorm/clause"
)

// JobBlock keeps track of which Worker is not allowed to run which task type on which job.
type JobBlock struct {
	// Don't include the standard Gorm UpdatedAt or DeletedAt fields, as they're useless here.
	// Entries will never be updated, and should never be soft-deleted but just purged from existence.
	ID        uint
	CreatedAt time.Time

	JobID uint `gorm:"default:0;uniqueIndex:job_worker_tasktype"`
	Job   *Job `gorm:"foreignkey:JobID;references:ID;constraint:OnDelete:CASCADE"`

	WorkerID uint    `gorm:"default:0;uniqueIndex:job_worker_tasktype"`
	Worker   *Worker `gorm:"foreignkey:WorkerID;references:ID;constraint:OnDelete:CASCADE"`

	TaskType string `gorm:"uniqueIndex:job_worker_tasktype"`
}

// AddWorkerToJobBlocklist prevents this Worker from getting any task of this type, on this job, from the task scheduler.
func (db *DB) AddWorkerToJobBlocklist(ctx context.Context, job *Job, worker *Worker, taskType string) error {
	entry := JobBlock{
		Job:      job,
		Worker:   worker,
		TaskType: taskType,
	}
	tx := db.gormDB.WithContext(ctx).
		Clauses(clause.OnConflict{DoNothing: true}).
		Create(&entry)
	return tx.Error
}

// WorkersLeftToRun returns a set of worker UUIDs that can run tasks of the given type on the given job.
//
// NOTE: this does NOT consider the task failure list, which blocks individual
// workers from individual tasks. This is ONLY concerning the job blocklist.
func (db *DB) WorkersLeftToRun(ctx context.Context, job *Job, taskType string) (map[string]bool, error) {
	// Find the IDs of the workers blocked on this job + tasktype combo.
	blockedWorkers := db.gormDB.
		Table("workers as blocked_workers").
		Select("blocked_workers.id").
		Joins("inner join job_blocks JB on blocked_workers.id = JB.worker_id").
		Where("JB.job_id = ?", job.ID).
		Where("JB.task_type = ?", taskType)

	// Find the workers NOT blocked.
	workers := []*Worker{}
	tx := db.gormDB.WithContext(ctx).
		Model(&Worker{}).
		Select("uuid").
		Where("id not in (?)", blockedWorkers).
		Scan(&workers)
	if tx.Error != nil {
		return nil, tx.Error
	}

	// From the list of workers, construct the map of UUIDs.
	uuidMap := map[string]bool{}
	for _, worker := range workers {
		uuidMap[worker.UUID] = true
	}

	return uuidMap, nil
}

// CountTaskFailuresOfWorker returns the number of task failures of this worker, on this particular job and task type.
func (db *DB) CountTaskFailuresOfWorker(ctx context.Context, job *Job, worker *Worker, taskType string) (int, error) {
	var numFailures int64

	tx := db.gormDB.WithContext(ctx).
		Model(&TaskFailure{}).
		Joins("inner join tasks T on task_failures.task_id = T.id").
		Where("task_failures.worker_id = ?", worker.ID).
		Where("T.job_id = ?", job.ID).
		Where("T.type = ?", taskType).
		Count(&numFailures)

	if numFailures > math.MaxInt {
		panic("overflow error in number of failures")
	}

	return int(numFailures), tx.Error
}

internal/manager/persistence/jobs_blocklist_test.go (new file, 137 lines)
@@ -0,0 +1,137 @@
package persistence

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// SPDX-License-Identifier: GPL-3.0-or-later

func TestAddWorkerToJobBlocklist(t *testing.T) {
	ctx, close, db, job, _ := jobTasksTestFixtures(t)
	defer close()

	worker := createWorker(ctx, t, db)

	{
		// Add a worker to the block list.
		err := db.AddWorkerToJobBlocklist(ctx, job, worker, "blender")
		assert.NoError(t, err)

		list := []JobBlock{}
		tx := db.gormDB.Model(&JobBlock{}).Scan(&list)
		assert.NoError(t, tx.Error)
		if assert.Len(t, list, 1) {
			entry := list[0]
			assert.Equal(t, entry.JobID, job.ID)
			assert.Equal(t, entry.WorkerID, worker.ID)
			assert.Equal(t, entry.TaskType, "blender")
		}
	}

	{
		// Adding the same worker again should be a no-op.
		err := db.AddWorkerToJobBlocklist(ctx, job, worker, "blender")
		assert.NoError(t, err)

		list := []JobBlock{}
		tx := db.gormDB.Model(&JobBlock{}).Scan(&list)
		assert.NoError(t, tx.Error)
		assert.Len(t, list, 1, "No new entry should have been created")
	}
}

func TestWorkersLeftToRun(t *testing.T) {
	ctx, close, db, job, _ := jobTasksTestFixtures(t)
	defer close()

	// No workers.
	left, err := db.WorkersLeftToRun(ctx, job, "blender")
	assert.NoError(t, err)
	assert.Empty(t, left)

	worker1 := createWorker(ctx, t, db)
	worker2 := createWorkerFrom(ctx, t, db, *worker1)

	uuidMap := func(workers ...*Worker) map[string]bool {
		theMap := map[string]bool{}
		for _, worker := range workers {
			theMap[worker.UUID] = true
		}
		return theMap
	}

	// Two workers, no blocklist.
	left, err = db.WorkersLeftToRun(ctx, job, "blender")
	if assert.NoError(t, err) {
		assert.Equal(t, uuidMap(worker1, worker2), left)
	}

	// Two workers, one blocked.
	_ = db.AddWorkerToJobBlocklist(ctx, job, worker1, "blender")
	left, err = db.WorkersLeftToRun(ctx, job, "blender")
	if assert.NoError(t, err) {
		assert.Equal(t, uuidMap(worker2), left)
	}

	// Two workers, both blocked.
	_ = db.AddWorkerToJobBlocklist(ctx, job, worker2, "blender")
	left, err = db.WorkersLeftToRun(ctx, job, "blender")
	assert.NoError(t, err)
	assert.Empty(t, left)

	// Two workers, unknown job.
	fakeJob := Job{Model: Model{ID: 327}}
	left, err = db.WorkersLeftToRun(ctx, &fakeJob, "blender")
	if assert.NoError(t, err) {
		assert.Equal(t, uuidMap(worker1, worker2), left)
	}
}

func TestCountTaskFailuresOfWorker(t *testing.T) {
	ctx, close, db, dbJob, authoredJob := jobTasksTestFixtures(t)
	defer close()

	task0, _ := db.FetchTask(ctx, authoredJob.Tasks[0].UUID)
	task1, _ := db.FetchTask(ctx, authoredJob.Tasks[1].UUID)
	task2, _ := db.FetchTask(ctx, authoredJob.Tasks[2].UUID)

	// Sanity check on the test data.
	assert.Equal(t, "blender", task0.Type)
	assert.Equal(t, "blender", task1.Type)
	assert.Equal(t, "ffmpeg", task2.Type)

	worker1 := createWorker(ctx, t, db)
	worker2 := createWorkerFrom(ctx, t, db, *worker1)

	// Store some failures for different tasks
	_, _ = db.AddWorkerToTaskFailedList(ctx, task0, worker1)
	_, _ = db.AddWorkerToTaskFailedList(ctx, task1, worker1)
	_, _ = db.AddWorkerToTaskFailedList(ctx, task1, worker2)
	_, _ = db.AddWorkerToTaskFailedList(ctx, task2, worker1)

	// Multiple failures.
	numBlender1, err := db.CountTaskFailuresOfWorker(ctx, dbJob, worker1, "blender")
	if assert.NoError(t, err) {
		assert.Equal(t, 2, numBlender1)
	}

	// Single failure, but multiple tasks exist of this type.
	numBlender2, err := db.CountTaskFailuresOfWorker(ctx, dbJob, worker2, "blender")
	if assert.NoError(t, err) {
		assert.Equal(t, 1, numBlender2)
	}

	// Single failure, only one task of this type exists.
	numFFMpeg1, err := db.CountTaskFailuresOfWorker(ctx, dbJob, worker1, "ffmpeg")
	if assert.NoError(t, err) {
		assert.Equal(t, 1, numFFMpeg1)
	}

	// No failure.
	numFFMpeg2, err := db.CountTaskFailuresOfWorker(ctx, dbJob, worker2, "ffmpeg")
	if assert.NoError(t, err) {
		assert.Equal(t, 0, numFFMpeg2)
	}
}