Manager: fix jobs getting stuck in `pause-requested` status
When a Worker signed off while working on a task whose job was in the `pause-requested` state, the Manager would always re-queue that task. The re-queueing then went undetected by the job's pause check, which caused the job to get stuck. Now the task correctly goes to `paused` when the Worker signs off, after which the job is re-checked and, when possible, transitions to `paused` as well. Note that this does not handle already-stuck jobs; that will be addressed in the following commit.
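In short, the Worker sign-off path now picks the task's target status from the job's status, and the job is re-checked afterwards. The snippet below is a minimal sketch of that decision in isolation, using hypothetical simplified status types and helper names (`targetTaskStatus`, `jobStatusAfterTaskPaused`); it is not the Manager's actual code, which is in the diff further down.

```go
package main

import "fmt"

// Hypothetical, simplified status types; the real Manager uses the generated
// api.JobStatus / api.TaskStatus types.
type jobStatus string
type taskStatus string

const (
	jobPauseRequested jobStatus = "pause-requested"
	jobPaused         jobStatus = "paused"
	jobActive         jobStatus = "active"

	taskQueued taskStatus = "queued"
	taskPaused taskStatus = "paused"
)

// targetTaskStatus mirrors the new decision: a task returned by a signing-off
// Worker goes to 'paused' when its job is pausing (or already paused), and
// back to 'queued' otherwise.
func targetTaskStatus(js jobStatus) taskStatus {
	switch js {
	case jobPauseRequested, jobPaused:
		return taskPaused
	default:
		return taskQueued
	}
}

// jobStatusAfterTaskPaused models the follow-up check: once no active tasks
// remain, a 'pause-requested' job can move to 'paused'.
func jobStatusAfterTaskPaused(js jobStatus, activeTasksLeft int) jobStatus {
	if js == jobPauseRequested && activeTasksLeft == 0 {
		return jobPaused
	}
	return js
}

func main() {
	fmt.Println(targetTaskStatus(jobPauseRequested))             // paused
	fmt.Println(targetTaskStatus(jobActive))                     // queued
	fmt.Println(jobStatusAfterTaskPaused(jobPauseRequested, 0))  // paused
}
```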
parent efdc5a5a20
commit ff29d2ef26
@@ -10,6 +10,8 @@ bugs in actually-released versions.
 
 - The conversion from a known path prefix to a variable is now done in a case-insensitive way. This means that if the variable `{storage}` has value `S:\Flamenco`, a path `s:\flamenco\project\file.blend` will be recognised and stored as `{storage}\project\file.blend`.
   This happens uniformly, regardless of the platform. So also on Linux, which has a case-sensitive filesystem, this matching is done in a case-insensitive way. It is very unlikely that a Flamenco configuration has two separate variables, for paths that only differ in their case.
+- Fix issue where jobs could get stuck in `pause-requested` state.
+
 ## 3.6 - released 2024-12-01
@@ -41,19 +41,33 @@ func NewStateMachine(persist PersistenceService, broadcaster ChangeBroadcaster,
 
 // TaskStatusChange updates the task's status to the new one.
 // `task` is expected to still have its original status, and have a filled `Job` pointer.
+// This is the external API endpoint, which ensures the state machine is locked.
 func (sm *StateMachine) TaskStatusChange(
 	ctx context.Context,
 	task *persistence.Task,
 	newTaskStatus api.TaskStatus,
 ) error {
+	sm.mutex.Lock()
+	defer sm.mutex.Unlock()
+
+	return sm.taskStatusChange(ctx, task, newTaskStatus)
+}
+
+// taskStatusChange updates the task's status to the new one.
+// `task` is expected to still have its original status, and have a filled `Job` pointer.
+// This is the internal version of TaskStatusChange(), which assumes the state
+// machine is already locked.
+func (sm *StateMachine) taskStatusChange(
+	ctx context.Context,
+	task *persistence.Task,
+	newTaskStatus api.TaskStatus,
+) error {
 	if task.JobID == 0 {
 		log.Panic().Str("task", task.UUID).Msg("task without job ID, cannot handle this")
 		return nil // Will not run because of the panic.
 	}
 
-	sm.mutex.Lock()
-	defer sm.mutex.Unlock()
-
 	job, err := sm.persist.FetchJobByID(ctx, task.JobID)
 	if err != nil {
 		return fmt.Errorf("cannot fetch the job of task %s: %w", task.UUID, err)
@@ -137,8 +151,7 @@ func (sm *StateMachine) updateJobAfterTaskStatusChange(
 		return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusCompleted, api.JobStatusRequeueing, "task was queued")
 
 	case api.TaskStatusPaused:
-		// Pausing a task has no impact on the job.
-		return nil
+		return sm.updateJobOnTaskStatusPaused(ctx, job)
 
 	case api.TaskStatusCanceled:
 		return sm.updateJobOnTaskStatusCanceled(ctx, logger, job)
@@ -222,6 +235,19 @@ func (sm *StateMachine) updateJobOnTaskStatusCanceled(ctx context.Context, logge
 	return nil
 }
 
+// updateJobOnTaskStatusPaused checks if all tasks are paused, and if so, pauses the entire job.
+func (sm *StateMachine) updateJobOnTaskStatusPaused(ctx context.Context, job *persistence.Job) error {
+	toBePaused, err := sm.isJobPausingComplete(ctx, job)
+	if err != nil {
+		return err
+	}
+	if !toBePaused {
+		return nil
+	}
+
+	return sm.jobStatusChange(ctx, job, api.JobStatusPaused, "last task got paused")
+}
+
 // updateJobOnTaskStatusFailed conditionally escalates the failure of a task to fail the entire job.
 func (sm *StateMachine) updateJobOnTaskStatusFailed(ctx context.Context, logger zerolog.Logger, job *persistence.Job) error {
 	// Count the number of failed tasks. If it is over the threshold, fail the job.
@@ -4,6 +4,7 @@ package task_state_machine
 
 import (
 	"context"
+	"fmt"
 
 	"github.com/rs/zerolog/log"
 	"projects.blender.org/studio/flamenco/internal/manager/persistence"
@@ -18,6 +19,9 @@ func (sm *StateMachine) RequeueActiveTasksOfWorker(
 	worker *persistence.Worker,
 	reason string,
 ) error {
+	sm.mutex.Lock()
+	defer sm.mutex.Unlock()
+
 	// Fetch the tasks to update.
 	tasksJobs, err := sm.persist.FetchTasksOfWorkerInStatus(
 		ctx, worker, api.TaskStatusActive)
@@ -28,7 +32,7 @@ func (sm *StateMachine) RequeueActiveTasksOfWorker(
 	// Run each task change through the task state machine.
 	var lastErr error
 	for _, taskJobWorker := range tasksJobs {
-		lastErr = sm.requeueTaskOfWorker(ctx, &taskJobWorker.Task, taskJobWorker.JobUUID, worker, reason)
+		lastErr = sm.returnTaskOfWorker(ctx, &taskJobWorker.Task, taskJobWorker.JobUUID, worker, reason)
 	}
 
 	return lastErr
@@ -43,6 +47,9 @@ func (sm *StateMachine) RequeueFailedTasksOfWorkerOfJob(
 	jobUUID string,
 	reason string,
 ) error {
+	sm.mutex.Lock()
+	defer sm.mutex.Unlock()
+
 	// Fetch the tasks to update.
 	tasks, err := sm.persist.FetchTasksOfWorkerInStatusOfJob(
 		ctx, worker, api.TaskStatusFailed, jobUUID)
@@ -53,13 +60,15 @@ func (sm *StateMachine) RequeueFailedTasksOfWorkerOfJob(
 	// Run each task change through the task state machine.
 	var lastErr error
 	for _, task := range tasks {
-		lastErr = sm.requeueTaskOfWorker(ctx, task, jobUUID, worker, reason)
+		lastErr = sm.returnTaskOfWorker(ctx, task, jobUUID, worker, reason)
 	}
 
 	return lastErr
 }
 
-func (sm *StateMachine) requeueTaskOfWorker(
+// returnTaskOfWorker returns the task to the task pool.
+// This either re-queues the task for execution, or pauses it, depending on the current job status.
+func (sm *StateMachine) returnTaskOfWorker(
 	ctx context.Context,
 	task *persistence.Task,
 	jobUUID string,
@@ -71,12 +80,27 @@ func (sm *StateMachine) requeueTaskOfWorker(
 		Str("reason", reason).
 		Logger()
 
+	job, err := sm.persist.FetchJob(ctx, jobUUID)
+	if err != nil {
+		return fmt.Errorf("could not requeue task of worker %q: %w", worker.UUID, err)
+	}
+
+	// Depending on the job's status, a Worker returning its task to the pool should make it go to 'queued' or 'paused'.
+	var targetTaskStatus api.TaskStatus
+	switch job.Status {
+	case api.JobStatusPauseRequested, api.JobStatusPaused:
+		targetTaskStatus = api.TaskStatusPaused
+	default:
+		targetTaskStatus = api.TaskStatusQueued
+	}
+
 	logger.Info().
 		Str("task", task.UUID).
-		Msg("re-queueing task")
+		Str("newTaskStatus", string(targetTaskStatus)).
+		Msg("returning task to pool")
 
 	// Write to task activity that it got requeued because of worker sign-off.
-	task.Activity = "Task was requeued by Manager because " + reason
+	task.Activity = fmt.Sprintf("Task was %s by Manager because %s", targetTaskStatus, reason)
 	if err := sm.persist.SaveTaskActivity(ctx, task); err != nil {
 		logger.Warn().Err(err).
 			Str("task", task.UUID).
@@ -85,12 +109,11 @@ func (sm *StateMachine) requeueTaskOfWorker(
 			Msg("error saving task activity to database")
 	}
 
-	err := sm.TaskStatusChange(ctx, task, api.TaskStatusQueued)
-	if err != nil {
+	if err := sm.taskStatusChange(ctx, task, targetTaskStatus); err != nil {
 		logger.Warn().Err(err).
 			Str("task", task.UUID).
 			Str("reason", reason).
-			Msg("error queueing task")
+			Msg("error returning task to pool")
 	}
 
 	// The error is already logged by the log storage.
@@ -32,11 +32,12 @@ func TestRequeueActiveTasksOfWorker(t *testing.T) {
 	task1PrevStatus := task1.Status
 	task2PrevStatus := task2.Status
 
+	mocks.persist.EXPECT().FetchJob(ctx, job.UUID).Return(job, nil).Times(len(workerTasks))
 	mocks.persist.EXPECT().FetchTasksOfWorkerInStatus(ctx, &worker, api.TaskStatusActive).Return(workerTasks, nil)
 
 	// Expect this re-queueing to end up in the task's log and activity.
 	logMsg1 := "task changed status active -> queued"
-	logMsg2 := "Task was requeued by Manager because worker had to test"
+	logMsg2 := "Task was queued by Manager because worker had to test"
 	task1WithActivity := *task1
 	task1WithActivity.Activity = logMsg2
 	task2WithActivity := *task2
@@ -82,3 +83,73 @@ func TestRequeueActiveTasksOfWorker(t *testing.T) {
 	err := sm.RequeueActiveTasksOfWorker(ctx, &worker, "worker had to test")
 	require.NoError(t, err)
 }
+
+// Check that a Worker's task goes to 'paused' when the worker signs off and the
+// job is in 'pause-requested' state. This should also ensure that, if this
+// happens to be the last-active task of the job, the entire job goes to
+// `paused` state.
+func TestPauseActiveTasksOfWorker(t *testing.T) {
+	mockCtrl, ctx, sm, mocks := taskStateMachineTestFixtures(t)
+	defer mockCtrl.Finish()
+
+	worker := persistence.Worker{
+		UUID: "3ed470c8-d41e-4668-92d0-d799997433a4",
+		Name: "testert",
+	}
+
+	// Mock that the job is in pause-requested, with one paused task and one active task.
+	task1, job := taskWithStatus(api.JobStatusPauseRequested, api.TaskStatusActive)
+	taskOfSameJob(task1, api.TaskStatusPaused) // Create a 2nd, already-paused task.
+
+	jobPrevStatus := job.Status
+	task1PrevStatus := task1.Status
+
+	mocks.persist.EXPECT().FetchJob(ctx, job.UUID).Return(job, nil)
+	mocks.persist.EXPECT().FetchTasksOfWorkerInStatus(ctx, &worker, api.TaskStatusActive).
+		Return([]persistence.TaskJob{{Task: *task1, JobUUID: job.UUID}}, nil)
+
+	mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive).Return(0, 2, nil) // 0 of 2 active.
+
+	// Expect this re-queueing to end up in the task's log and activity.
+	logMsg1 := "task changed status active -> paused"
+	logMsg2 := "Task was paused by Manager because worker had to test"
+	task1WithActivity := *task1
+	task1WithActivity.Activity = logMsg2
+	task1WithActivityAndStatus := task1WithActivity
+	task1WithActivityAndStatus.Status = api.TaskStatusPaused
+	mocks.persist.EXPECT().SaveTaskActivity(ctx, &task1WithActivity)
+	mocks.persist.EXPECT().SaveTaskStatus(ctx, &task1WithActivityAndStatus)
+
+	jobWithStatus := *job
+	jobWithStatus.Status = api.JobStatusPaused
+	jobWithStatus.Activity = "Changed to status \"paused\": last task got paused"
+	mocks.persist.EXPECT().SaveJobStatus(ctx, &jobWithStatus)
+
+	mocks.logStorage.EXPECT().WriteTimestamped(gomock.Any(), job.UUID, task1.UUID, logMsg1)
+	mocks.logStorage.EXPECT().WriteTimestamped(gomock.Any(), job.UUID, task1.UUID, logMsg2)
+
+	mocks.broadcaster.EXPECT().BroadcastTaskUpdate(api.EventTaskUpdate{
+		Activity:       logMsg2,
+		Id:             task1.UUID,
+		JobId:          job.UUID,
+		Name:           task1.Name,
+		PreviousStatus: &task1PrevStatus,
+		Status:         api.TaskStatusPaused,
+		Updated:        task1.UpdatedAt.Time,
+	})
+	mocks.broadcaster.EXPECT().BroadcastJobUpdate(api.EventJobUpdate{
+		Id:             job.UUID,
+		Name:           &job.Name,
+		PreviousStatus: &jobPrevStatus,
+		Priority:       int(job.Priority),
+		RefreshTasks:   false,
+		Status:         api.JobStatusPaused,
+		Type:           job.JobType,
+		Updated:        job.UpdatedAt.Time,
+	})
+
+	mocks.expectFetchJobOfTask(task1, job)
+
+	err := sm.RequeueActiveTasksOfWorker(ctx, &worker, "worker had to test")
+	require.NoError(t, err)
+}