Sybren A. Stüvel ff29d2ef26 Manager: fix jobs getting stuck in pause-requested status
When a Worker signed off while working on a task, and that task's job
was in `pause-requested` status, the task would always be re-queued.
That re-queueing went undetected, and so the job got stuck in
`pause-requested`.

Now such tasks correctly go to `paused` when a Worker signs off, and the
job is subsequently re-checked and, when possible, goes to `paused` as
well.

Note that this does not handle already-stuck jobs. That'll be for the
following commit.
2025-02-28 12:26:01 +01:00
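
The task-side half of the fix is in `returnTaskOfWorker()` in the file below; the job-side re-check lives elsewhere in the state machine. As a minimal sketch of what that re-check amounts to, assuming hypothetical helper names (`CountTasksOfJobInStatus` and this `JobStatusChange` signature are assumptions, not necessarily the real Flamenco API):

// Hypothetical sketch of the job re-check after a Worker sign-off; helper
// names and signatures are assumptions, not actual Flamenco code.
func (sm *StateMachine) recheckPauseRequestedJob(ctx context.Context, job *persistence.Job) error {
	if job.Status != api.JobStatusPauseRequested {
		return nil
	}
	// Now that a signing-off Worker's task goes to `paused` instead of
	// `queued`, a pause-requested job can run out of active tasks...
	numActive, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive) // assumed helper
	if err != nil {
		return err
	}
	if numActive > 0 {
		// Some Worker is still busy; the job stays in `pause-requested`.
		return nil
	}
	// ...at which point the pause request can complete.
	return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "no more active tasks") // assumed signature
}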

// SPDX-License-Identifier: GPL-3.0-or-later

package task_state_machine

import (
	"context"
	"fmt"

	"github.com/rs/zerolog/log"

	"projects.blender.org/studio/flamenco/internal/manager/persistence"
	"projects.blender.org/studio/flamenco/pkg/api"
)
// RequeueActiveTasksOfWorker re-queues all active tasks (should be max one) of this worker.
//
// `reason`: a string that can be appended to text like "Task requeued because "
func (sm *StateMachine) RequeueActiveTasksOfWorker(
	ctx context.Context,
	worker *persistence.Worker,
	reason string,
) error {
	sm.mutex.Lock()
	defer sm.mutex.Unlock()

	// Fetch the tasks to update.
	tasksJobs, err := sm.persist.FetchTasksOfWorkerInStatus(
		ctx, worker, api.TaskStatusActive)
	if err != nil {
		return err
	}

	// Run each task change through the task state machine. Note that only
	// the error of the last task change is returned to the caller.
	var lastErr error
	for _, taskJobWorker := range tasksJobs {
		lastErr = sm.returnTaskOfWorker(ctx, &taskJobWorker.Task, taskJobWorker.JobUUID, worker, reason)
	}

	return lastErr
}
// RequeueFailedTasksOfWorkerOfJob re-queues all failed tasks of this worker on this job.
//
// `reason`: a string that can be appended to text like "Task requeued because "
func (sm *StateMachine) RequeueFailedTasksOfWorkerOfJob(
	ctx context.Context,
	worker *persistence.Worker,
	jobUUID string,
	reason string,
) error {
	sm.mutex.Lock()
	defer sm.mutex.Unlock()

	// Fetch the tasks to update.
	tasks, err := sm.persist.FetchTasksOfWorkerInStatusOfJob(
		ctx, worker, api.TaskStatusFailed, jobUUID)
	if err != nil {
		return err
	}

	// Run each task change through the task state machine.
	var lastErr error
	for _, task := range tasks {
		lastErr = sm.returnTaskOfWorker(ctx, task, jobUUID, worker, reason)
	}

	return lastErr
}
// returnTaskOfWorker returns the task to the task pool.
// This either re-queues the task for execution, or pauses it, depending on the current job status.
func (sm *StateMachine) returnTaskOfWorker(
	ctx context.Context,
	task *persistence.Task,
	jobUUID string,
	worker *persistence.Worker,
	reason string,
) error {
	logger := log.With().
		Str("worker", worker.UUID).
		Str("reason", reason).
		Logger()

	job, err := sm.persist.FetchJob(ctx, jobUUID)
	if err != nil {
		return fmt.Errorf("could not requeue task of worker %q: %w", worker.UUID, err)
	}

	// Depending on the job's status, a Worker returning its task to the pool
	// should make it go to 'queued' or 'paused'.
	var targetTaskStatus api.TaskStatus
	switch job.Status {
	case api.JobStatusPauseRequested, api.JobStatusPaused:
		targetTaskStatus = api.TaskStatusPaused
	default:
		targetTaskStatus = api.TaskStatusQueued
	}

	logger.Info().
		Str("task", task.UUID).
		Str("newTaskStatus", string(targetTaskStatus)).
		Msg("returning task to pool")

	// Write to task activity that it got requeued because of worker sign-off.
	task.Activity = fmt.Sprintf("Task was %s by Manager because %s", targetTaskStatus, reason)
	if err := sm.persist.SaveTaskActivity(ctx, task); err != nil {
		logger.Warn().Err(err).
			Str("task", task.UUID).
			Str("reason", reason).
			Str("activity", task.Activity).
			Msg("error saving task activity to database")
	}

	if err := sm.taskStatusChange(ctx, task, targetTaskStatus); err != nil {
		logger.Warn().Err(err).
			Str("task", task.UUID).
			Str("reason", reason).
			Msg("error returning task to pool")
	}

	// The error is already logged by the log storage.
	_ = sm.logStorage.WriteTimestamped(logger, jobUUID, task.UUID, task.Activity)

	// Failures to save the activity or to change the task status are only
	// logged above; they do not fail the whole operation.
	return nil
}
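
For context, a caller on the Worker sign-off path would use the entry point above roughly like this. This is a hypothetical sketch: only `RequeueActiveTasksOfWorker` comes from this file; the handler name and the reason string are made up.

// Hypothetical sign-off handler; not part of this file.
func onWorkerSignOff(ctx context.Context, sm *StateMachine, worker *persistence.Worker) {
	// Since this commit, an active task of a pause-requested job goes to
	// `paused` instead of `queued`, after which the job itself can be
	// re-checked and paused as well.
	if err := sm.RequeueActiveTasksOfWorker(ctx, worker, "worker signed off"); err != nil {
		log.Warn().Err(err).Str("worker", worker.UUID).Msg("could not return tasks of signing-off worker")
	}
}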