87 lines
2.6 KiB
Go
87 lines
2.6 KiB
Go
package timeout_checker
|
|
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
|
|
"git.blender.org/flamenco/internal/manager/persistence"
|
|
"git.blender.org/flamenco/pkg/api"
|
|
)
|
|
|
|
func (ttc *TimeoutChecker) checkTasks(ctx context.Context) {
|
|
timeoutThreshold := ttc.clock.Now().UTC().Add(-ttc.taskTimeout)
|
|
logger := log.With().
|
|
Time("threshold", timeoutThreshold.Local()).
|
|
Logger()
|
|
logger.Debug().Msg("TimeoutChecker: finding active tasks that have not been touched since threshold")
|
|
|
|
tasks, err := ttc.persist.FetchTimedOutTasks(ctx, timeoutThreshold)
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("TimeoutChecker: error fetching timed-out tasks from database")
|
|
return
|
|
}
|
|
|
|
if len(tasks) == 0 {
|
|
logger.Trace().Msg("TimeoutChecker: no timed-out tasks")
|
|
return
|
|
}
|
|
logger.Debug().
|
|
Int("numTasks", len(tasks)).
|
|
Msg("TimeoutChecker: failing all active tasks that have not been touched since threshold")
|
|
|
|
for _, task := range tasks {
|
|
ttc.timeoutTask(ctx, task)
|
|
}
|
|
}
|
|
|
|
// timeoutTask marks a task as 'failed' due to a timeout.
|
|
func (ttc *TimeoutChecker) timeoutTask(ctx context.Context, task *persistence.Task) {
|
|
workerIdent, logger := ttc.assignedWorker(task)
|
|
|
|
task.Activity = fmt.Sprintf("Task timed out on worker %s", workerIdent)
|
|
err := ttc.taskStateMachine.TaskStatusChange(ctx, task, api.TaskStatusFailed)
|
|
if err != nil {
|
|
logger.Error().Err(err).Msg("TimeoutChecker: error saving timed-out task to database")
|
|
}
|
|
|
|
err = ttc.logStorage.WriteTimestamped(logger, task.Job.UUID, task.UUID,
|
|
fmt.Sprintf("Task timed out. It was assigned to worker %s, but untouched since %s",
|
|
workerIdent, task.LastTouchedAt.Format(time.RFC3339)))
|
|
if err != nil {
|
|
logger.Error().Err(err).Msg("TimeoutChecker: error writing timeout info to the task log")
|
|
}
|
|
}
|
|
|
|
// assignedWorker returns a description of the worker assigned to this task,
|
|
// and a logger configured for it.
|
|
func (ttc *TimeoutChecker) assignedWorker(task *persistence.Task) (string, zerolog.Logger) {
|
|
logCtx := log.With().Str("task", task.UUID)
|
|
|
|
if task.WorkerID == nil {
|
|
logger := logCtx.Logger()
|
|
logger.Warn().Msg("TimeoutChecker: task timed out, but was not assigned to any worker")
|
|
return "-unassigned-", logger
|
|
}
|
|
|
|
if task.Worker == nil {
|
|
logger := logCtx.Logger()
|
|
logger.Warn().Uint("workerDBID", *task.WorkerID).
|
|
Msg("TimeoutChecker: task is assigned to worker that no longer exists")
|
|
return "-unknown-", logger
|
|
}
|
|
|
|
logCtx = logCtx.
|
|
Str("worker", task.Worker.UUID).
|
|
Str("workerName", task.Worker.Name)
|
|
logger := logCtx.Logger()
|
|
logger.Warn().Msg("TimeoutChecker: task timed out")
|
|
|
|
return task.Worker.Identifier(), logger
|
|
}
|