
Soon there will be another function to requeue tasks of workers by other criteria, so being clear in the name helps. No functional changes.
72 lines
2.1 KiB
Go
72 lines
2.1 KiB
Go
package timeout_checker
|
|
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
import (
|
|
"context"
|
|
|
|
"git.blender.org/flamenco/internal/manager/persistence"
|
|
"git.blender.org/flamenco/pkg/api"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
func (ttc *TimeoutChecker) checkWorkers(ctx context.Context) {
|
|
timeoutThreshold := ttc.clock.Now().UTC().Add(-ttc.workerTimeout)
|
|
logger := log.With().
|
|
Time("threshold", timeoutThreshold.Local()).
|
|
Logger()
|
|
logger.Trace().Msg("TimeoutChecker: finding all awake workers that have not been seen since threshold")
|
|
|
|
workers, err := ttc.persist.FetchTimedOutWorkers(ctx, timeoutThreshold)
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("TimeoutChecker: error fetching timed-out workers from database")
|
|
return
|
|
}
|
|
|
|
if len(workers) == 0 {
|
|
logger.Trace().Msg("TimeoutChecker: no timed-out workers")
|
|
return
|
|
}
|
|
logger.Debug().
|
|
Int("numWorkers", len(workers)).
|
|
Msg("TimeoutChecker: failing all awake workers that have not been seen since threshold")
|
|
|
|
for _, worker := range workers {
|
|
ttc.timeoutWorker(ctx, worker)
|
|
}
|
|
}
|
|
|
|
// timeoutTask marks a task as 'failed' due to a timeout.
|
|
func (ttc *TimeoutChecker) timeoutWorker(ctx context.Context, worker *persistence.Worker) {
|
|
logger := log.With().
|
|
Str("worker", worker.UUID).
|
|
Str("name", worker.Name).
|
|
Str("lastSeenAt", worker.LastSeenAt.String()).
|
|
Logger()
|
|
logger.Warn().Msg("TimeoutChecker: worker timed out")
|
|
|
|
prevStatus := worker.Status
|
|
worker.Status = api.WorkerStatusError
|
|
worker.StatusChangeClear()
|
|
|
|
err := ttc.persist.SaveWorker(ctx, worker)
|
|
if err != nil {
|
|
logger.Error().Err(err).Msg("TimeoutChecker: error saving timed-out worker to database")
|
|
}
|
|
|
|
err = ttc.taskStateMachine.RequeueActiveTasksOfWorker(ctx, worker, "worker timed out")
|
|
if err != nil {
|
|
logger.Error().Err(err).Msg("TimeoutChecker: error re-queueing tasks of timed-out worker")
|
|
}
|
|
|
|
// Broadcast worker change via SocketIO
|
|
ttc.broadcaster.BroadcastWorkerUpdate(api.SocketIOWorkerUpdate{
|
|
Id: worker.UUID,
|
|
Name: worker.Name,
|
|
PreviousStatus: &prevStatus,
|
|
Status: api.WorkerStatusError,
|
|
Updated: worker.UpdatedAt,
|
|
Version: worker.Software,
|
|
})
|
|
}
|