
Replace old used-to-be-GORM data structures (#104305) with sqlc-generated structs. This also makes it possible to use more specific structs that are more tailored to the specific queries, increasing efficiency. This commit mostly deals with workers, including the sleep schedule and task scheduler. Functional changes are kept to a minimum, as the API still serves the same data. Because this work covers so much of Flamenco's code, it's been split up into different commits. Each commit brings Flamenco to a state where it compiles and unit tests pass. Only the result of the final commit has actually been tested properly. Ref: #104343
76 lines
2.2 KiB
Go
76 lines
2.2 KiB
Go
package timeout_checker
|
|
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
import (
|
|
"context"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
"projects.blender.org/studio/flamenco/internal/manager/persistence/sqlc"
|
|
"projects.blender.org/studio/flamenco/pkg/api"
|
|
)
|
|
|
|
func (ttc *TimeoutChecker) checkWorkers(ctx context.Context) {
|
|
timeoutThreshold := ttc.clock.Now().UTC().Add(-ttc.workerTimeout)
|
|
logger := log.With().
|
|
Time("threshold", timeoutThreshold.Local()).
|
|
Logger()
|
|
logger.Trace().Msg("TimeoutChecker: finding all awake workers that have not been seen since threshold")
|
|
|
|
workers, err := ttc.persist.FetchTimedOutWorkers(ctx, timeoutThreshold)
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("TimeoutChecker: error fetching timed-out workers from database")
|
|
return
|
|
}
|
|
|
|
if len(workers) == 0 {
|
|
logger.Trace().Msg("TimeoutChecker: no timed-out workers")
|
|
return
|
|
}
|
|
logger.Debug().
|
|
Int("numWorkers", len(workers)).
|
|
Msg("TimeoutChecker: failing all awake workers that have not been seen since threshold")
|
|
|
|
for _, worker := range workers {
|
|
ttc.timeoutWorker(ctx, worker)
|
|
}
|
|
}
|
|
|
|
// timeoutTask marks a task as 'failed' due to a timeout.
|
|
func (ttc *TimeoutChecker) timeoutWorker(ctx context.Context, worker *sqlc.Worker) {
|
|
logger := log.With().
|
|
Str("worker", worker.UUID).
|
|
Str("name", worker.Name).
|
|
Str("lastSeenAt", worker.LastSeenAt.Time.String()).
|
|
Logger()
|
|
logger.Warn().Msg("TimeoutChecker: worker timed out")
|
|
|
|
prevStatus := worker.Status
|
|
worker.Status = api.WorkerStatusError
|
|
worker.StatusChangeClear()
|
|
|
|
err := ttc.persist.SaveWorker(ctx, worker)
|
|
if err != nil {
|
|
logger.Error().Err(err).Msg("TimeoutChecker: error saving timed-out worker to database")
|
|
}
|
|
|
|
err = ttc.taskStateMachine.RequeueActiveTasksOfWorker(ctx, worker, "worker timed out")
|
|
if err != nil {
|
|
logger.Error().Err(err).Msg("TimeoutChecker: error re-queueing tasks of timed-out worker")
|
|
}
|
|
|
|
// Broadcast worker change via SocketIO
|
|
ttc.broadcaster.BroadcastWorkerUpdate(api.EventWorkerUpdate{
|
|
Id: worker.UUID,
|
|
Name: worker.Name,
|
|
PreviousStatus: ptr(api.WorkerStatus(prevStatus)),
|
|
Status: api.WorkerStatusError,
|
|
Updated: worker.UpdatedAt.Time,
|
|
Version: worker.Software,
|
|
})
|
|
}
|
|
|
|
// ptr returns a pointer to a newly allocated copy of value. It allows taking
// the address of an expression that is not addressable by itself, such as a
// constant or a conversion result.
func ptr[T any](value T) *T {
	copied := new(T)
	*copied = value
	return copied
}
|