Sybren A. Stüvel 84f93e7502 Transition from ex-GORM structs to sqlc structs (2/5)
Replace old used-to-be-GORM datastructures (#104305) with sqlc-generated
structs. This also makes it possible to use more specific structs that
are more tailored to the specific queries, increasing efficiency.

This commit mostly deals with workers, including the sleep schedule and
task scheduler.

Functional changes are kept to a minimum, as the API still serves the
same data.

Because this work covers so much of Flamenco's code, it's been split up
into different commits. Each commit brings Flamenco to a state where it
compiles and unit tests pass. Only the result of the final commit has
actually been tested properly.

Ref: #104343
2024-12-04 14:00:13 +01:00

76 lines
2.2 KiB
Go

package timeout_checker
// SPDX-License-Identifier: GPL-3.0-or-later
import (
"context"
"github.com/rs/zerolog/log"
"projects.blender.org/studio/flamenco/internal/manager/persistence/sqlc"
"projects.blender.org/studio/flamenco/pkg/api"
)
// checkWorkers fetches the workers that have timed out and hands each of them
// to timeoutWorker.
//
// The cutoff is the current time of ttc.clock, in UTC, minus ttc.workerTimeout;
// it is passed as-is to FetchTimedOutWorkers. A fetch error is logged and ends
// this check; nothing is reported back to the caller.
func (ttc *TimeoutChecker) checkWorkers(ctx context.Context) {
	timeoutThreshold := ttc.clock.Now().UTC().Add(-ttc.workerTimeout)

	// The threshold is queried in UTC but logged in local time.
	logger := log.With().
		Time("threshold", timeoutThreshold.Local()).
		Logger()
	logger.Trace().Msg("TimeoutChecker: finding all awake workers that have not been seen since threshold")

	workers, err := ttc.persist.FetchTimedOutWorkers(ctx, timeoutThreshold)
	if err != nil {
		// Use the contextual logger so the error entry also carries the threshold,
		// like every other message emitted by this function.
		logger.Error().Err(err).Msg("TimeoutChecker: error fetching timed-out workers from database")
		return
	}

	if len(workers) == 0 {
		logger.Trace().Msg("TimeoutChecker: no timed-out workers")
		return
	}

	logger.Debug().
		Int("numWorkers", len(workers)).
		Msg("TimeoutChecker: failing all awake workers that have not been seen since threshold")

	for _, worker := range workers {
		ttc.timeoutWorker(ctx, worker)
	}
}
// timeoutWorker handles a single timed-out worker.
//
// It sets the worker's status to api.WorkerStatusError, calls
// StatusChangeClear on it, saves the worker, re-queues the worker's active
// tasks, and finally broadcasts the status change. Errors from saving or
// re-queueing are logged only; the remaining steps still run.
func (ttc *TimeoutChecker) timeoutWorker(ctx context.Context, worker *sqlc.Worker) {
	logger := log.With().
		Str("worker", worker.UUID).
		Str("name", worker.Name).
		Str("lastSeenAt", worker.LastSeenAt.Time.String()).
		Logger()
	logger.Warn().Msg("TimeoutChecker: worker timed out")

	// Remember the status from before the overwrite; the broadcast reports it.
	statusBefore := api.WorkerStatus(worker.Status)

	worker.Status = api.WorkerStatusError
	worker.StatusChangeClear()

	if saveErr := ttc.persist.SaveWorker(ctx, worker); saveErr != nil {
		logger.Error().Err(saveErr).Msg("TimeoutChecker: error saving timed-out worker to database")
	}

	if requeueErr := ttc.taskStateMachine.RequeueActiveTasksOfWorker(ctx, worker, "worker timed out"); requeueErr != nil {
		logger.Error().Err(requeueErr).Msg("TimeoutChecker: error re-queueing tasks of timed-out worker")
	}

	// Broadcast worker change via SocketIO. The event is built only now, so it
	// reads the worker's fields as they are after the save and re-queue calls.
	update := api.EventWorkerUpdate{
		Id:             worker.UUID,
		Name:           worker.Name,
		PreviousStatus: ptr(statusBefore),
		Status:         api.WorkerStatusError,
		Updated:        worker.UpdatedAt.Time,
		Version:        worker.Software,
	}
	ttc.broadcaster.BroadcastWorkerUpdate(update)
}
// ptr returns a pointer to a fresh copy of value, which is handy for filling
// pointer-typed struct fields from constants or expressions.
func ptr[T any](value T) *T {
	p := new(T)
	*p = value
	return p
}