Sybren A. Stüvel c1a9b1e877 Manager: force a poll of the farm status when a job/worker changes state
This introduces the concept of 'event listener', which is now used by
the farm status service to respond to events on the event bus.

This makes it possible to relax the regular polling from every 5 seconds to
every 30 seconds. That poll is now only needed as a backup, just in case
events are missed or things otherwise change without the event bus logic
noticing.
2024-03-01 22:36:38 +01:00

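The PersistenceService and EventBus types used by this service are interfaces defined elsewhere in the Manager. The sketch below is inferred purely from how this file uses them and is not the real definition; in particular, the names eventbus.Listener and eventbus.FarmStatusEvent are assumptions made for illustration only.

// Illustrative sketch; signatures inferred from their use in the file below.
type PersistenceService interface {
	// Both methods return a count per status, as consumed by allIn() and the
	// map lookups in checkFarmStatus().
	SummarizeWorkerStatuses(ctx context.Context) (map[api.WorkerStatus]int, error)
	SummarizeJobStatuses(ctx context.Context) (map[api.JobStatus]int, error)
}

type EventBus interface {
	// AddListener registers an event listener; *Service qualifies via its
	// OnEvent(topic eventbus.EventTopic, payload interface{}) method.
	// "eventbus.Listener" is an assumed name for that listener interface.
	AddListener(listener eventbus.Listener)
	// BroadcastFarmStatusEvent publishes the event built by
	// eventbus.NewFarmStatusEvent(); "eventbus.FarmStatusEvent" is an assumed
	// name for its type.
	BroadcastFarmStatusEvent(event eventbus.FarmStatusEvent)
}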

// Package farmstatus provides a status indicator for the entire Flamenco farm.
package farmstatus

import (
	"context"
	"errors"
	"slices"
	"sync"
	"time"

	"github.com/rs/zerolog/log"

	"projects.blender.org/studio/flamenco/internal/manager/eventbus"
	"projects.blender.org/studio/flamenco/pkg/api"
	"projects.blender.org/studio/flamenco/pkg/website"
)

const (
	// pollWait determines how often the persistence layer is queried to get the
	// counts & statuses of workers and jobs.
	//
	// Note that this is the time between polls, i.e. between one poll operation
	// finishing and the next one starting.
	pollWait = 30 * time.Second
)
// Service keeps track of the overall farm status.
type Service struct {
	persist    PersistenceService
	eventbus   EventBus
	mutex      sync.Mutex
	lastReport api.FarmStatusReport
	forcePoll  chan struct{} // Send anything here to force a poll, if none is running yet.
}
// NewService returns a 'farm status' service. Run its Run() function in a
// goroutine to make it actually do something.
func NewService(persist PersistenceService, eventbus EventBus) *Service {
	service := Service{
		persist:   persist,
		eventbus:  eventbus,
		mutex:     sync.Mutex{},
		forcePoll: make(chan struct{}, 1),
		lastReport: api.FarmStatusReport{
			Status: api.FarmStatusStarting,
		},
	}

	eventbus.AddListener(&service)

	return &service
}
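
// A typical wiring of this service, following the constructor's contract above
// (illustrative only; the actual call site lives elsewhere in the Manager):
//
//	service := farmstatus.NewService(persist, eventbus)
//	go service.Run(ctx)
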
// Run the farm status polling loop.
func (s *Service) Run(ctx context.Context) {
	log.Debug().Msg("farm status: polling service running")
	defer log.Debug().Msg("farm status: polling service stopped")

	for {
		select {
		case <-ctx.Done():
			return
		case <-time.After(pollWait):
			s.poll(ctx)
		case <-s.forcePoll:
			s.poll(ctx)
		}
	}
}
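
// OnEvent is the event listener callback; NewService registers this Service on
// the event bus via AddListener. It forces a farm status poll whenever a job
// or worker actually changes status, so that the report reacts faster than the
// regular pollWait-based poll, which now only acts as a fallback.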
func (s *Service) OnEvent(topic eventbus.EventTopic, payload interface{}) {
	forcePoll := false
	eventSubject := ""

	switch event := payload.(type) {
	case api.EventJobUpdate:
		forcePoll = event.PreviousStatus != nil && *event.PreviousStatus != event.Status
		eventSubject = "job"
	case api.EventWorkerUpdate:
		forcePoll = event.PreviousStatus != nil && *event.PreviousStatus != event.Status
		eventSubject = "worker"
	}

	if !forcePoll {
		return
	}

	log.Debug().
		Str("event", string(topic)).
		Msgf("farm status: investigating after %s status change", eventSubject)

	// Polling queries the database, and thus can have a non-trivial duration.
	// Better to run in the Run() goroutine.
	select {
	case s.forcePoll <- struct{}{}:
	default:
		// If sending to the channel fails, there is already a struct{}{} in
		// there, and thus a poll will be triggered ASAP anyway.
	}
}
// Report returns the last-known farm status report.
//
// It is updated from the Run() function, both by the regular pollWait timer
// and whenever OnEvent forces a poll.
func (s *Service) Report() api.FarmStatusReport {
	s.mutex.Lock()
	defer s.mutex.Unlock()
	return s.lastReport
}

// updateStatusReport updates the last status report in a thread-safe way.
// It returns whether the report changed.
func (s *Service) updateStatusReport(report api.FarmStatusReport) bool {
	s.mutex.Lock()
	defer s.mutex.Unlock()

	reportChanged := s.lastReport != report
	s.lastReport = report
	return reportChanged
}
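
// poll refreshes the farm status report, and broadcasts it over the event bus
// if it changed.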
func (s *Service) poll(ctx context.Context) {
	report := s.checkFarmStatus(ctx)
	if report == nil {
		// Already logged, just keep the last-known report around for querying.
		return
	}

	reportChanged := s.updateStatusReport(*report)
	if reportChanged {
		event := eventbus.NewFarmStatusEvent(s.lastReport)
		s.eventbus.BroadcastFarmStatusEvent(event)
	}
}
// checkFarmStatus checks the farm status by querying the persistence layer.
// This function does not return errors; instead it logs them as warnings and returns nil.
func (s *Service) checkFarmStatus(ctx context.Context) *api.FarmStatusReport {
	log.Trace().Msg("farm status: checking the farm status")
	startTime := time.Now()
	defer func() {
		duration := time.Since(startTime)
		log.Debug().Stringer("duration", duration).Msg("farm status: checked the farm status")
	}()

	workerStatuses, err := s.persist.SummarizeWorkerStatuses(ctx)
	if err != nil {
		logDBError(err, "farm status: could not summarize worker statuses")
		return nil
	}

	// Check some worker statuses first. When there are no usable workers, the
	// farm is inoperative and there is little use in checking jobs. At least for
	// now. Maybe later we want to have some info in the reported status that
	// indicates a more pressing matter (as in, inoperative AND a job is queued).

	// Check: inoperative
	if len(workerStatuses) == 0 || allIn(workerStatuses, api.WorkerStatusOffline, api.WorkerStatusError) {
		return &api.FarmStatusReport{
			Status: api.FarmStatusInoperative,
		}
	}

	jobStatuses, err := s.persist.SummarizeJobStatuses(ctx)
	if err != nil {
		logDBError(err, "farm status: could not summarize job statuses")
		return nil
	}

	anyJobActive := jobStatuses[api.JobStatusActive] > 0
	anyJobQueued := jobStatuses[api.JobStatusQueued] > 0
	isWorkAvailable := anyJobActive || anyJobQueued

	anyWorkerAwake := workerStatuses[api.WorkerStatusAwake] > 0
	anyWorkerAsleep := workerStatuses[api.WorkerStatusAsleep] > 0
	allWorkersAsleep := !anyWorkerAwake && anyWorkerAsleep

	report := api.FarmStatusReport{}
	switch {
	case anyJobActive && anyWorkerAwake:
		// - "active" # Actively working on jobs.
		report.Status = api.FarmStatusActive
	case isWorkAvailable:
		// - "waiting" # Work to be done, but there is no worker awake.
		report.Status = api.FarmStatusWaiting
	case !isWorkAvailable && allWorkersAsleep:
		// - "asleep" # Farm is idle, and all workers are asleep.
		report.Status = api.FarmStatusAsleep
	case !isWorkAvailable:
		// - "idle" # Farm could be active, but has no work to do.
		report.Status = api.FarmStatusIdle
	default:
		log.Warn().
			Interface("workerStatuses", workerStatuses).
			Interface("jobStatuses", jobStatuses).
			Msgf("farm status: unexpected configuration of worker and job statuses, please report this at %s", website.BugReportURL)
		report.Status = api.FarmStatusUnknown
	}

	return &report
}
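
// logDBError logs a persistence-layer error at an appropriate level: a
// deadline or other error is a warning, while a canceled context just means
// Flamenco is shutting down and is only logged at debug level.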
func logDBError(err error, message string) {
	switch {
	case errors.Is(err, context.DeadlineExceeded):
		log.Warn().Msg(message + " (it took too long)")
	case errors.Is(err, context.Canceled):
		log.Debug().Msg(message + " (Flamenco is shutting down)")
	default:
		log.Warn().AnErr("cause", err).Msg(message)
	}
}
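
// allIn reports whether every status that has a non-zero count is contained in
// shouldBeIn; zero counts are ignored, and an empty map counts as "all in".
// For example, workerStatuses = {offline: 2, error: 1, awake: 0} makes
// allIn(workerStatuses, api.WorkerStatusOffline, api.WorkerStatusError) return
// true, so checkFarmStatus reports the farm as inoperative.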
func allIn[T comparable](statuses map[T]int, shouldBeIn ...T) bool {
	for status, count := range statuses {
		if count == 0 {
			continue
		}
		if !slices.Contains(shouldBeIn, status) {
			return false
		}
	}
	return true
}