Manager: mark workers as 'seen' when they send updates

Update the 'last seen at' timestamp of workers when they:
- sign on
- sign off
- get a task assigned
- send a task update
- check whether they can keep running their task

Note that this commit is necessary to not have the workers time out
immediately ;-)
This commit is contained in:
Sybren A. Stüvel 2022-06-10 17:56:48 +02:00
parent 986b647967
commit 5dac3c2dc0
5 changed files with 85 additions and 7 deletions

View File

@ -40,6 +40,7 @@ type PersistenceService interface {
FetchWorkers(ctx context.Context) ([]*persistence.Worker, error)
SaveWorker(ctx context.Context, w *persistence.Worker) error
SaveWorkerStatus(ctx context.Context, w *persistence.Worker) error
WorkerSeen(ctx context.Context, w *persistence.Worker) error
// ScheduleTask finds a task to execute by the given worker, and assigns it to that worker.
// If no task is available, (nil, nil) is returned, as this is not an error situation.

View File

@ -243,6 +243,20 @@ func (mr *MockPersistenceServiceMockRecorder) TaskTouchedByWorker(arg0, arg1 int
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TaskTouchedByWorker", reflect.TypeOf((*MockPersistenceService)(nil).TaskTouchedByWorker), arg0, arg1)
}
// WorkerSeen mocks base method.
func (m *MockPersistenceService) WorkerSeen(arg0 context.Context, arg1 *persistence.Worker) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "WorkerSeen", arg0, arg1)
ret0, _ := ret[0].(error)
return ret0
}
// WorkerSeen indicates an expected call of WorkerSeen.
func (mr *MockPersistenceServiceMockRecorder) WorkerSeen(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WorkerSeen", reflect.TypeOf((*MockPersistenceService)(nil).WorkerSeen), arg0, arg1)
}
// MockChangeBroadcaster is a mock of ChangeBroadcaster interface.
type MockChangeBroadcaster struct {
ctrl *gomock.Controller

View File

@ -109,6 +109,7 @@ func (f *Flamenco) SignOn(e echo.Context) error {
func (f *Flamenco) workerUpdateAfterSignOn(e echo.Context, update api.SignOnJSONBody) (*persistence.Worker, api.WorkerStatus, error) {
logger := requestLogger(e)
w := requestWorkerOrPanic(e)
ctx := e.Request().Context()
// Update the worker for with the new sign-on info.
prevStatus := w.Status
@ -124,7 +125,7 @@ func (f *Flamenco) workerUpdateAfterSignOn(e echo.Context, update api.SignOnJSON
w.SupportedTaskTypes = strings.Join(update.SupportedTaskTypes, ",")
// Save the new Worker info to the database.
err := f.persist.SaveWorker(e.Request().Context(), w)
err := f.persist.SaveWorker(ctx, w)
if err != nil {
logger.Warn().Err(err).
Str("newStatus", string(w.Status)).
@ -132,6 +133,11 @@ func (f *Flamenco) workerUpdateAfterSignOn(e echo.Context, update api.SignOnJSON
return nil, "", err
}
err = f.workerSeen(ctx, logger, w)
if err != nil {
return nil, "", err
}
return w, prevStatus, nil
}
@ -167,6 +173,9 @@ func (f *Flamenco) SignOff(e echo.Context) error {
return sendAPIError(e, http.StatusInternalServerError, "error storing new status in database")
}
// Ignore database errors here; the rest of the signoff process should just happen.
_ = f.workerSeen(ctx, logger, w)
// Re-queue all tasks (should be only one) this worker is now working on.
err = f.stateMachine.RequeueTasksOfWorker(ctx, w, "worker signed off")
if err != nil {
@ -224,7 +233,8 @@ func (f *Flamenco) WorkerStateChanged(e echo.Context) error {
w.StatusChangeClear()
}
err = f.persist.SaveWorkerStatus(e.Request().Context(), w)
ctx := e.Request().Context()
err = f.persist.SaveWorkerStatus(ctx, w)
if err != nil {
logger.Warn().Err(err).
Str("newStatus", string(w.Status)).
@ -232,6 +242,10 @@ func (f *Flamenco) WorkerStateChanged(e echo.Context) error {
return sendAPIError(e, http.StatusInternalServerError, "error storing worker in database")
}
if err := f.workerSeen(ctx, logger, w); err != nil {
return sendAPIError(e, http.StatusInternalServerError, "error storing worker 'last seen' timestamp in database")
}
update := webupdates.NewWorkerUpdate(w)
update.PreviousStatus = &prevStatus
f.broadcaster.BroadcastWorkerUpdate(update)
@ -288,9 +302,13 @@ func (f *Flamenco) ScheduleTask(e echo.Context) error {
}
// Start timeout measurement as soon as the Worker gets the task assigned.
if err := f.workerPingedTask(e.Request().Context(), logger, dbTask); err != nil {
ctx := e.Request().Context()
if err := f.workerPingedTask(ctx, logger, dbTask); err != nil {
return sendAPIError(e, http.StatusInternalServerError, "internal error updating task for timeout calculation: %v", err)
}
if err := f.workerSeen(ctx, logger, worker); err != nil {
return sendAPIError(e, http.StatusInternalServerError, "error storing worker 'last seen' timestamp in database")
}
// Convert database objects to API objects:
apiCommands := []api.Command{}
@ -361,6 +379,7 @@ func (f *Flamenco) TaskUpdate(e echo.Context, taskID string) error {
taskUpdateErr := f.doTaskUpdate(ctx, logger, worker, dbTask, taskUpdate)
workerUpdateErr := f.workerPingedTask(ctx, logger, dbTask)
workerSeenErr := f.workerSeen(ctx, logger, worker)
if taskUpdateErr != nil {
return sendAPIError(e, http.StatusInternalServerError, "unable to handle task update: %v", taskUpdateErr)
@ -368,6 +387,9 @@ func (f *Flamenco) TaskUpdate(e echo.Context, taskID string) error {
if workerUpdateErr != nil {
return sendAPIError(e, http.StatusInternalServerError, "unable to handle worker update: %v", workerUpdateErr)
}
if workerSeenErr != nil {
return sendAPIError(e, http.StatusInternalServerError, "unable to handle worker 'last seen' update: %v", workerSeenErr)
}
return e.NoContent(http.StatusNoContent)
}
@ -431,6 +453,20 @@ func (f *Flamenco) workerPingedTask(
return nil
}
// workerSeen marks the worker as 'seen' and logs any database error that may occur.
func (f *Flamenco) workerSeen(
ctx context.Context,
logger zerolog.Logger,
w *persistence.Worker,
) error {
err := f.persist.WorkerSeen(ctx, w)
if err != nil {
logger.Error().Err(err).Msg("error marking Worker as 'seen' in the database")
return err
}
return nil
}
func (f *Flamenco) MayWorkerRun(e echo.Context, taskID string) error {
logger := requestLogger(e)
worker := requestWorkerOrPanic(e)
@ -462,11 +498,12 @@ func (f *Flamenco) MayWorkerRun(e echo.Context, taskID string) error {
mkr := mayWorkerRun(worker, dbTask)
// Errors saving the "worker pinged task" and "worker seen" fields in the
// database are just logged. It's not something to bother the worker with.
if mkr.MayKeepRunning {
// Errors saving the "worker pinged task" field in the database are just
// logged. It's not something to bother the worker with.
_ = f.workerPingedTask(ctx, logger, dbTask)
}
_ = f.workerSeen(ctx, logger, worker)
return e.JSON(http.StatusOK, mkr)
}

View File

@ -33,8 +33,11 @@ func TestTaskScheduleHappy(t *testing.T) {
UUID: "4107c7aa-e86d-4244-858b-6c4fce2af503",
Job: &job,
}
mf.persistence.EXPECT().ScheduleTask(echo.Request().Context(), &worker).Return(&task, nil)
mf.persistence.EXPECT().TaskTouchedByWorker(echo.Request().Context(), &task)
ctx := echo.Request().Context()
mf.persistence.EXPECT().ScheduleTask(ctx, &worker).Return(&task, nil)
mf.persistence.EXPECT().TaskTouchedByWorker(ctx, &task)
mf.persistence.EXPECT().WorkerSeen(ctx, &worker)
mf.logStorage.EXPECT().WriteTimestamped(gomock.Any(), job.UUID, task.UUID,
"Task assigned to worker дрон (e7632d62-c3b8-4af0-9e78-01752928952c)")
@ -112,6 +115,7 @@ func TestWorkerSignOn(t *testing.T) {
})
mf.persistence.EXPECT().SaveWorker(gomock.Any(), &worker).Return(nil)
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker)
echo := mf.prepareMockedJSONRequest(api.WorkerSignOn{
Nickname: "Lazy Boi",
@ -142,6 +146,7 @@ func TestWorkerSignoffTaskRequeue(t *testing.T) {
// Expect worker's tasks to be re-queued.
mf.stateMachine.EXPECT().RequeueTasksOfWorker(expectCtx, &worker, "worker signed off").Return(nil)
mf.persistence.EXPECT().WorkerSeen(expectCtx, &worker)
// Expect worker to be saved as 'offline'.
mf.persistence.EXPECT().
@ -193,6 +198,7 @@ func TestWorkerSignoffStatusChangeRequest(t *testing.T) {
mf.persistence.EXPECT().SaveWorkerStatus(gomock.Any(), &savedWorker).Return(nil)
mf.stateMachine.EXPECT().RequeueTasksOfWorker(gomock.Any(), &worker, "worker signed off").Return(nil)
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker)
// Perform the request
echo := mf.prepareMockedRequest(nil)
@ -225,6 +231,7 @@ func TestWorkerStateChanged(t *testing.T) {
savedWorker := worker
savedWorker.Status = api.WorkerStatusAwake
mf.persistence.EXPECT().SaveWorkerStatus(gomock.Any(), &savedWorker).Return(nil)
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker)
// Perform the request
echo := mf.prepareMockedJSONRequest(api.WorkerStateChanged{
@ -267,6 +274,7 @@ func TestWorkerStateChangedAfterChangeRequest(t *testing.T) {
savedWorker := worker
savedWorker.Status = api.WorkerStatusStarting
mf.persistence.EXPECT().SaveWorkerStatus(gomock.Any(), &savedWorker).Return(nil)
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker)
// Perform the request
echo := mf.prepareMockedJSONRequest(api.WorkerStateChanged{
@ -296,6 +304,7 @@ func TestWorkerStateChangedAfterChangeRequest(t *testing.T) {
savedWorker.Status = api.WorkerStatusAsleep
savedWorker.StatusChangeClear()
mf.persistence.EXPECT().SaveWorkerStatus(gomock.Any(), &savedWorker).Return(nil)
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker)
// Perform the request
echo := mf.prepareMockedJSONRequest(api.WorkerStateChanged{
@ -363,6 +372,7 @@ func TestTaskUpdate(t *testing.T) {
touchedTask = *task
return nil
})
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker)
// Do the call.
echoCtx := mf.prepareMockedJSONRequest(taskUpdate)
@ -403,6 +413,11 @@ func TestMayWorkerRun(t *testing.T) {
mf.persistence.EXPECT().FetchTask(gomock.Any(), task.UUID).Return(&task, nil).AnyTimes()
// Expect the worker to be marked as 'seen' regardless of whether it may run
// its current task or not, so equal to the number of calls to
// `MayWorkerRun()` below.
mf.persistence.EXPECT().WorkerSeen(gomock.Any(), &worker).Times(4)
// Test: unhappy, task unassigned
{
echo := prepareRequest()

View File

@ -102,3 +102,14 @@ func (db *DB) SaveWorker(ctx context.Context, w *Worker) error {
}
return nil
}
// WorkerSeen marks the worker as 'seen' by this Manager. This is used for timeout detection.
func (db *DB) WorkerSeen(ctx context.Context, w *Worker) error {
tx := db.gormDB.WithContext(ctx).
Model(w).
Updates(Worker{LastSeenAt: db.gormDB.NowFunc()})
if err := tx.Error; err != nil {
return workerError(err, "saving worker 'last seen at'")
}
return nil
}