Cleanup: Manager, move worker task update handling code into its own file
Move the code related to task updates from workers to `worker_task_updates.go`. It's going to get more complex with the blocklisting in there; this prepares for that. No functional changes.
This commit is contained in:
parent
50e795c595
commit
6feee74c54
203
internal/manager/api_impl/worker_task_updates.go
Normal file
203
internal/manager/api_impl/worker_task_updates.go
Normal file
@ -0,0 +1,203 @@
|
|||||||
|
package api_impl
|
||||||
|
|
||||||
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
|
||||||
|
"github.com/gertd/go-pluralize"
|
||||||
|
"github.com/labstack/echo/v4"
|
||||||
|
"github.com/rs/zerolog"
|
||||||
|
|
||||||
|
"git.blender.org/flamenco/internal/manager/persistence"
|
||||||
|
"git.blender.org/flamenco/internal/uuid"
|
||||||
|
"git.blender.org/flamenco/pkg/api"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (f *Flamenco) TaskUpdate(e echo.Context, taskID string) error {
|
||||||
|
logger := requestLogger(e)
|
||||||
|
worker := requestWorkerOrPanic(e)
|
||||||
|
|
||||||
|
if !uuid.IsValid(taskID) {
|
||||||
|
logger.Debug().Msg("invalid task ID received")
|
||||||
|
return sendAPIError(e, http.StatusBadRequest, "task ID not valid")
|
||||||
|
}
|
||||||
|
logger = logger.With().Str("taskID", taskID).Logger()
|
||||||
|
|
||||||
|
// Fetch the task, to see if this worker is even allowed to send us updates.
|
||||||
|
ctx := e.Request().Context()
|
||||||
|
dbTask, err := f.persist.FetchTask(ctx, taskID)
|
||||||
|
if err != nil {
|
||||||
|
logger.Warn().Err(err).Msg("cannot fetch task")
|
||||||
|
if errors.Is(err, persistence.ErrTaskNotFound) {
|
||||||
|
return sendAPIError(e, http.StatusNotFound, "task %+v not found", taskID)
|
||||||
|
}
|
||||||
|
return sendAPIError(e, http.StatusInternalServerError, "error fetching task")
|
||||||
|
}
|
||||||
|
if dbTask == nil {
|
||||||
|
panic("task could not be fetched, but database gave no error either")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the request body.
|
||||||
|
var taskUpdate api.TaskUpdate
|
||||||
|
if err := e.Bind(&taskUpdate); err != nil {
|
||||||
|
logger.Warn().Err(err).Msg("bad request received")
|
||||||
|
return sendAPIError(e, http.StatusBadRequest, "invalid format")
|
||||||
|
}
|
||||||
|
if dbTask.WorkerID == nil {
|
||||||
|
logger.Warn().
|
||||||
|
Msg("worker trying to update task that's not assigned to any worker")
|
||||||
|
return sendAPIError(e, http.StatusConflict, "task %+v is not assigned to any worker, so also not to you", taskID)
|
||||||
|
}
|
||||||
|
if *dbTask.WorkerID != worker.ID {
|
||||||
|
logger.Warn().Msg("worker trying to update task that's assigned to another worker")
|
||||||
|
return sendAPIError(e, http.StatusConflict, "task %+v is not assigned to you", taskID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Status 'soft-failed' should never be sent by a Worker. Distinguishing
|
||||||
|
// between soft and hard failures is up to the Manager.
|
||||||
|
// Workers should always just send 'failed' instead.
|
||||||
|
if taskUpdate.TaskStatus != nil && *taskUpdate.TaskStatus == api.TaskStatusSoftFailed {
|
||||||
|
logger.Warn().Str("status", string(*taskUpdate.TaskStatus)).
|
||||||
|
Msg("worker trying to update task to not-allowed status")
|
||||||
|
return sendAPIError(e, http.StatusBadRequest,
|
||||||
|
"task status %s not allowed to be sent by Worker", *taskUpdate.TaskStatus)
|
||||||
|
}
|
||||||
|
|
||||||
|
taskUpdateErr := f.doTaskUpdate(ctx, logger, worker, dbTask, taskUpdate)
|
||||||
|
workerUpdateErr := f.workerPingedTask(ctx, logger, dbTask)
|
||||||
|
workerSeenErr := f.workerSeen(ctx, logger, worker)
|
||||||
|
|
||||||
|
if taskUpdateErr != nil {
|
||||||
|
return sendAPIError(e, http.StatusInternalServerError, "unable to handle task update: %v", taskUpdateErr)
|
||||||
|
}
|
||||||
|
if workerUpdateErr != nil {
|
||||||
|
return sendAPIError(e, http.StatusInternalServerError, "unable to handle worker update: %v", workerUpdateErr)
|
||||||
|
}
|
||||||
|
if workerSeenErr != nil {
|
||||||
|
return sendAPIError(e, http.StatusInternalServerError, "unable to handle worker 'last seen' update: %v", workerSeenErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
return e.NoContent(http.StatusNoContent)
|
||||||
|
}
|
||||||
|
|
||||||
|
// doTaskUpdate actually updates the task and its log file.
|
||||||
|
func (f *Flamenco) doTaskUpdate(
|
||||||
|
ctx context.Context,
|
||||||
|
logger zerolog.Logger,
|
||||||
|
w *persistence.Worker,
|
||||||
|
dbTask *persistence.Task,
|
||||||
|
update api.TaskUpdate,
|
||||||
|
) error {
|
||||||
|
if dbTask.Job == nil {
|
||||||
|
logger.Panic().Msg("dbTask.Job is nil, unable to continue")
|
||||||
|
}
|
||||||
|
|
||||||
|
var dbErrActivity error
|
||||||
|
|
||||||
|
if update.Activity != nil {
|
||||||
|
dbTask.Activity = *update.Activity
|
||||||
|
// The state machine will also save the task, including its activity, but
|
||||||
|
// relying on that here would create strong cohesion.
|
||||||
|
dbErrActivity = f.persist.SaveTaskActivity(ctx, dbTask)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the log first, because that's likely to contain the cause of the task
|
||||||
|
// state change. Any subsequent task logs, for example generated by the
|
||||||
|
// Manager in response to a status change, should be logged after that.
|
||||||
|
if update.Log != nil {
|
||||||
|
// Errors writing the log to disk are already logged by logStorage, and can be safely ignored here.
|
||||||
|
_ = f.logStorage.Write(logger, dbTask.Job.UUID, dbTask.UUID, *update.Log)
|
||||||
|
}
|
||||||
|
|
||||||
|
if update.TaskStatus == nil {
|
||||||
|
return dbErrActivity
|
||||||
|
}
|
||||||
|
|
||||||
|
oldTaskStatus := dbTask.Status
|
||||||
|
var err error
|
||||||
|
if *update.TaskStatus == api.TaskStatusFailed {
|
||||||
|
// Failure is more complex than just going to the failed state.
|
||||||
|
err = f.onTaskFailed(ctx, logger, w, dbTask, update)
|
||||||
|
} else {
|
||||||
|
// Just go to the given state.
|
||||||
|
err = f.stateMachine.TaskStatusChange(ctx, dbTask, *update.TaskStatus)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
logger.Error().Err(err).
|
||||||
|
Str("newTaskStatus", string(*update.TaskStatus)).
|
||||||
|
Str("oldTaskStatus", string(oldTaskStatus)).
|
||||||
|
Msg("error changing task status")
|
||||||
|
return fmt.Errorf("changing status of task %s to %q: %w",
|
||||||
|
dbTask.UUID, *update.TaskStatus, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// onTaskFailed decides whether a task is soft- or hard-failed. Note that this
|
||||||
|
// means that the task may NOT go to the status mentioned in the `update`
|
||||||
|
// parameter, but go to `soft-failed` instead.
|
||||||
|
func (f *Flamenco) onTaskFailed(
|
||||||
|
ctx context.Context,
|
||||||
|
logger zerolog.Logger,
|
||||||
|
worker *persistence.Worker,
|
||||||
|
task *persistence.Task,
|
||||||
|
update api.TaskUpdate,
|
||||||
|
) error {
|
||||||
|
// Sanity check.
|
||||||
|
if update.TaskStatus == nil || *update.TaskStatus != api.TaskStatusFailed {
|
||||||
|
panic("onTaskFailed should only be called with a task update that indicates task failure")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bookkeeping of failure.
|
||||||
|
numFailed, err := f.persist.AddWorkerToTaskFailedList(ctx, task, worker)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("adding worker to failure list of task: %w", err)
|
||||||
|
}
|
||||||
|
// f.maybeBlocklistWorker(ctx, logger, worker, task)
|
||||||
|
|
||||||
|
// Determine whether this is soft or hard failure.
|
||||||
|
threshold := f.config.Get().TaskFailAfterSoftFailCount
|
||||||
|
logger = logger.With().
|
||||||
|
Int("failedByWorkerCount", numFailed).
|
||||||
|
Int("threshold", threshold).
|
||||||
|
Logger()
|
||||||
|
|
||||||
|
var (
|
||||||
|
newStatus api.TaskStatus
|
||||||
|
localLog, taskLog string
|
||||||
|
)
|
||||||
|
pluralizer := pluralize.NewClient()
|
||||||
|
if numFailed >= threshold {
|
||||||
|
newStatus = api.TaskStatusFailed
|
||||||
|
|
||||||
|
localLog = "too many workers failed this task, hard-failing it"
|
||||||
|
taskLog = fmt.Sprintf(
|
||||||
|
"Task failed by %s, Manager will mark it as hard failure",
|
||||||
|
pluralizer.Pluralize("worker", numFailed, true),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
newStatus = api.TaskStatusSoftFailed
|
||||||
|
|
||||||
|
localLog = "worker failed this task, soft-failing to give another worker a try"
|
||||||
|
failsToThreshold := threshold - numFailed
|
||||||
|
taskLog = fmt.Sprintf(
|
||||||
|
"Task failed by %s, Manager will mark it as soft failure. %d more %s will cause hard failure.",
|
||||||
|
pluralizer.Pluralize("worker", numFailed, true),
|
||||||
|
failsToThreshold,
|
||||||
|
pluralizer.Pluralize("failure", failsToThreshold, false),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := f.logStorage.WriteTimestamped(logger, task.Job.UUID, task.UUID, taskLog); err != nil {
|
||||||
|
logger.Error().Err(err).Msg("error writing failure notice to task log")
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.Info().Str("newTaskStatus", string(newStatus)).Msg(localLog)
|
||||||
|
return f.stateMachine.TaskStatusChange(ctx, task, newStatus)
|
||||||
|
}
|
@ -10,7 +10,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/gertd/go-pluralize"
|
|
||||||
"github.com/labstack/echo/v4"
|
"github.com/labstack/echo/v4"
|
||||||
"github.com/rs/zerolog"
|
"github.com/rs/zerolog"
|
||||||
"golang.org/x/crypto/bcrypt"
|
"golang.org/x/crypto/bcrypt"
|
||||||
@ -342,190 +341,6 @@ func (f *Flamenco) ScheduleTask(e echo.Context) error {
|
|||||||
return e.JSON(http.StatusOK, customisedTask)
|
return e.JSON(http.StatusOK, customisedTask)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f *Flamenco) TaskUpdate(e echo.Context, taskID string) error {
|
|
||||||
logger := requestLogger(e)
|
|
||||||
worker := requestWorkerOrPanic(e)
|
|
||||||
|
|
||||||
if !uuid.IsValid(taskID) {
|
|
||||||
logger.Debug().Msg("invalid task ID received")
|
|
||||||
return sendAPIError(e, http.StatusBadRequest, "task ID not valid")
|
|
||||||
}
|
|
||||||
logger = logger.With().Str("taskID", taskID).Logger()
|
|
||||||
|
|
||||||
// Fetch the task, to see if this worker is even allowed to send us updates.
|
|
||||||
ctx := e.Request().Context()
|
|
||||||
dbTask, err := f.persist.FetchTask(ctx, taskID)
|
|
||||||
if err != nil {
|
|
||||||
logger.Warn().Err(err).Msg("cannot fetch task")
|
|
||||||
if errors.Is(err, persistence.ErrTaskNotFound) {
|
|
||||||
return sendAPIError(e, http.StatusNotFound, "task %+v not found", taskID)
|
|
||||||
}
|
|
||||||
return sendAPIError(e, http.StatusInternalServerError, "error fetching task")
|
|
||||||
}
|
|
||||||
if dbTask == nil {
|
|
||||||
panic("task could not be fetched, but database gave no error either")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Decode the request body.
|
|
||||||
var taskUpdate api.TaskUpdate
|
|
||||||
if err := e.Bind(&taskUpdate); err != nil {
|
|
||||||
logger.Warn().Err(err).Msg("bad request received")
|
|
||||||
return sendAPIError(e, http.StatusBadRequest, "invalid format")
|
|
||||||
}
|
|
||||||
if dbTask.WorkerID == nil {
|
|
||||||
logger.Warn().
|
|
||||||
Msg("worker trying to update task that's not assigned to any worker")
|
|
||||||
return sendAPIError(e, http.StatusConflict, "task %+v is not assigned to any worker, so also not to you", taskID)
|
|
||||||
}
|
|
||||||
if *dbTask.WorkerID != worker.ID {
|
|
||||||
logger.Warn().Msg("worker trying to update task that's assigned to another worker")
|
|
||||||
return sendAPIError(e, http.StatusConflict, "task %+v is not assigned to you", taskID)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Status 'soft-failed' should never be sent by a Worker. Distinguishing
|
|
||||||
// between soft and hard failures is up to the Manager.
|
|
||||||
// Workers should always just send 'failed' instead.
|
|
||||||
if taskUpdate.TaskStatus != nil && *taskUpdate.TaskStatus == api.TaskStatusSoftFailed {
|
|
||||||
logger.Warn().Str("status", string(*taskUpdate.TaskStatus)).
|
|
||||||
Msg("worker trying to update task to not-allowed status")
|
|
||||||
return sendAPIError(e, http.StatusBadRequest,
|
|
||||||
"task status %s not allowed to be sent by Worker", *taskUpdate.TaskStatus)
|
|
||||||
}
|
|
||||||
|
|
||||||
taskUpdateErr := f.doTaskUpdate(ctx, logger, worker, dbTask, taskUpdate)
|
|
||||||
workerUpdateErr := f.workerPingedTask(ctx, logger, dbTask)
|
|
||||||
workerSeenErr := f.workerSeen(ctx, logger, worker)
|
|
||||||
|
|
||||||
if taskUpdateErr != nil {
|
|
||||||
return sendAPIError(e, http.StatusInternalServerError, "unable to handle task update: %v", taskUpdateErr)
|
|
||||||
}
|
|
||||||
if workerUpdateErr != nil {
|
|
||||||
return sendAPIError(e, http.StatusInternalServerError, "unable to handle worker update: %v", workerUpdateErr)
|
|
||||||
}
|
|
||||||
if workerSeenErr != nil {
|
|
||||||
return sendAPIError(e, http.StatusInternalServerError, "unable to handle worker 'last seen' update: %v", workerSeenErr)
|
|
||||||
}
|
|
||||||
|
|
||||||
return e.NoContent(http.StatusNoContent)
|
|
||||||
}
|
|
||||||
|
|
||||||
// doTaskUpdate actually updates the task and its log file.
|
|
||||||
func (f *Flamenco) doTaskUpdate(
|
|
||||||
ctx context.Context,
|
|
||||||
logger zerolog.Logger,
|
|
||||||
w *persistence.Worker,
|
|
||||||
dbTask *persistence.Task,
|
|
||||||
update api.TaskUpdate,
|
|
||||||
) error {
|
|
||||||
if dbTask.Job == nil {
|
|
||||||
logger.Panic().Msg("dbTask.Job is nil, unable to continue")
|
|
||||||
}
|
|
||||||
|
|
||||||
var dbErrActivity error
|
|
||||||
|
|
||||||
if update.Activity != nil {
|
|
||||||
dbTask.Activity = *update.Activity
|
|
||||||
// The state machine will also save the task, including its activity, but
|
|
||||||
// relying on that here would create strong cohesion.
|
|
||||||
dbErrActivity = f.persist.SaveTaskActivity(ctx, dbTask)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write the log first, because that's likely to contain the cause of the task
|
|
||||||
// state change. Any subsequent task logs, for example generated by the
|
|
||||||
// Manager in response to a status change, should be logged after that.
|
|
||||||
if update.Log != nil {
|
|
||||||
// Errors writing the log to disk are already logged by logStorage, and can be safely ignored here.
|
|
||||||
_ = f.logStorage.Write(logger, dbTask.Job.UUID, dbTask.UUID, *update.Log)
|
|
||||||
}
|
|
||||||
|
|
||||||
if update.TaskStatus == nil {
|
|
||||||
return dbErrActivity
|
|
||||||
}
|
|
||||||
|
|
||||||
oldTaskStatus := dbTask.Status
|
|
||||||
var err error
|
|
||||||
if *update.TaskStatus == api.TaskStatusFailed {
|
|
||||||
// Failure is more complex than just going to the failed state.
|
|
||||||
err = f.onTaskFailed(ctx, logger, w, dbTask, update)
|
|
||||||
} else {
|
|
||||||
// Just go to the given state.
|
|
||||||
err = f.stateMachine.TaskStatusChange(ctx, dbTask, *update.TaskStatus)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
logger.Error().Err(err).
|
|
||||||
Str("newTaskStatus", string(*update.TaskStatus)).
|
|
||||||
Str("oldTaskStatus", string(oldTaskStatus)).
|
|
||||||
Msg("error changing task status")
|
|
||||||
return fmt.Errorf("changing status of task %s to %q: %w",
|
|
||||||
dbTask.UUID, *update.TaskStatus, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// onTaskFailed decides whether a task is soft- or hard-failed. Note that this
|
|
||||||
// means that the task may NOT go to the status mentioned in the `update`
|
|
||||||
// parameter, but go to `soft-failed` instead.
|
|
||||||
func (f *Flamenco) onTaskFailed(
|
|
||||||
ctx context.Context,
|
|
||||||
logger zerolog.Logger,
|
|
||||||
worker *persistence.Worker,
|
|
||||||
task *persistence.Task,
|
|
||||||
update api.TaskUpdate,
|
|
||||||
) error {
|
|
||||||
// Sanity check.
|
|
||||||
if update.TaskStatus == nil || *update.TaskStatus != api.TaskStatusFailed {
|
|
||||||
panic("onTaskFailed should only be called with a task update that indicates task failure")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bookkeeping of failure.
|
|
||||||
numFailed, err := f.persist.AddWorkerToTaskFailedList(ctx, task, worker)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("adding worker to failure list of task: %w", err)
|
|
||||||
}
|
|
||||||
// f.maybeBlocklistWorker(ctx, w, dbTask)
|
|
||||||
|
|
||||||
threshold := f.config.Get().TaskFailAfterSoftFailCount
|
|
||||||
logger = logger.With().
|
|
||||||
Int("failedByWorkerCount", numFailed).
|
|
||||||
Int("threshold", threshold).
|
|
||||||
Logger()
|
|
||||||
|
|
||||||
var (
|
|
||||||
newStatus api.TaskStatus
|
|
||||||
localLog, taskLog string
|
|
||||||
)
|
|
||||||
pluralizer := pluralize.NewClient()
|
|
||||||
if numFailed >= threshold {
|
|
||||||
newStatus = api.TaskStatusFailed
|
|
||||||
|
|
||||||
localLog = "too many workers failed this task, hard-failing it"
|
|
||||||
taskLog = fmt.Sprintf(
|
|
||||||
"Task failed by %s, Manager will mark it as hard failure",
|
|
||||||
pluralizer.Pluralize("worker", numFailed, true),
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
newStatus = api.TaskStatusSoftFailed
|
|
||||||
|
|
||||||
localLog = "worker failed this task, soft-failing to give another worker a try"
|
|
||||||
failsToThreshold := threshold - numFailed
|
|
||||||
taskLog = fmt.Sprintf(
|
|
||||||
"Task failed by %s, Manager will mark it as soft failure. %d more %s will cause hard failure.",
|
|
||||||
pluralizer.Pluralize("worker", numFailed, true),
|
|
||||||
failsToThreshold,
|
|
||||||
pluralizer.Pluralize("failure", failsToThreshold, false),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := f.logStorage.WriteTimestamped(logger, task.Job.UUID, task.UUID, taskLog); err != nil {
|
|
||||||
logger.Error().Err(err).Msg("error writing failure notice to task log")
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Info().Str("newTaskStatus", string(newStatus)).Msg(localLog)
|
|
||||||
return f.stateMachine.TaskStatusChange(ctx, task, newStatus)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (f *Flamenco) workerPingedTask(
|
func (f *Flamenco) workerPingedTask(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
logger zerolog.Logger,
|
logger zerolog.Logger,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user