From 930d7497d7c93d416fca6dcc50fc10c070efc2fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sybren=20A=2E=20St=C3=BCvel?= Date: Fri, 8 Apr 2022 12:02:30 +0200 Subject: [PATCH] OAPI: Better 'SQLITE_BUSY' error handling SQLite can return `SQLITE_BUSY` errors when it's doing too many things at the same time. This is now improved a bit by setting a 5-second timeout, during which the SQLite driver will wait for the database to become available. If that doesn't happen, Flamenco Manager will return a `503 Service Unavailable` response so that the client knows to back off a little. --- internal/manager/api_impl/api_impl.go | 23 +++++++++++ internal/manager/api_impl/workers.go | 7 ++++ internal/manager/persistence/db.go | 4 ++ internal/manager/persistence/sqlite_busy.go | 39 +++++++++++++++++++ .../manager/persistence/task_scheduler.go | 9 +++++ pkg/api/flamenco-manager.yaml | 14 ++++++- 6 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 internal/manager/persistence/sqlite_busy.go diff --git a/internal/manager/api_impl/api_impl.go b/internal/manager/api_impl/api_impl.go index 5a0604b5..2bbc93a3 100644 --- a/internal/manager/api_impl/api_impl.go +++ b/internal/manager/api_impl/api_impl.go @@ -7,6 +7,9 @@ import ( "context" "fmt" "io" + "net/http" + "strconv" + "time" "git.blender.org/flamenco/internal/manager/job_compilers" "git.blender.org/flamenco/internal/manager/persistence" @@ -155,3 +158,23 @@ func sendAPIError(e echo.Context, code int, message string, args ...interface{}) } return e.JSON(code, petErr) } + +// sendAPIErrorDBBusy sends a HTTP 503 Service Unavailable, with a hopefully +// reasonable "retry after" header. +func sendAPIErrorDBBusy(e echo.Context, message string, args ...interface{}) error { + if len(args) > 0 { + // Only interpret 'message' as format string if there are actually format parameters. + message = fmt.Sprintf(message, args) + } + + code := http.StatusServiceUnavailable + apiErr := api.Error{ + Code: int32(code), + Message: message, + } + + retryAfter := 1 * time.Second + seconds := int64(retryAfter.Seconds()) + e.Response().Header().Set("Retry-After", strconv.FormatInt(seconds, 10)) + return e.JSON(code, apiErr) +} diff --git a/internal/manager/api_impl/workers.go b/internal/manager/api_impl/workers.go index 9c630c29..64138ce9 100644 --- a/internal/manager/api_impl/workers.go +++ b/internal/manager/api_impl/workers.go @@ -49,6 +49,9 @@ func (f *Flamenco) RegisterWorker(e echo.Context) error { } if err := f.persist.CreateWorker(e.Request().Context(), &dbWorker); err != nil { logger.Warn().Err(err).Msg("error creating new worker in DB") + if persistence.ErrIsDBBusy(err) { + return sendAPIErrorDBBusy(e, "too busy to register worker, try again later") + } return sendAPIError(e, http.StatusBadRequest, "error registering worker") } @@ -264,6 +267,10 @@ func (f *Flamenco) ScheduleTask(e echo.Context) error { // Get a task to execute: dbTask, err := f.persist.ScheduleTask(e.Request().Context(), worker) if err != nil { + if persistence.ErrIsDBBusy(err) { + logger.Warn().Msg("database busy scheduling task for worker") + return sendAPIErrorDBBusy(e, "too busy to find a task for you, try again later") + } logger.Warn().Err(err).Msg("error scheduling task for worker") return sendAPIError(e, http.StatusInternalServerError, "internal error finding a task for you: %v", err) } diff --git a/internal/manager/persistence/db.go b/internal/manager/persistence/db.go index 68bc680e..8b11cebc 100644 --- a/internal/manager/persistence/db.go +++ b/internal/manager/persistence/db.go @@ -29,6 +29,10 @@ func OpenDB(ctx context.Context, dsn string) (*DB, error) { return nil, err } + if err := setBusyTimeout(db.gormDB, 5*time.Second); err != nil { + return nil, err + } + if err := db.migrate(); err != nil { return nil, err } diff --git a/internal/manager/persistence/sqlite_busy.go b/internal/manager/persistence/sqlite_busy.go new file mode 100644 index 00000000..e8e814b6 --- /dev/null +++ b/internal/manager/persistence/sqlite_busy.go @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +package persistence + +import ( + "errors" + "fmt" + "time" + + "github.com/glebarez/go-sqlite" + "gorm.io/gorm" + sqlite3 "modernc.org/sqlite/lib" +) + +var ( + // errDatabaseBusy is returned by this package when the operation could not be + // performed due to SQLite being busy. + errDatabaseBusy = errors.New("database busy") +) + +// ErrIsDBBusy returns true when the error is a "database busy" error. +func ErrIsDBBusy(err error) bool { + return errors.Is(err, errDatabaseBusy) || isDatabaseBusyError(err) +} + +// isDatabaseBusyError returns true when the error returned by GORM is a +// SQLITE_BUSY error. +func isDatabaseBusyError(err error) bool { + sqlErr, ok := err.(*sqlite.Error) + return ok && sqlErr.Code() == sqlite3.SQLITE_BUSY +} + +// setBusyTimeout sets the SQLite busy_timeout busy handler. +// See https://sqlite.org/pragma.html#pragma_busy_timeout +func setBusyTimeout(gormDB *gorm.DB, busyTimeout time.Duration) error { + if tx := gormDB.Exec(fmt.Sprintf("PRAGMA busy_timeout = %d", busyTimeout.Milliseconds())); tx.Error != nil { + return fmt.Errorf("setting busy_timeout: %w", tx.Error) + } + return nil +} diff --git a/internal/manager/persistence/task_scheduler.go b/internal/manager/persistence/task_scheduler.go index 46378c99..9953a597 100644 --- a/internal/manager/persistence/task_scheduler.go +++ b/internal/manager/persistence/task_scheduler.go @@ -33,6 +33,10 @@ func (db *DB) ScheduleTask(ctx context.Context, w *Worker) (*Task, error) { var err error task, err = findTaskForWorker(tx, w) if err != nil { + if isDatabaseBusyError(err) { + logger.Trace().Err(err).Msg("database busy while finding task for worker") + return errDatabaseBusy + } logger.Error().Err(err).Msg("finding task for worker") return fmt.Errorf("finding task for worker: %w", err) } @@ -43,6 +47,11 @@ func (db *DB) ScheduleTask(ctx context.Context, w *Worker) (*Task, error) { // Found a task, now assign it to the requesting worker. if err := assignTaskToWorker(tx, w, task); err != nil { + if isDatabaseBusyError(err) { + logger.Trace().Err(err).Msg("database busy while assigning task to worker") + return errDatabaseBusy + } + logger.Warn(). Str("taskID", task.UUID). Err(err). diff --git a/pkg/api/flamenco-manager.yaml b/pkg/api/flamenco-manager.yaml index 425ded61..eaf9d3cd 100644 --- a/pkg/api/flamenco-manager.yaml +++ b/pkg/api/flamenco-manager.yaml @@ -766,10 +766,22 @@ components: required: [jobs] Error: + description: Generic error response. type: object required: [code, message] properties: - code: {type: integer, format: int32} + code: + type: integer + format: int32 + description: > + HTTP status code of this response. Is included in the payload so + that a single object represents all error information. + + Code 503 is used when the database is busy. The HTTP response will + contain a 'Retry-After' HTTP header that indicates after which time + the request can be retried. Following the header is not mandatory, + and it's up to the client to do something reasonable like + exponential backoff. message: {type: string} SecurityError: