OAPI: Better 'SQLITE_BUSY' error handling

SQLite can return `SQLITE_BUSY` errors when it's doing too many things at
the same time. This is now improved a bit by setting a 5-second timeout,
during which the SQLite driver will wait for the database to become
available. If that doesn't happen, Flamenco Manager will return a
`503 Service Unavailable` response so that the client knows to back off
a little.
This commit is contained in:
Sybren A. Stüvel 2022-04-08 12:02:30 +02:00
parent 89e130c04f
commit 930d7497d7
6 changed files with 95 additions and 1 deletions

View File

@ -7,6 +7,9 @@ import (
"context" "context"
"fmt" "fmt"
"io" "io"
"net/http"
"strconv"
"time"
"git.blender.org/flamenco/internal/manager/job_compilers" "git.blender.org/flamenco/internal/manager/job_compilers"
"git.blender.org/flamenco/internal/manager/persistence" "git.blender.org/flamenco/internal/manager/persistence"
@ -155,3 +158,23 @@ func sendAPIError(e echo.Context, code int, message string, args ...interface{})
} }
return e.JSON(code, petErr) return e.JSON(code, petErr)
} }
// sendAPIErrorDBBusy sends a HTTP 503 Service Unavailable, with a hopefully
// reasonable "retry after" header.
func sendAPIErrorDBBusy(e echo.Context, message string, args ...interface{}) error {
if len(args) > 0 {
// Only interpret 'message' as format string if there are actually format parameters.
message = fmt.Sprintf(message, args)
}
code := http.StatusServiceUnavailable
apiErr := api.Error{
Code: int32(code),
Message: message,
}
retryAfter := 1 * time.Second
seconds := int64(retryAfter.Seconds())
e.Response().Header().Set("Retry-After", strconv.FormatInt(seconds, 10))
return e.JSON(code, apiErr)
}

View File

@ -49,6 +49,9 @@ func (f *Flamenco) RegisterWorker(e echo.Context) error {
} }
if err := f.persist.CreateWorker(e.Request().Context(), &dbWorker); err != nil { if err := f.persist.CreateWorker(e.Request().Context(), &dbWorker); err != nil {
logger.Warn().Err(err).Msg("error creating new worker in DB") logger.Warn().Err(err).Msg("error creating new worker in DB")
if persistence.ErrIsDBBusy(err) {
return sendAPIErrorDBBusy(e, "too busy to register worker, try again later")
}
return sendAPIError(e, http.StatusBadRequest, "error registering worker") return sendAPIError(e, http.StatusBadRequest, "error registering worker")
} }
@ -264,6 +267,10 @@ func (f *Flamenco) ScheduleTask(e echo.Context) error {
// Get a task to execute: // Get a task to execute:
dbTask, err := f.persist.ScheduleTask(e.Request().Context(), worker) dbTask, err := f.persist.ScheduleTask(e.Request().Context(), worker)
if err != nil { if err != nil {
if persistence.ErrIsDBBusy(err) {
logger.Warn().Msg("database busy scheduling task for worker")
return sendAPIErrorDBBusy(e, "too busy to find a task for you, try again later")
}
logger.Warn().Err(err).Msg("error scheduling task for worker") logger.Warn().Err(err).Msg("error scheduling task for worker")
return sendAPIError(e, http.StatusInternalServerError, "internal error finding a task for you: %v", err) return sendAPIError(e, http.StatusInternalServerError, "internal error finding a task for you: %v", err)
} }

View File

@ -29,6 +29,10 @@ func OpenDB(ctx context.Context, dsn string) (*DB, error) {
return nil, err return nil, err
} }
if err := setBusyTimeout(db.gormDB, 5*time.Second); err != nil {
return nil, err
}
if err := db.migrate(); err != nil { if err := db.migrate(); err != nil {
return nil, err return nil, err
} }

View File

@ -0,0 +1,39 @@
// SPDX-License-Identifier: GPL-3.0-or-later
package persistence
import (
"errors"
"fmt"
"time"
"github.com/glebarez/go-sqlite"
"gorm.io/gorm"
sqlite3 "modernc.org/sqlite/lib"
)
var (
// errDatabaseBusy is returned by this package when the operation could not be
// performed due to SQLite being busy.
errDatabaseBusy = errors.New("database busy")
)
// ErrIsDBBusy returns true when the error is a "database busy" error.
func ErrIsDBBusy(err error) bool {
return errors.Is(err, errDatabaseBusy) || isDatabaseBusyError(err)
}
// isDatabaseBusyError returns true when the error returned by GORM is a
// SQLITE_BUSY error.
func isDatabaseBusyError(err error) bool {
sqlErr, ok := err.(*sqlite.Error)
return ok && sqlErr.Code() == sqlite3.SQLITE_BUSY
}
// setBusyTimeout sets the SQLite busy_timeout busy handler.
// See https://sqlite.org/pragma.html#pragma_busy_timeout
func setBusyTimeout(gormDB *gorm.DB, busyTimeout time.Duration) error {
if tx := gormDB.Exec(fmt.Sprintf("PRAGMA busy_timeout = %d", busyTimeout.Milliseconds())); tx.Error != nil {
return fmt.Errorf("setting busy_timeout: %w", tx.Error)
}
return nil
}

View File

@ -33,6 +33,10 @@ func (db *DB) ScheduleTask(ctx context.Context, w *Worker) (*Task, error) {
var err error var err error
task, err = findTaskForWorker(tx, w) task, err = findTaskForWorker(tx, w)
if err != nil { if err != nil {
if isDatabaseBusyError(err) {
logger.Trace().Err(err).Msg("database busy while finding task for worker")
return errDatabaseBusy
}
logger.Error().Err(err).Msg("finding task for worker") logger.Error().Err(err).Msg("finding task for worker")
return fmt.Errorf("finding task for worker: %w", err) return fmt.Errorf("finding task for worker: %w", err)
} }
@ -43,6 +47,11 @@ func (db *DB) ScheduleTask(ctx context.Context, w *Worker) (*Task, error) {
// Found a task, now assign it to the requesting worker. // Found a task, now assign it to the requesting worker.
if err := assignTaskToWorker(tx, w, task); err != nil { if err := assignTaskToWorker(tx, w, task); err != nil {
if isDatabaseBusyError(err) {
logger.Trace().Err(err).Msg("database busy while assigning task to worker")
return errDatabaseBusy
}
logger.Warn(). logger.Warn().
Str("taskID", task.UUID). Str("taskID", task.UUID).
Err(err). Err(err).

View File

@ -766,10 +766,22 @@ components:
required: [jobs] required: [jobs]
Error: Error:
description: Generic error response.
type: object type: object
required: [code, message] required: [code, message]
properties: properties:
code: {type: integer, format: int32} code:
type: integer
format: int32
description: >
HTTP status code of this response. Is included in the payload so
that a single object represents all error information.
Code 503 is used when the database is busy. The HTTP response will
contain a 'Retry-After' HTTP header that indicates after which time
the request can be retried. Following the header is not mandatory,
and it's up to the client to do something reasonable like
exponential backoff.
message: {type: string} message: {type: string}
SecurityError: SecurityError: