OAPI: Better 'SQLITE_BUSY' error handling
SQLite can return `SQLITE_BUSY` errors when it's doing too many things at the same time. This is now improved a bit by setting a 5-second timeout, during which the SQLite driver will wait for the database to become available. If that doesn't happen, Flamenco Manager will return a `503 Service Unavailable` response so that the client knows to back off a little.
This commit is contained in:
parent
89e130c04f
commit
930d7497d7
@ -7,6 +7,9 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"git.blender.org/flamenco/internal/manager/job_compilers"
|
||||
"git.blender.org/flamenco/internal/manager/persistence"
|
||||
@ -155,3 +158,23 @@ func sendAPIError(e echo.Context, code int, message string, args ...interface{})
|
||||
}
|
||||
return e.JSON(code, petErr)
|
||||
}
|
||||
|
||||
// sendAPIErrorDBBusy sends a HTTP 503 Service Unavailable, with a hopefully
|
||||
// reasonable "retry after" header.
|
||||
func sendAPIErrorDBBusy(e echo.Context, message string, args ...interface{}) error {
|
||||
if len(args) > 0 {
|
||||
// Only interpret 'message' as format string if there are actually format parameters.
|
||||
message = fmt.Sprintf(message, args)
|
||||
}
|
||||
|
||||
code := http.StatusServiceUnavailable
|
||||
apiErr := api.Error{
|
||||
Code: int32(code),
|
||||
Message: message,
|
||||
}
|
||||
|
||||
retryAfter := 1 * time.Second
|
||||
seconds := int64(retryAfter.Seconds())
|
||||
e.Response().Header().Set("Retry-After", strconv.FormatInt(seconds, 10))
|
||||
return e.JSON(code, apiErr)
|
||||
}
|
||||
|
@ -49,6 +49,9 @@ func (f *Flamenco) RegisterWorker(e echo.Context) error {
|
||||
}
|
||||
if err := f.persist.CreateWorker(e.Request().Context(), &dbWorker); err != nil {
|
||||
logger.Warn().Err(err).Msg("error creating new worker in DB")
|
||||
if persistence.ErrIsDBBusy(err) {
|
||||
return sendAPIErrorDBBusy(e, "too busy to register worker, try again later")
|
||||
}
|
||||
return sendAPIError(e, http.StatusBadRequest, "error registering worker")
|
||||
}
|
||||
|
||||
@ -264,6 +267,10 @@ func (f *Flamenco) ScheduleTask(e echo.Context) error {
|
||||
// Get a task to execute:
|
||||
dbTask, err := f.persist.ScheduleTask(e.Request().Context(), worker)
|
||||
if err != nil {
|
||||
if persistence.ErrIsDBBusy(err) {
|
||||
logger.Warn().Msg("database busy scheduling task for worker")
|
||||
return sendAPIErrorDBBusy(e, "too busy to find a task for you, try again later")
|
||||
}
|
||||
logger.Warn().Err(err).Msg("error scheduling task for worker")
|
||||
return sendAPIError(e, http.StatusInternalServerError, "internal error finding a task for you: %v", err)
|
||||
}
|
||||
|
@ -29,6 +29,10 @@ func OpenDB(ctx context.Context, dsn string) (*DB, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := setBusyTimeout(db.gormDB, 5*time.Second); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := db.migrate(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
39
internal/manager/persistence/sqlite_busy.go
Normal file
39
internal/manager/persistence/sqlite_busy.go
Normal file
@ -0,0 +1,39 @@
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
package persistence
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/glebarez/go-sqlite"
|
||||
"gorm.io/gorm"
|
||||
sqlite3 "modernc.org/sqlite/lib"
|
||||
)
|
||||
|
||||
var (
|
||||
// errDatabaseBusy is returned by this package when the operation could not be
|
||||
// performed due to SQLite being busy.
|
||||
errDatabaseBusy = errors.New("database busy")
|
||||
)
|
||||
|
||||
// ErrIsDBBusy returns true when the error is a "database busy" error.
|
||||
func ErrIsDBBusy(err error) bool {
|
||||
return errors.Is(err, errDatabaseBusy) || isDatabaseBusyError(err)
|
||||
}
|
||||
|
||||
// isDatabaseBusyError returns true when the error returned by GORM is a
|
||||
// SQLITE_BUSY error.
|
||||
func isDatabaseBusyError(err error) bool {
|
||||
sqlErr, ok := err.(*sqlite.Error)
|
||||
return ok && sqlErr.Code() == sqlite3.SQLITE_BUSY
|
||||
}
|
||||
|
||||
// setBusyTimeout sets the SQLite busy_timeout busy handler.
|
||||
// See https://sqlite.org/pragma.html#pragma_busy_timeout
|
||||
func setBusyTimeout(gormDB *gorm.DB, busyTimeout time.Duration) error {
|
||||
if tx := gormDB.Exec(fmt.Sprintf("PRAGMA busy_timeout = %d", busyTimeout.Milliseconds())); tx.Error != nil {
|
||||
return fmt.Errorf("setting busy_timeout: %w", tx.Error)
|
||||
}
|
||||
return nil
|
||||
}
|
@ -33,6 +33,10 @@ func (db *DB) ScheduleTask(ctx context.Context, w *Worker) (*Task, error) {
|
||||
var err error
|
||||
task, err = findTaskForWorker(tx, w)
|
||||
if err != nil {
|
||||
if isDatabaseBusyError(err) {
|
||||
logger.Trace().Err(err).Msg("database busy while finding task for worker")
|
||||
return errDatabaseBusy
|
||||
}
|
||||
logger.Error().Err(err).Msg("finding task for worker")
|
||||
return fmt.Errorf("finding task for worker: %w", err)
|
||||
}
|
||||
@ -43,6 +47,11 @@ func (db *DB) ScheduleTask(ctx context.Context, w *Worker) (*Task, error) {
|
||||
|
||||
// Found a task, now assign it to the requesting worker.
|
||||
if err := assignTaskToWorker(tx, w, task); err != nil {
|
||||
if isDatabaseBusyError(err) {
|
||||
logger.Trace().Err(err).Msg("database busy while assigning task to worker")
|
||||
return errDatabaseBusy
|
||||
}
|
||||
|
||||
logger.Warn().
|
||||
Str("taskID", task.UUID).
|
||||
Err(err).
|
||||
|
@ -766,10 +766,22 @@ components:
|
||||
required: [jobs]
|
||||
|
||||
Error:
|
||||
description: Generic error response.
|
||||
type: object
|
||||
required: [code, message]
|
||||
properties:
|
||||
code: {type: integer, format: int32}
|
||||
code:
|
||||
type: integer
|
||||
format: int32
|
||||
description: >
|
||||
HTTP status code of this response. Is included in the payload so
|
||||
that a single object represents all error information.
|
||||
|
||||
Code 503 is used when the database is busy. The HTTP response will
|
||||
contain a 'Retry-After' HTTP header that indicates after which time
|
||||
the request can be retried. Following the header is not mandatory,
|
||||
and it's up to the client to do something reasonable like
|
||||
exponential backoff.
|
||||
message: {type: string}
|
||||
|
||||
SecurityError:
|
||||
|
Loading…
x
Reference in New Issue
Block a user