Sybren A. Stüvel e2bca9ad61 Worker: add configuration for Linux out-of-memory killer
Add a Worker configuration option for the Linux out-of-memory killer
behaviour. Setting `oom_score_adjust=500` in `flamenco-worker.yaml` increases
the chance that Blender gets killed when the machine runs out of memory,
rather than Flamenco Worker itself.
2024-04-15 17:21:11 +02:00

package main

// SPDX-License-Identifier: GPL-3.0-or-later

import (
"context"
"errors"
"flag"
"fmt"
"net/url"
"os"
"os/signal"
"runtime"
"sync"
"syscall"
"time"
"github.com/benbjohnson/clock"
"github.com/mattn/go-colorable"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"projects.blender.org/studio/flamenco/internal/appinfo"
"projects.blender.org/studio/flamenco/internal/worker"
"projects.blender.org/studio/flamenco/internal/worker/cli_runner"
"projects.blender.org/studio/flamenco/pkg/oomscore"
"projects.blender.org/studio/flamenco/pkg/sysinfo"
"projects.blender.org/studio/flamenco/pkg/website"
)
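
// Package-level state, shared between main() and the shutdown sequence.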
var (
w *worker.Worker
listener *worker.Listener
buffer *worker.UpstreamBufferDB
shutdownComplete chan struct{}
)
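
// flamencoWorkerDatabase is the filename of the SQLite database in which the
// upstream buffer queues task updates for the Manager.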
const flamencoWorkerDatabase = "flamenco-worker.sqlite"
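
// cliArgs holds the values of the parsed commandline flags.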
var cliArgs struct {
// Do-and-quit flags.
version bool
flush bool
// Logging level flags.
quiet, debug, trace bool
managerURL *url.URL
findManager bool
manager string
register bool
restartExitCode int
}

func main() {
parseCliArgs()
if cliArgs.version {
fmt.Println(appinfo.ExtendedVersion())
return
}
output := zerolog.ConsoleWriter{Out: colorable.NewColorableStdout(), TimeFormat: time.RFC3339}
log.Logger = log.Output(output)
osDetail, err := sysinfo.Description()
if err != nil {
osDetail = err.Error()
}
log.Info().
Str("version", appinfo.ApplicationVersion).
Str("git", appinfo.ApplicationGitHash).
Str("releaseCycle", appinfo.ReleaseCycle).
Str("os", runtime.GOOS).
Str("osDetail", osDetail).
Str("arch", runtime.GOARCH).
Int("pid", os.Getpid()).
Msgf("starting %v Worker", appinfo.ApplicationName)
configLogLevel()
if cliArgs.findManager {
// TODO: move this to a more suitable place.
discoverTimeout := 1 * time.Minute
discoverCtx, discoverCancel := context.WithTimeout(context.Background(), discoverTimeout)
defer discoverCancel()
managerURL, err := worker.AutodiscoverManager(discoverCtx)
if err != nil {
logFatalManagerDiscoveryError(err, discoverTimeout)
}
log.Info().Str("manager", managerURL).Msg("found Manager")
return
}
// Load configuration, and override things from the CLI arguments if necessary.
configWrangler := worker.NewConfigWrangler()
// Before the config can be overridden, it has to be loaded.
if _, err := configWrangler.WorkerConfig(); err != nil {
log.Fatal().Err(err).Msg("error loading worker configuration")
}
if cliArgs.managerURL != nil {
url := cliArgs.managerURL.String()
log.Info().Str("manager", url).Msg("using Manager URL from commandline")
configWrangler.SetManagerURL(url)
}
if cliArgs.restartExitCode != 0 {
log.Info().Int("exitCode", cliArgs.restartExitCode).
Msg("will tell Manager this Worker can restart")
configWrangler.SetRestartExitCode(cliArgs.restartExitCode)
}
findBlender()
findFFmpeg()
// Create the CLI runner before the auto-discovery, to make any configuration
// problems clear before waiting for the Manager to respond.
cliRunner := createCLIRunner(&configWrangler)
// Give the auto-discovery some time to find a Manager.
discoverTimeout := 10 * time.Minute
discoverCtx, discoverCancel := context.WithTimeout(context.Background(), discoverTimeout)
defer discoverCancel()
if err := worker.MaybeAutodiscoverManager(discoverCtx, &configWrangler); err != nil {
logFatalManagerDiscoveryError(err, discoverTimeout)
}
// Startup can take arbitrarily long, as it only ends when the Manager can be
// reached and accepts our sign-on request. An offline Manager would cause the
// Worker to wait for it indefinitely.
startupCtx := context.Background()
client, startupState := worker.RegisterOrSignOn(startupCtx, &configWrangler)
shutdownComplete = make(chan struct{})
workerCtx, workerCtxCancel := context.WithCancel(context.Background())
timeService := clock.New()
buffer = upstreamBufferOrDie(client, timeService)
if queueSize, err := buffer.QueueSize(); err != nil {
log.Fatal().Err(err).Msg("error checking upstream buffer")
} else if queueSize > 0 {
// Flush any updates before actually starting the Worker.
log.Info().Int("queueSize", queueSize).Msg("flushing upstream buffer")
buffer.Flush(workerCtx)
}
if cliArgs.flush {
log.Info().Msg("upstream buffer flushed, shutting down")
worker.SignOff(workerCtx, log.Logger, client)
workerCtxCancel()
shutdown()
return
}
listener = worker.NewListener(client, buffer)
cmdRunner := worker.NewCommandExecutor(cliRunner, listener, timeService)
taskRunner := worker.NewTaskExecutor(cmdRunner, listener)
w = worker.NewWorker(client, taskRunner)
// Handle Ctrl+C
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt)
signal.Notify(c, syscall.SIGTERM)
go func() {
for signum := range c {
log.Info().Str("signal", signum.String()).Msg("signal received, shutting down.")
// Run the shutdown sequence in a goroutine, so that multiple Ctrl+C presses can be handled in parallel.
workerCtxCancel()
go shutdown()
}
}()
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
defer wg.Done()
listener.Run(workerCtx)
}()
go w.Start(workerCtx, startupState)
shutdownReason := w.WaitForShutdown(workerCtx)
if shutdownReason != worker.ReasonContextClosed {
go shutdown()
}
<-shutdownComplete
workerCtxCancel()
wg.Wait()
log.Debug().Msg("process shutting down")
config, _ := configWrangler.WorkerConfig()
stopProcess(config, shutdownReason)
}
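
// shutdown stops the Worker: it closes the Worker and the upstream task
// buffer, then signs off at the Manager. If this takes longer than 20 seconds
// the process is forcefully stopped.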
func shutdown() {
done := make(chan struct{})
go func() {
if w != nil {
w.Close()
if err := buffer.Close(); err != nil {
log.Error().Err(err).Msg("closing upstream task buffer")
}
// Sign off as the last step. Any flushes should happen while we're still signed on.
signoffCtx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second)
defer cancelFunc()
w.SignOff(signoffCtx)
}
close(done)
}()
select {
case <-done:
log.Debug().Msg("shutdown OK")
case <-time.After(20 * time.Second):
log.Error().Msg("shutdown forced, stopping process.")
os.Exit(-2)
}
log.Warn().Msg("shutdown complete, stopping process.")
close(shutdownComplete)
}
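
// stopProcess exits the process with a code that reflects the shutdown
// reason. A restart request exits with the restart exit code from the
// configuration, signalling to whatever supervises the process that the
// Worker should be restarted.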
func stopProcess(config worker.WorkerConfig, shutdownReason worker.ShutdownReason) {
switch shutdownReason {
case worker.ReasonContextClosed:
os.Exit(1)
case worker.ReasonShutdownReq:
os.Exit(0)
case worker.ReasonRestartReq:
os.Exit(config.RestartExitCode)
}
}
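
// parseCliArgs defines and parses the commandline flags, and validates the
// Manager URL if one was given.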
func parseCliArgs() {
flag.BoolVar(&cliArgs.version, "version", false, "Shows the application version, then exits.")
flag.BoolVar(&cliArgs.flush, "flush", false, "Flushes any buffered task updates to the Manager, then exits.")
flag.BoolVar(&cliArgs.quiet, "quiet", false, "Only log warning-level and worse.")
flag.BoolVar(&cliArgs.debug, "debug", false, "Enable debug-level logging.")
flag.BoolVar(&cliArgs.trace, "trace", false, "Enable trace-level logging.")
// TODO: make this override whatever was stored in the configuration file.
flag.StringVar(&cliArgs.manager, "manager", "", "URL of the Flamenco Manager.")
flag.BoolVar(&cliArgs.register, "register", false, "(Re-)register at the Manager.")
flag.BoolVar(&cliArgs.findManager, "find-manager", false, "Autodiscover a Manager, then quit.")
flag.IntVar(&cliArgs.restartExitCode, "restart-exit-code", 0,
"Mark this Worker as restartable. It will exit with this code to signify it needs to be restarted.")
flag.Parse()
if cliArgs.manager != "" {
var err error
cliArgs.managerURL, err = worker.ParseURL(cliArgs.manager)
if err != nil {
log.Fatal().Err(err).Msg("invalid manager URL")
}
}
}
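
// configLogLevel sets the global log level based on the -quiet, -debug and
// -trace commandline flags.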
func configLogLevel() {
var logLevel zerolog.Level
switch {
case cliArgs.trace:
logLevel = zerolog.TraceLevel
case cliArgs.debug:
logLevel = zerolog.DebugLevel
case cliArgs.quiet:
logLevel = zerolog.WarnLevel
default:
logLevel = zerolog.InfoLevel
}
zerolog.SetGlobalLevel(logLevel)
}
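
// upstreamBufferOrDie creates the upstream task update buffer and opens its
// SQLite database in the Flamenco home directory. Any error is fatal.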
func upstreamBufferOrDie(client worker.FlamencoClient, timeService clock.Clock) *worker.UpstreamBufferDB {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
buffer, err := worker.NewUpstreamBuffer(client, timeService)
if err != nil {
log.Fatal().Err(err).Msg("unable to create task update queue database")
}
dbPath, err := appinfo.InFlamencoHome(flamencoWorkerDatabase)
if err != nil {
log.Fatal().Err(err).Msg("unable to figure out where to save my database")
}
if err := buffer.OpenDB(ctx, dbPath); err != nil {
log.Fatal().Err(err).Msg("unable to open task update queue database")
}
return buffer
}
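
// logFatalManagerDiscoveryError logs why Manager auto-discovery failed,
// pointing at the relevant help page, and exits the process.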
func logFatalManagerDiscoveryError(err error, discoverTimeout time.Duration) {
if errors.Is(err, context.DeadlineExceeded) {
log.Fatal().Stringer("timeout", discoverTimeout).
Msgf("could not discover Manager in time, see %s", website.CannotFindManagerHelpURL)
} else {
log.Fatal().Err(err).
Msgf("auto-discovery error, see %s", website.CannotFindManagerHelpURL)
}
}
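
// createCLIRunner constructs the runner used to execute commands like Blender
// and FFmpeg. When oom_score_adjust is set in flamenco-worker.yaml (for
// example `oom_score_adjust: 500`) and the Worker runs on Linux, the runner
// applies that out-of-memory score adjustment to the processes it starts, so
// that they are more likely to be killed than the Worker itself when the
// machine runs out of memory.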
func createCLIRunner(configWrangler *worker.FileConfigWrangler) *cli_runner.CLIRunner {
config, err := configWrangler.WorkerConfig()
if err != nil {
log.Fatal().Err(err).Msg("error loading worker configuration")
}
if config.LinuxOOMScoreAdjust == nil {
log.Debug().Msg("executables will be run without OOM score adjustment")
return cli_runner.NewCLIRunner()
}
if !oomscore.Available() {
log.Warn().
Msgf("config: oom_score_adjust configured, but that is only available on Linux, not this platform. See %s for more information.",
website.OOMScoreAdjURL)
return cli_runner.NewCLIRunner()
}
adjustment := *config.LinuxOOMScoreAdjust
log.Info().Int("oom_score_adjust", adjustment).Msg("executables will be run with OOM score adjustment")
return cli_runner.NewCLIRunnerWithOOMScoreAdjuster(adjustment)
}