From ced826581a293431772a1ecaaa2f7d2ea6981711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sybren=20A=2E=20St=C3=BCvel?= Date: Tue, 22 Nov 2022 17:31:47 +0100 Subject: [PATCH] Worker: make sure long lines are broken on character boundaries When a command (like `blender` or `ffmpeg`) outputs lines that are longer than our buffer, they are broken into buffer-sized chunks. Extra code has been added to ensure those chunks consist of valid UTF-8 characters. --- internal/worker/cli_runner/cli_runner.go | 29 +++++++++++++++---- internal/worker/cli_runner/strings.go | 31 ++++++++++++++++++++ internal/worker/cli_runner/strings_test.go | 33 ++++++++++++++++++++++ 3 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 internal/worker/cli_runner/strings.go create mode 100644 internal/worker/cli_runner/strings_test.go diff --git a/internal/worker/cli_runner/cli_runner.go b/internal/worker/cli_runner/cli_runner.go index 22df760c..66c84225 100644 --- a/internal/worker/cli_runner/cli_runner.go +++ b/internal/worker/cli_runner/cli_runner.go @@ -12,8 +12,9 @@ import ( "github.com/rs/zerolog" ) -// The buffer size used to read stdout/stderr output from subprocesses. -// Effectively this determines the maximum line length that can be handled. +// The buffer size used to read stdout/stderr output from subprocesses, in +// bytes. Effectively this determines the maximum line length that can be +// handled in one go. Lines that are longer will be broken up. const StdoutBufferSize = 40 * 1024 // CLIRunner is a wrapper around exec.CommandContext() to allow mocking. @@ -60,6 +61,12 @@ func (cli *CLIRunner) RunWithTextOutput( // of simply returning, because the function must be run to completion in // order to wait for processes (and not create defunct ones). var returnErr error = nil + + // If a line longer than our buffer is received, it will be trimmed to the + // bufffer length. This means that it may not end on a valid character + // boundary. Any leftover bytes are collected here, and prepended to the next + // line. + leftovers := []byte{} readloop: for { lineBytes, isPrefix, readErr := reader.ReadLine() @@ -73,12 +80,24 @@ readloop: break readloop } + // Prepend any leftovers from the previous line to the received bytes. + if len(leftovers) > 0 { + lineBytes = append(leftovers, lineBytes...) + leftovers = []byte{} + } + // Make sure long lines are broken on character boundaries. + lineBytes, leftovers = splitOnCharacterBoundary(lineBytes) + line := string(lineBytes) if isPrefix { + prefix := []rune(line) + if len(prefix) > 256 { + prefix = prefix[:256] + } logger.Warn(). - Str("line", fmt.Sprintf("%s...", line[:256])). - Int("lineLength", len(line)). - Msg("unexpectedly long line read, truncating") + Str("line", fmt.Sprintf("%s...", string(prefix))). + Int("bytesRead", len(lineBytes)). + Msg("unexpectedly long line read, will be split up") } logger.Debug().Msg(line) diff --git a/internal/worker/cli_runner/strings.go b/internal/worker/cli_runner/strings.go new file mode 100644 index 00000000..77903c73 --- /dev/null +++ b/internal/worker/cli_runner/strings.go @@ -0,0 +1,31 @@ +package cli_runner + +import ( + "unicode/utf8" +) + +// splitOnCharacterBoundary splits `b` such that `valid` + `tail` = `b` and +// `valid` is valid UTF-8. +func splitOnCharacterBoundary(b []byte) (valid []byte, tail []byte) { + totalLength := len(b) + tailBytes := 0 + for { + valid = b[:totalLength-tailBytes] + r, size := utf8.DecodeLastRune(valid) + switch { + case r == utf8.RuneError && size == 0: + // valid is empty, which means 'b' consists of only non-UTF8 bytes. + return valid, b + case r == utf8.RuneError && size == 1: + // The last bytes do not form a valid rune. See what happens if we move + // one byte from `valid` to `tail`. + tailBytes++ + continue + case r == utf8.RuneError: + // This shouldn't happen, RuneError should only be returned with size 0 or 1. + panic(size) + default: + return valid, b[totalLength-tailBytes:] + } + } +} diff --git a/internal/worker/cli_runner/strings_test.go b/internal/worker/cli_runner/strings_test.go new file mode 100644 index 00000000..9e00f3db --- /dev/null +++ b/internal/worker/cli_runner/strings_test.go @@ -0,0 +1,33 @@ +package cli_runner + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSplitOnCharacterBoundary(t *testing.T) { + // Test with strings, as those are easier to type. + tests := []struct { + name string + input string + wantValid string + wantTail string + }{ + {"empty", "", "", ""}, + {"trivial", "abc", "abc", ""}, + {"valid", "Stüvel", "Stüvel", ""}, + {"cats", "🐈🐈\xf0\x9f\x90\x88", "🐈🐈🐈", ""}, + {"truncated-cats-1", "🐈🐈\xf0\x9f\x90", "🐈🐈", "\xf0\x9f\x90"}, + {"truncated-cats-2", "🐈🐈\xf0\x9f", "🐈🐈", "\xf0\x9f"}, + {"truncated-cats-3", "🐈🐈\xf0", "🐈🐈", "\xf0"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotValid, gotTail := splitOnCharacterBoundary([]byte(tt.input)) + assert.Equal(t, tt.input, string(gotValid)+string(gotTail)) + assert.Equal(t, tt.wantValid, string(gotValid)) + assert.Equal(t, tt.wantTail, string(gotTail)) + }) + } +}