H1/H2/M1: atomicity at the file boundary

H1 — Concurrent-modification detection. loadRRs now returns a
fileSnapshot capturing (mtime, size) at read time. handleUpdate calls
zf.checkUnchanged(snap) immediately before writeAtomic. If anything
modified the file between load and write — rsync push, manual edit,
`git checkout` — the UPDATE is refused with SERVFAIL. Caddy retries
with a fresh load. Protects against the CLAUDE.md-documented rsync
workflow racing the plugin.

H2 — Git commit-failure policy. The previous code logged at WARN and
continued, breaking the documented "file + git both updated" contract.
Now logs at ERROR with structured fields (zone, path, error, recovery
command) so operators discover the divergence. We do NOT roll back the
file write: by the time the commit fails, the auto plugin may have
already noticed the new mtime and reloaded; rolling back creates more
races than it solves. Recovery is `git -C <dir> status` + manual
commit.

M1 — exec.CommandContext with 10s timeout on git invocations. If git
hangs (NFS stall, gpg-sign prompt, broken pre-commit hook waiting on
stdin), the per-zone mutex would otherwise be held forever and queue
all subsequent UPDATEs. gitCommandTimeout caps the hang.

M2 deferred. Dropping the separate `git add` cleanly requires either
`-a` (wrong scope: auto-stages all tracked modifications) or `--include`
(still needs prior staging). The race window between add and commit is
theoretical for our setup (single-writer plugin + occasional `git
status`). M1's timeout already mitigates the worst hang case.

New tests:
- TestZoneFile_CheckUnchanged_DetectsExternalModification (H1)
This commit is contained in:
Ryan Malloy 2026-05-22 21:22:11 -06:00
parent 8e421f925e
commit 93ed180d8f
4 changed files with 131 additions and 19 deletions

View File

@ -2,6 +2,7 @@ package rfc2136
import (
"fmt"
"path/filepath"
"strings"
"time"
@ -80,8 +81,8 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool)
zf.mu.Lock()
defer zf.mu.Unlock()
// 3. Load the current zone contents.
rrs, err := zf.loadRRs()
// 3. Load the current zone contents (with file-identity snapshot).
rrs, snap, err := zf.loadRRs()
if err != nil {
log.Errorf("UPDATE failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
@ -126,17 +127,34 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 6b. Concurrent-modification check (Hamilton H1). Before clobbering
// the on-disk file, verify nothing changed it out from under us
// between loadRRs and now. The per-zone mutex serializes us against
// other in-process UPDATEs, but external editors (rsync push,
// manual edit, `git checkout`) can race in any time. If the file
// changed, refuse with SERVFAIL so Caddy retries on a fresh load.
if err := zf.checkUnchanged(snap); err != nil {
log.Warningf("UPDATE refused: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 7. Atomic write.
if err := zf.writeAtomic(updated, now); err != nil {
log.Errorf("UPDATE write failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 8. Auto-commit. Failure to commit isn't fatal to the UPDATE —
// the on-disk state is authoritative — but we log loudly.
// 8. Auto-commit. Failure to commit means the file is correct but
// the git audit trail diverges (Hamilton H2). We log at ERROR with
// structured detail so operators discover the divergence; recovery
// is `git -C <zonesDir> status` + `git add` + manual commit. We do
// NOT roll back the file write — by the time the commit fails, the
// auto plugin may have already noticed the new mtime, and rolling
// back creates more races than it solves.
msg := summarizeUpdate(zone, r.Ns)
if err := zf.commit(msg); err != nil {
log.Warningf("git auto-commit failed: %v", err)
log.Errorf("git auto-commit failed; zone file is correct but audit trail diverged: zone=%s path=%s err=%v — recover with `git -C %s status` + manual commit",
zone, zf.Path, err, filepath.Dir(zf.Path))
}
log.Infof("UPDATE applied: zone=%s prereqs=%d updates=%d msg=%q",

View File

@ -70,7 +70,7 @@ func runUpdate(t *testing.T, p *RFC2136, msg *dns.Msg) (rcode int) {
func readZoneRecords(t *testing.T, p *RFC2136, zone string) []dns.RR {
t.Helper()
zf := p.zones[dns.Fqdn(zone)]
rrs, err := zf.loadRRs()
rrs, _, err := zf.loadRRs()
if err != nil {
t.Fatalf("loadRRs: %v", err)
}

View File

@ -1,6 +1,7 @@
package rfc2136
import (
"context"
"fmt"
"os"
"os/exec"
@ -12,6 +13,31 @@ import (
"github.com/miekg/dns"
)
// gitCommandTimeout caps any single git invocation we shell out to. If
// the process hangs (NFS stall, gpg-sign prompt, broken pre-commit hook
// waiting on stdin, etc.) we must not block the caller — which holds
// the per-zone mutex — forever. 10s is generous for a local-disk repo.
const gitCommandTimeout = 10 * time.Second
// fileSnapshot captures the file-identity fingerprint at the moment we
// last read the zone file. We compare it to the live stat just before
// writing back to detect concurrent modification (rsync push, manual
// edit, `git checkout`). If anything changed the file out from under
// us, we refuse the UPDATE — Caddy will retry, and the next loadRRs
// will see the updated state.
type fileSnapshot struct {
mtime time.Time
size int64
}
// matches returns true if `info` reports the same mtime and size we
// captured. Inode comparison would be stricter, but mtime+size is
// sufficient for the failure modes we care about (rename-over-original
// edits change both).
func (s fileSnapshot) matches(info os.FileInfo) bool {
return s.mtime.Equal(info.ModTime()) && s.size == info.Size()
}
// zoneFile is a file-backed authority for a single DNS zone. Replaces
// the Phase-1.3 in-memory recordStore.
//
@ -74,13 +100,23 @@ func openZoneFile(path, origin string) *zoneFile {
// loadRRs reads the zone file and parses it into an RR slice via
// miekg/dns's zone parser. The parser handles $ORIGIN, $TTL, multi-line
// SOA, comments, includes, etc.
func (z *zoneFile) loadRRs() ([]dns.RR, error) {
//
// Returns (rrs, snapshot, error). The snapshot fingerprints the file
// identity at read time so a subsequent writeIfUnchanged can detect
// concurrent modification.
func (z *zoneFile) loadRRs() ([]dns.RR, fileSnapshot, error) {
f, err := os.Open(z.Path)
if err != nil {
return nil, fmt.Errorf("open %s: %w", z.Path, err)
return nil, fileSnapshot{}, fmt.Errorf("open %s: %w", z.Path, err)
}
defer f.Close()
info, err := f.Stat()
if err != nil {
return nil, fileSnapshot{}, fmt.Errorf("stat %s: %w", z.Path, err)
}
snap := fileSnapshot{mtime: info.ModTime(), size: info.Size()}
parser := dns.NewZoneParser(f, z.Origin, z.Path)
parser.SetDefaultTTL(3600)
@ -89,12 +125,29 @@ func (z *zoneFile) loadRRs() ([]dns.RR, error) {
rrs = append(rrs, rr)
}
if err := parser.Err(); err != nil {
return nil, fmt.Errorf("parse %s: %w", z.Path, err)
return nil, snap, fmt.Errorf("parse %s: %w", z.Path, err)
}
if len(rrs) == 0 {
return nil, fmt.Errorf("%s: zero RRs parsed", z.Path)
return nil, snap, fmt.Errorf("%s: zero RRs parsed", z.Path)
}
return rrs, nil
return rrs, snap, nil
}
// checkUnchanged returns nil if the on-disk file still matches the
// captured snapshot. If the file has been modified (mtime or size
// differs), returns an error — the caller should refuse the UPDATE
// rather than clobber the external change.
func (z *zoneFile) checkUnchanged(snap fileSnapshot) error {
info, err := os.Stat(z.Path)
if err != nil {
return fmt.Errorf("stat %s: %w", z.Path, err)
}
if !snap.matches(info) {
return fmt.Errorf("concurrent modification detected on %s: mtime %s/%s size %d/%d",
z.Path, snap.mtime.Format(time.RFC3339Nano), info.ModTime().Format(time.RFC3339Nano),
snap.size, info.Size())
}
return nil
}
// Lookup returns RRs in `rrs` matching (name, rtype). Both name and
@ -345,6 +398,12 @@ func (z *zoneFile) writeAtomic(rrs []dns.RR, now time.Time) error {
// repository directory inferred from the zone file's parent. Returns
// nil silently if AutoCommit is false. Returns an error if the commit
// fails; the caller decides whether to roll back the file write.
//
// Both git invocations run under a context with a hard timeout
// (gitCommandTimeout). If git hangs (NFS stall, gpg-sign prompt,
// pre-commit hook waiting on stdin), we kill it rather than block the
// caller's per-zone mutex indefinitely. ACME storms must not be able
// to wedge the plugin via git getting stuck.
func (z *zoneFile) commit(message string) error {
if !z.AutoCommit {
return nil
@ -352,15 +411,17 @@ func (z *zoneFile) commit(message string) error {
// We run git from the directory containing the zone file. git will
// walk upward to find the .git dir.
dir := filepath.Dir(z.Path)
ctx, cancel := context.WithTimeout(context.Background(), gitCommandTimeout)
defer cancel()
// `git add` first; if file is already in the index, no harm done.
add := exec.Command("git",
add := exec.CommandContext(ctx, "git",
"-C", dir,
"add", "--", z.Path,
)
if out, err := add.CombinedOutput(); err != nil {
return fmt.Errorf("git add failed: %w: %s", err, strings.TrimSpace(string(out)))
}
commit := exec.Command("git",
commit := exec.CommandContext(ctx, "git",
"-C", dir,
"-c", "user.name="+z.GitAuthorName,
"-c", "user.email="+z.GitAuthorEmail,

View File

@ -25,7 +25,7 @@ func TestZoneFile_LoadRRs(t *testing.T) {
dir := withTempZonesDir(t, "auth.example.com")
zf := openZoneFile(filepath.Join(dir, "auth.example.com.zone"), "auth.example.com.")
rrs, err := zf.loadRRs()
rrs, _, err := zf.loadRRs()
if err != nil {
t.Fatalf("loadRRs: %v", err)
}
@ -40,7 +40,7 @@ func TestZoneFile_LoadRRs(t *testing.T) {
func TestZoneFile_LoadRRs_MissingFile(t *testing.T) {
zf := openZoneFile("/nope/missing.zone", "missing.")
if _, err := zf.loadRRs(); err == nil {
if _, _, err := zf.loadRRs(); err == nil {
t.Errorf("expected error loading missing file, got nil")
}
}
@ -234,7 +234,7 @@ func TestZoneFile_WriteAtomic_RoundTrip(t *testing.T) {
zf := openZoneFile(path, "auth.example.com.")
zf.AutoCommit = false // not testing commit here
rrs, err := zf.loadRRs()
rrs, _, err := zf.loadRRs()
if err != nil {
t.Fatalf("initial load: %v", err)
}
@ -247,7 +247,7 @@ func TestZoneFile_WriteAtomic_RoundTrip(t *testing.T) {
}
// Re-load and verify.
after, err := zf.loadRRs()
after, _, err := zf.loadRRs()
if err != nil {
t.Fatalf("re-load: %v", err)
}
@ -272,7 +272,7 @@ func TestZoneFile_WriteAtomic_LeavesNoTempFile(t *testing.T) {
zf := openZoneFile(path, "auth.example.com.")
zf.AutoCommit = false
rrs, _ := zf.loadRRs()
rrs, _, _ := zf.loadRRs()
_ = zf.writeAtomic(rrs, time.Now())
// No .rfc2136-*.zone temp files should remain.
@ -282,13 +282,46 @@ func TestZoneFile_WriteAtomic_LeavesNoTempFile(t *testing.T) {
}
}
// TestZoneFile_CheckUnchanged_DetectsExternalModification covers H1:
// between loadRRs and the next writeAtomic, if anything (rsync, manual
// edit, git checkout) clobbered the file, checkUnchanged returns an
// error so the caller refuses the UPDATE instead of losing the
// external write.
func TestZoneFile_CheckUnchanged_DetectsExternalModification(t *testing.T) {
dir := withTempZonesDir(t, "auth.example.com")
path := filepath.Join(dir, "auth.example.com.zone")
zf := openZoneFile(path, "auth.example.com.")
zf.AutoCommit = false
_, snap, err := zf.loadRRs()
if err != nil {
t.Fatalf("loadRRs: %v", err)
}
// Pristine snapshot — checkUnchanged should pass.
if err := zf.checkUnchanged(snap); err != nil {
t.Errorf("pristine snapshot rejected: %v", err)
}
// Simulate an external editor clobbering the file (rsync-style:
// new content, different size, mtime advances).
time.Sleep(10 * time.Millisecond) // ensure mtime differs
if err := os.WriteFile(path, []byte("; external edit\n$ORIGIN auth.example.com.\nauth.example.com. 3600 IN SOA ns.example. admin.example. 1 60 60 60 60\n"), 0644); err != nil {
t.Fatalf("simulated external edit: %v", err)
}
if err := zf.checkUnchanged(snap); err == nil {
t.Errorf("checkUnchanged accepted modified file; expected concurrent-modification error")
}
}
func TestZoneFile_WriteAtomic_FileEndsWithNewline(t *testing.T) {
dir := withTempZonesDir(t, "auth.example.com")
path := filepath.Join(dir, "auth.example.com.zone")
zf := openZoneFile(path, "auth.example.com.")
zf.AutoCommit = false
rrs, _ := zf.loadRRs()
rrs, _, _ := zf.loadRRs()
_ = zf.writeAtomic(rrs, time.Now())
data, _ := os.ReadFile(path)