From 93ed180d8f755911a36ba67a8e4c69e85f42157a Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Fri, 22 May 2026 21:22:11 -0600 Subject: [PATCH] H1/H2/M1: atomicity at the file boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit H1 — Concurrent-modification detection. loadRRs now returns a fileSnapshot capturing (mtime, size) at read time. handleUpdate calls zf.checkUnchanged(snap) immediately before writeAtomic. If anything modified the file between load and write — rsync push, manual edit, `git checkout` — the UPDATE is refused with SERVFAIL. Caddy retries with a fresh load. Protects against the CLAUDE.md-documented rsync workflow racing the plugin. H2 — Git commit-failure policy. The previous code logged at WARN and continued, breaking the documented "file + git both updated" contract. Now logs at ERROR with structured fields (zone, path, error, recovery command) so operators discover the divergence. We do NOT roll back the file write: by the time the commit fails, the auto plugin may have already noticed the new mtime and reloaded; rolling back creates more races than it solves. Recovery is `git -C status` + manual commit. M1 — exec.CommandContext with 10s timeout on git invocations. If git hangs (NFS stall, gpg-sign prompt, broken pre-commit hook waiting on stdin), the per-zone mutex would otherwise be held forever and queue all subsequent UPDATEs. gitCommandTimeout caps the hang. M2 deferred. Dropping the separate `git add` cleanly requires either `-a` (wrong scope: auto-stages all tracked modifications) or `--include` (still needs prior staging). The race window between add and commit is theoretical for our setup (single-writer plugin + occasional `git status`). M1's timeout already mitigates the worst hang case. New tests: - TestZoneFile_CheckUnchanged_DetectsExternalModification (H1) --- update.go | 28 ++++++++++++++---- update_test.go | 2 +- zonefile.go | 75 +++++++++++++++++++++++++++++++++++++++++++----- zonefile_test.go | 45 +++++++++++++++++++++++++---- 4 files changed, 131 insertions(+), 19 deletions(-) diff --git a/update.go b/update.go index 7d7d45d..0409d9c 100644 --- a/update.go +++ b/update.go @@ -2,6 +2,7 @@ package rfc2136 import ( "fmt" + "path/filepath" "strings" "time" @@ -80,8 +81,8 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool) zf.mu.Lock() defer zf.mu.Unlock() - // 3. Load the current zone contents. - rrs, err := zf.loadRRs() + // 3. Load the current zone contents (with file-identity snapshot). + rrs, snap, err := zf.loadRRs() if err != nil { log.Errorf("UPDATE failed: %v", err) return p.updateResp(w, resp, dns.RcodeServerFailure) @@ -126,17 +127,34 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool) return p.updateResp(w, resp, dns.RcodeServerFailure) } + // 6b. Concurrent-modification check (Hamilton H1). Before clobbering + // the on-disk file, verify nothing changed it out from under us + // between loadRRs and now. The per-zone mutex serializes us against + // other in-process UPDATEs, but external editors (rsync push, + // manual edit, `git checkout`) can race in any time. If the file + // changed, refuse with SERVFAIL so Caddy retries on a fresh load. + if err := zf.checkUnchanged(snap); err != nil { + log.Warningf("UPDATE refused: %v", err) + return p.updateResp(w, resp, dns.RcodeServerFailure) + } + // 7. Atomic write. if err := zf.writeAtomic(updated, now); err != nil { log.Errorf("UPDATE write failed: %v", err) return p.updateResp(w, resp, dns.RcodeServerFailure) } - // 8. Auto-commit. Failure to commit isn't fatal to the UPDATE — - // the on-disk state is authoritative — but we log loudly. + // 8. Auto-commit. Failure to commit means the file is correct but + // the git audit trail diverges (Hamilton H2). We log at ERROR with + // structured detail so operators discover the divergence; recovery + // is `git -C status` + `git add` + manual commit. We do + // NOT roll back the file write — by the time the commit fails, the + // auto plugin may have already noticed the new mtime, and rolling + // back creates more races than it solves. msg := summarizeUpdate(zone, r.Ns) if err := zf.commit(msg); err != nil { - log.Warningf("git auto-commit failed: %v", err) + log.Errorf("git auto-commit failed; zone file is correct but audit trail diverged: zone=%s path=%s err=%v — recover with `git -C %s status` + manual commit", + zone, zf.Path, err, filepath.Dir(zf.Path)) } log.Infof("UPDATE applied: zone=%s prereqs=%d updates=%d msg=%q", diff --git a/update_test.go b/update_test.go index c47bd8c..4f56c1a 100644 --- a/update_test.go +++ b/update_test.go @@ -70,7 +70,7 @@ func runUpdate(t *testing.T, p *RFC2136, msg *dns.Msg) (rcode int) { func readZoneRecords(t *testing.T, p *RFC2136, zone string) []dns.RR { t.Helper() zf := p.zones[dns.Fqdn(zone)] - rrs, err := zf.loadRRs() + rrs, _, err := zf.loadRRs() if err != nil { t.Fatalf("loadRRs: %v", err) } diff --git a/zonefile.go b/zonefile.go index dca4e5e..a5d6ff9 100644 --- a/zonefile.go +++ b/zonefile.go @@ -1,6 +1,7 @@ package rfc2136 import ( + "context" "fmt" "os" "os/exec" @@ -12,6 +13,31 @@ import ( "github.com/miekg/dns" ) +// gitCommandTimeout caps any single git invocation we shell out to. If +// the process hangs (NFS stall, gpg-sign prompt, broken pre-commit hook +// waiting on stdin, etc.) we must not block the caller — which holds +// the per-zone mutex — forever. 10s is generous for a local-disk repo. +const gitCommandTimeout = 10 * time.Second + +// fileSnapshot captures the file-identity fingerprint at the moment we +// last read the zone file. We compare it to the live stat just before +// writing back to detect concurrent modification (rsync push, manual +// edit, `git checkout`). If anything changed the file out from under +// us, we refuse the UPDATE — Caddy will retry, and the next loadRRs +// will see the updated state. +type fileSnapshot struct { + mtime time.Time + size int64 +} + +// matches returns true if `info` reports the same mtime and size we +// captured. Inode comparison would be stricter, but mtime+size is +// sufficient for the failure modes we care about (rename-over-original +// edits change both). +func (s fileSnapshot) matches(info os.FileInfo) bool { + return s.mtime.Equal(info.ModTime()) && s.size == info.Size() +} + // zoneFile is a file-backed authority for a single DNS zone. Replaces // the Phase-1.3 in-memory recordStore. // @@ -74,13 +100,23 @@ func openZoneFile(path, origin string) *zoneFile { // loadRRs reads the zone file and parses it into an RR slice via // miekg/dns's zone parser. The parser handles $ORIGIN, $TTL, multi-line // SOA, comments, includes, etc. -func (z *zoneFile) loadRRs() ([]dns.RR, error) { +// +// Returns (rrs, snapshot, error). The snapshot fingerprints the file +// identity at read time so a subsequent writeIfUnchanged can detect +// concurrent modification. +func (z *zoneFile) loadRRs() ([]dns.RR, fileSnapshot, error) { f, err := os.Open(z.Path) if err != nil { - return nil, fmt.Errorf("open %s: %w", z.Path, err) + return nil, fileSnapshot{}, fmt.Errorf("open %s: %w", z.Path, err) } defer f.Close() + info, err := f.Stat() + if err != nil { + return nil, fileSnapshot{}, fmt.Errorf("stat %s: %w", z.Path, err) + } + snap := fileSnapshot{mtime: info.ModTime(), size: info.Size()} + parser := dns.NewZoneParser(f, z.Origin, z.Path) parser.SetDefaultTTL(3600) @@ -89,12 +125,29 @@ func (z *zoneFile) loadRRs() ([]dns.RR, error) { rrs = append(rrs, rr) } if err := parser.Err(); err != nil { - return nil, fmt.Errorf("parse %s: %w", z.Path, err) + return nil, snap, fmt.Errorf("parse %s: %w", z.Path, err) } if len(rrs) == 0 { - return nil, fmt.Errorf("%s: zero RRs parsed", z.Path) + return nil, snap, fmt.Errorf("%s: zero RRs parsed", z.Path) } - return rrs, nil + return rrs, snap, nil +} + +// checkUnchanged returns nil if the on-disk file still matches the +// captured snapshot. If the file has been modified (mtime or size +// differs), returns an error — the caller should refuse the UPDATE +// rather than clobber the external change. +func (z *zoneFile) checkUnchanged(snap fileSnapshot) error { + info, err := os.Stat(z.Path) + if err != nil { + return fmt.Errorf("stat %s: %w", z.Path, err) + } + if !snap.matches(info) { + return fmt.Errorf("concurrent modification detected on %s: mtime %s/%s size %d/%d", + z.Path, snap.mtime.Format(time.RFC3339Nano), info.ModTime().Format(time.RFC3339Nano), + snap.size, info.Size()) + } + return nil } // Lookup returns RRs in `rrs` matching (name, rtype). Both name and @@ -345,6 +398,12 @@ func (z *zoneFile) writeAtomic(rrs []dns.RR, now time.Time) error { // repository directory inferred from the zone file's parent. Returns // nil silently if AutoCommit is false. Returns an error if the commit // fails; the caller decides whether to roll back the file write. +// +// Both git invocations run under a context with a hard timeout +// (gitCommandTimeout). If git hangs (NFS stall, gpg-sign prompt, +// pre-commit hook waiting on stdin), we kill it rather than block the +// caller's per-zone mutex indefinitely. ACME storms must not be able +// to wedge the plugin via git getting stuck. func (z *zoneFile) commit(message string) error { if !z.AutoCommit { return nil @@ -352,15 +411,17 @@ func (z *zoneFile) commit(message string) error { // We run git from the directory containing the zone file. git will // walk upward to find the .git dir. dir := filepath.Dir(z.Path) + ctx, cancel := context.WithTimeout(context.Background(), gitCommandTimeout) + defer cancel() // `git add` first; if file is already in the index, no harm done. - add := exec.Command("git", + add := exec.CommandContext(ctx, "git", "-C", dir, "add", "--", z.Path, ) if out, err := add.CombinedOutput(); err != nil { return fmt.Errorf("git add failed: %w: %s", err, strings.TrimSpace(string(out))) } - commit := exec.Command("git", + commit := exec.CommandContext(ctx, "git", "-C", dir, "-c", "user.name="+z.GitAuthorName, "-c", "user.email="+z.GitAuthorEmail, diff --git a/zonefile_test.go b/zonefile_test.go index e576b30..a7c5582 100644 --- a/zonefile_test.go +++ b/zonefile_test.go @@ -25,7 +25,7 @@ func TestZoneFile_LoadRRs(t *testing.T) { dir := withTempZonesDir(t, "auth.example.com") zf := openZoneFile(filepath.Join(dir, "auth.example.com.zone"), "auth.example.com.") - rrs, err := zf.loadRRs() + rrs, _, err := zf.loadRRs() if err != nil { t.Fatalf("loadRRs: %v", err) } @@ -40,7 +40,7 @@ func TestZoneFile_LoadRRs(t *testing.T) { func TestZoneFile_LoadRRs_MissingFile(t *testing.T) { zf := openZoneFile("/nope/missing.zone", "missing.") - if _, err := zf.loadRRs(); err == nil { + if _, _, err := zf.loadRRs(); err == nil { t.Errorf("expected error loading missing file, got nil") } } @@ -234,7 +234,7 @@ func TestZoneFile_WriteAtomic_RoundTrip(t *testing.T) { zf := openZoneFile(path, "auth.example.com.") zf.AutoCommit = false // not testing commit here - rrs, err := zf.loadRRs() + rrs, _, err := zf.loadRRs() if err != nil { t.Fatalf("initial load: %v", err) } @@ -247,7 +247,7 @@ func TestZoneFile_WriteAtomic_RoundTrip(t *testing.T) { } // Re-load and verify. - after, err := zf.loadRRs() + after, _, err := zf.loadRRs() if err != nil { t.Fatalf("re-load: %v", err) } @@ -272,7 +272,7 @@ func TestZoneFile_WriteAtomic_LeavesNoTempFile(t *testing.T) { zf := openZoneFile(path, "auth.example.com.") zf.AutoCommit = false - rrs, _ := zf.loadRRs() + rrs, _, _ := zf.loadRRs() _ = zf.writeAtomic(rrs, time.Now()) // No .rfc2136-*.zone temp files should remain. @@ -282,13 +282,46 @@ func TestZoneFile_WriteAtomic_LeavesNoTempFile(t *testing.T) { } } +// TestZoneFile_CheckUnchanged_DetectsExternalModification covers H1: +// between loadRRs and the next writeAtomic, if anything (rsync, manual +// edit, git checkout) clobbered the file, checkUnchanged returns an +// error so the caller refuses the UPDATE instead of losing the +// external write. +func TestZoneFile_CheckUnchanged_DetectsExternalModification(t *testing.T) { + dir := withTempZonesDir(t, "auth.example.com") + path := filepath.Join(dir, "auth.example.com.zone") + zf := openZoneFile(path, "auth.example.com.") + zf.AutoCommit = false + + _, snap, err := zf.loadRRs() + if err != nil { + t.Fatalf("loadRRs: %v", err) + } + + // Pristine snapshot — checkUnchanged should pass. + if err := zf.checkUnchanged(snap); err != nil { + t.Errorf("pristine snapshot rejected: %v", err) + } + + // Simulate an external editor clobbering the file (rsync-style: + // new content, different size, mtime advances). + time.Sleep(10 * time.Millisecond) // ensure mtime differs + if err := os.WriteFile(path, []byte("; external edit\n$ORIGIN auth.example.com.\nauth.example.com. 3600 IN SOA ns.example. admin.example. 1 60 60 60 60\n"), 0644); err != nil { + t.Fatalf("simulated external edit: %v", err) + } + + if err := zf.checkUnchanged(snap); err == nil { + t.Errorf("checkUnchanged accepted modified file; expected concurrent-modification error") + } +} + func TestZoneFile_WriteAtomic_FileEndsWithNewline(t *testing.T) { dir := withTempZonesDir(t, "auth.example.com") path := filepath.Join(dir, "auth.example.com.zone") zf := openZoneFile(path, "auth.example.com.") zf.AutoCommit = false - rrs, _ := zf.loadRRs() + rrs, _, _ := zf.loadRRs() _ = zf.writeAtomic(rrs, time.Now()) data, _ := os.ReadFile(path)