H1/H2/M1: atomicity at the file boundary
H1 — Concurrent-modification detection. loadRRs now returns a fileSnapshot capturing (mtime, size) at read time. handleUpdate calls zf.checkUnchanged(snap) immediately before writeAtomic. If anything modified the file between load and write — rsync push, manual edit, `git checkout` — the UPDATE is refused with SERVFAIL. Caddy retries with a fresh load. Protects against the CLAUDE.md-documented rsync workflow racing the plugin. H2 — Git commit-failure policy. The previous code logged at WARN and continued, breaking the documented "file + git both updated" contract. Now logs at ERROR with structured fields (zone, path, error, recovery command) so operators discover the divergence. We do NOT roll back the file write: by the time the commit fails, the auto plugin may have already noticed the new mtime and reloaded; rolling back creates more races than it solves. Recovery is `git -C <dir> status` + manual commit. M1 — exec.CommandContext with 10s timeout on git invocations. If git hangs (NFS stall, gpg-sign prompt, broken pre-commit hook waiting on stdin), the per-zone mutex would otherwise be held forever and queue all subsequent UPDATEs. gitCommandTimeout caps the hang. M2 deferred. Dropping the separate `git add` cleanly requires either `-a` (wrong scope: auto-stages all tracked modifications) or `--include` (still needs prior staging). The race window between add and commit is theoretical for our setup (single-writer plugin + occasional `git status`). M1's timeout already mitigates the worst hang case. New tests: - TestZoneFile_CheckUnchanged_DetectsExternalModification (H1)
This commit is contained in:
parent
8e421f925e
commit
93ed180d8f
28
update.go
28
update.go
@ -2,6 +2,7 @@ package rfc2136
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -80,8 +81,8 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool)
|
||||
zf.mu.Lock()
|
||||
defer zf.mu.Unlock()
|
||||
|
||||
// 3. Load the current zone contents.
|
||||
rrs, err := zf.loadRRs()
|
||||
// 3. Load the current zone contents (with file-identity snapshot).
|
||||
rrs, snap, err := zf.loadRRs()
|
||||
if err != nil {
|
||||
log.Errorf("UPDATE failed: %v", err)
|
||||
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
||||
@ -126,17 +127,34 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool)
|
||||
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
||||
}
|
||||
|
||||
// 6b. Concurrent-modification check (Hamilton H1). Before clobbering
|
||||
// the on-disk file, verify nothing changed it out from under us
|
||||
// between loadRRs and now. The per-zone mutex serializes us against
|
||||
// other in-process UPDATEs, but external editors (rsync push,
|
||||
// manual edit, `git checkout`) can race in any time. If the file
|
||||
// changed, refuse with SERVFAIL so Caddy retries on a fresh load.
|
||||
if err := zf.checkUnchanged(snap); err != nil {
|
||||
log.Warningf("UPDATE refused: %v", err)
|
||||
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
||||
}
|
||||
|
||||
// 7. Atomic write.
|
||||
if err := zf.writeAtomic(updated, now); err != nil {
|
||||
log.Errorf("UPDATE write failed: %v", err)
|
||||
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
||||
}
|
||||
|
||||
// 8. Auto-commit. Failure to commit isn't fatal to the UPDATE —
|
||||
// the on-disk state is authoritative — but we log loudly.
|
||||
// 8. Auto-commit. Failure to commit means the file is correct but
|
||||
// the git audit trail diverges (Hamilton H2). We log at ERROR with
|
||||
// structured detail so operators discover the divergence; recovery
|
||||
// is `git -C <zonesDir> status` + `git add` + manual commit. We do
|
||||
// NOT roll back the file write — by the time the commit fails, the
|
||||
// auto plugin may have already noticed the new mtime, and rolling
|
||||
// back creates more races than it solves.
|
||||
msg := summarizeUpdate(zone, r.Ns)
|
||||
if err := zf.commit(msg); err != nil {
|
||||
log.Warningf("git auto-commit failed: %v", err)
|
||||
log.Errorf("git auto-commit failed; zone file is correct but audit trail diverged: zone=%s path=%s err=%v — recover with `git -C %s status` + manual commit",
|
||||
zone, zf.Path, err, filepath.Dir(zf.Path))
|
||||
}
|
||||
|
||||
log.Infof("UPDATE applied: zone=%s prereqs=%d updates=%d msg=%q",
|
||||
|
||||
@ -70,7 +70,7 @@ func runUpdate(t *testing.T, p *RFC2136, msg *dns.Msg) (rcode int) {
|
||||
func readZoneRecords(t *testing.T, p *RFC2136, zone string) []dns.RR {
|
||||
t.Helper()
|
||||
zf := p.zones[dns.Fqdn(zone)]
|
||||
rrs, err := zf.loadRRs()
|
||||
rrs, _, err := zf.loadRRs()
|
||||
if err != nil {
|
||||
t.Fatalf("loadRRs: %v", err)
|
||||
}
|
||||
|
||||
75
zonefile.go
75
zonefile.go
@ -1,6 +1,7 @@
|
||||
package rfc2136
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@ -12,6 +13,31 @@ import (
|
||||
"github.com/miekg/dns"
|
||||
)
|
||||
|
||||
// gitCommandTimeout caps any single git invocation we shell out to. If
|
||||
// the process hangs (NFS stall, gpg-sign prompt, broken pre-commit hook
|
||||
// waiting on stdin, etc.) we must not block the caller — which holds
|
||||
// the per-zone mutex — forever. 10s is generous for a local-disk repo.
|
||||
const gitCommandTimeout = 10 * time.Second
|
||||
|
||||
// fileSnapshot captures the file-identity fingerprint at the moment we
|
||||
// last read the zone file. We compare it to the live stat just before
|
||||
// writing back to detect concurrent modification (rsync push, manual
|
||||
// edit, `git checkout`). If anything changed the file out from under
|
||||
// us, we refuse the UPDATE — Caddy will retry, and the next loadRRs
|
||||
// will see the updated state.
|
||||
type fileSnapshot struct {
|
||||
mtime time.Time
|
||||
size int64
|
||||
}
|
||||
|
||||
// matches returns true if `info` reports the same mtime and size we
|
||||
// captured. Inode comparison would be stricter, but mtime+size is
|
||||
// sufficient for the failure modes we care about (rename-over-original
|
||||
// edits change both).
|
||||
func (s fileSnapshot) matches(info os.FileInfo) bool {
|
||||
return s.mtime.Equal(info.ModTime()) && s.size == info.Size()
|
||||
}
|
||||
|
||||
// zoneFile is a file-backed authority for a single DNS zone. Replaces
|
||||
// the Phase-1.3 in-memory recordStore.
|
||||
//
|
||||
@ -74,13 +100,23 @@ func openZoneFile(path, origin string) *zoneFile {
|
||||
// loadRRs reads the zone file and parses it into an RR slice via
|
||||
// miekg/dns's zone parser. The parser handles $ORIGIN, $TTL, multi-line
|
||||
// SOA, comments, includes, etc.
|
||||
func (z *zoneFile) loadRRs() ([]dns.RR, error) {
|
||||
//
|
||||
// Returns (rrs, snapshot, error). The snapshot fingerprints the file
|
||||
// identity at read time so a subsequent writeIfUnchanged can detect
|
||||
// concurrent modification.
|
||||
func (z *zoneFile) loadRRs() ([]dns.RR, fileSnapshot, error) {
|
||||
f, err := os.Open(z.Path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open %s: %w", z.Path, err)
|
||||
return nil, fileSnapshot{}, fmt.Errorf("open %s: %w", z.Path, err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
info, err := f.Stat()
|
||||
if err != nil {
|
||||
return nil, fileSnapshot{}, fmt.Errorf("stat %s: %w", z.Path, err)
|
||||
}
|
||||
snap := fileSnapshot{mtime: info.ModTime(), size: info.Size()}
|
||||
|
||||
parser := dns.NewZoneParser(f, z.Origin, z.Path)
|
||||
parser.SetDefaultTTL(3600)
|
||||
|
||||
@ -89,12 +125,29 @@ func (z *zoneFile) loadRRs() ([]dns.RR, error) {
|
||||
rrs = append(rrs, rr)
|
||||
}
|
||||
if err := parser.Err(); err != nil {
|
||||
return nil, fmt.Errorf("parse %s: %w", z.Path, err)
|
||||
return nil, snap, fmt.Errorf("parse %s: %w", z.Path, err)
|
||||
}
|
||||
if len(rrs) == 0 {
|
||||
return nil, fmt.Errorf("%s: zero RRs parsed", z.Path)
|
||||
return nil, snap, fmt.Errorf("%s: zero RRs parsed", z.Path)
|
||||
}
|
||||
return rrs, nil
|
||||
return rrs, snap, nil
|
||||
}
|
||||
|
||||
// checkUnchanged returns nil if the on-disk file still matches the
|
||||
// captured snapshot. If the file has been modified (mtime or size
|
||||
// differs), returns an error — the caller should refuse the UPDATE
|
||||
// rather than clobber the external change.
|
||||
func (z *zoneFile) checkUnchanged(snap fileSnapshot) error {
|
||||
info, err := os.Stat(z.Path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("stat %s: %w", z.Path, err)
|
||||
}
|
||||
if !snap.matches(info) {
|
||||
return fmt.Errorf("concurrent modification detected on %s: mtime %s/%s size %d/%d",
|
||||
z.Path, snap.mtime.Format(time.RFC3339Nano), info.ModTime().Format(time.RFC3339Nano),
|
||||
snap.size, info.Size())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Lookup returns RRs in `rrs` matching (name, rtype). Both name and
|
||||
@ -345,6 +398,12 @@ func (z *zoneFile) writeAtomic(rrs []dns.RR, now time.Time) error {
|
||||
// repository directory inferred from the zone file's parent. Returns
|
||||
// nil silently if AutoCommit is false. Returns an error if the commit
|
||||
// fails; the caller decides whether to roll back the file write.
|
||||
//
|
||||
// Both git invocations run under a context with a hard timeout
|
||||
// (gitCommandTimeout). If git hangs (NFS stall, gpg-sign prompt,
|
||||
// pre-commit hook waiting on stdin), we kill it rather than block the
|
||||
// caller's per-zone mutex indefinitely. ACME storms must not be able
|
||||
// to wedge the plugin via git getting stuck.
|
||||
func (z *zoneFile) commit(message string) error {
|
||||
if !z.AutoCommit {
|
||||
return nil
|
||||
@ -352,15 +411,17 @@ func (z *zoneFile) commit(message string) error {
|
||||
// We run git from the directory containing the zone file. git will
|
||||
// walk upward to find the .git dir.
|
||||
dir := filepath.Dir(z.Path)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), gitCommandTimeout)
|
||||
defer cancel()
|
||||
// `git add` first; if file is already in the index, no harm done.
|
||||
add := exec.Command("git",
|
||||
add := exec.CommandContext(ctx, "git",
|
||||
"-C", dir,
|
||||
"add", "--", z.Path,
|
||||
)
|
||||
if out, err := add.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("git add failed: %w: %s", err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
commit := exec.Command("git",
|
||||
commit := exec.CommandContext(ctx, "git",
|
||||
"-C", dir,
|
||||
"-c", "user.name="+z.GitAuthorName,
|
||||
"-c", "user.email="+z.GitAuthorEmail,
|
||||
|
||||
@ -25,7 +25,7 @@ func TestZoneFile_LoadRRs(t *testing.T) {
|
||||
dir := withTempZonesDir(t, "auth.example.com")
|
||||
zf := openZoneFile(filepath.Join(dir, "auth.example.com.zone"), "auth.example.com.")
|
||||
|
||||
rrs, err := zf.loadRRs()
|
||||
rrs, _, err := zf.loadRRs()
|
||||
if err != nil {
|
||||
t.Fatalf("loadRRs: %v", err)
|
||||
}
|
||||
@ -40,7 +40,7 @@ func TestZoneFile_LoadRRs(t *testing.T) {
|
||||
|
||||
func TestZoneFile_LoadRRs_MissingFile(t *testing.T) {
|
||||
zf := openZoneFile("/nope/missing.zone", "missing.")
|
||||
if _, err := zf.loadRRs(); err == nil {
|
||||
if _, _, err := zf.loadRRs(); err == nil {
|
||||
t.Errorf("expected error loading missing file, got nil")
|
||||
}
|
||||
}
|
||||
@ -234,7 +234,7 @@ func TestZoneFile_WriteAtomic_RoundTrip(t *testing.T) {
|
||||
zf := openZoneFile(path, "auth.example.com.")
|
||||
zf.AutoCommit = false // not testing commit here
|
||||
|
||||
rrs, err := zf.loadRRs()
|
||||
rrs, _, err := zf.loadRRs()
|
||||
if err != nil {
|
||||
t.Fatalf("initial load: %v", err)
|
||||
}
|
||||
@ -247,7 +247,7 @@ func TestZoneFile_WriteAtomic_RoundTrip(t *testing.T) {
|
||||
}
|
||||
|
||||
// Re-load and verify.
|
||||
after, err := zf.loadRRs()
|
||||
after, _, err := zf.loadRRs()
|
||||
if err != nil {
|
||||
t.Fatalf("re-load: %v", err)
|
||||
}
|
||||
@ -272,7 +272,7 @@ func TestZoneFile_WriteAtomic_LeavesNoTempFile(t *testing.T) {
|
||||
zf := openZoneFile(path, "auth.example.com.")
|
||||
zf.AutoCommit = false
|
||||
|
||||
rrs, _ := zf.loadRRs()
|
||||
rrs, _, _ := zf.loadRRs()
|
||||
_ = zf.writeAtomic(rrs, time.Now())
|
||||
|
||||
// No .rfc2136-*.zone temp files should remain.
|
||||
@ -282,13 +282,46 @@ func TestZoneFile_WriteAtomic_LeavesNoTempFile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestZoneFile_CheckUnchanged_DetectsExternalModification covers H1:
|
||||
// between loadRRs and the next writeAtomic, if anything (rsync, manual
|
||||
// edit, git checkout) clobbered the file, checkUnchanged returns an
|
||||
// error so the caller refuses the UPDATE instead of losing the
|
||||
// external write.
|
||||
func TestZoneFile_CheckUnchanged_DetectsExternalModification(t *testing.T) {
|
||||
dir := withTempZonesDir(t, "auth.example.com")
|
||||
path := filepath.Join(dir, "auth.example.com.zone")
|
||||
zf := openZoneFile(path, "auth.example.com.")
|
||||
zf.AutoCommit = false
|
||||
|
||||
_, snap, err := zf.loadRRs()
|
||||
if err != nil {
|
||||
t.Fatalf("loadRRs: %v", err)
|
||||
}
|
||||
|
||||
// Pristine snapshot — checkUnchanged should pass.
|
||||
if err := zf.checkUnchanged(snap); err != nil {
|
||||
t.Errorf("pristine snapshot rejected: %v", err)
|
||||
}
|
||||
|
||||
// Simulate an external editor clobbering the file (rsync-style:
|
||||
// new content, different size, mtime advances).
|
||||
time.Sleep(10 * time.Millisecond) // ensure mtime differs
|
||||
if err := os.WriteFile(path, []byte("; external edit\n$ORIGIN auth.example.com.\nauth.example.com. 3600 IN SOA ns.example. admin.example. 1 60 60 60 60\n"), 0644); err != nil {
|
||||
t.Fatalf("simulated external edit: %v", err)
|
||||
}
|
||||
|
||||
if err := zf.checkUnchanged(snap); err == nil {
|
||||
t.Errorf("checkUnchanged accepted modified file; expected concurrent-modification error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestZoneFile_WriteAtomic_FileEndsWithNewline(t *testing.T) {
|
||||
dir := withTempZonesDir(t, "auth.example.com")
|
||||
path := filepath.Join(dir, "auth.example.com.zone")
|
||||
zf := openZoneFile(path, "auth.example.com.")
|
||||
zf.AutoCommit = false
|
||||
|
||||
rrs, _ := zf.loadRRs()
|
||||
rrs, _, _ := zf.loadRRs()
|
||||
_ = zf.writeAtomic(rrs, time.Now())
|
||||
|
||||
data, _ := os.ReadFile(path)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user