coredns-rfc2136/zonefile.go
Ryan Malloy 93ed180d8f H1/H2/M1: atomicity at the file boundary
H1 — Concurrent-modification detection. loadRRs now returns a
fileSnapshot capturing (mtime, size) at read time. handleUpdate calls
zf.checkUnchanged(snap) immediately before writeAtomic. If anything
modified the file between load and write — rsync push, manual edit,
`git checkout` — the UPDATE is refused with SERVFAIL. Caddy retries
with a fresh load. Protects against the CLAUDE.md-documented rsync
workflow racing the plugin.

H2 — Git commit-failure policy. The previous code logged at WARN and
continued, breaking the documented "file + git both updated" contract.
Now logs at ERROR with structured fields (zone, path, error, recovery
command) so operators discover the divergence. We do NOT roll back the
file write: by the time the commit fails, the auto plugin may have
already noticed the new mtime and reloaded; rolling back creates more
races than it solves. Recovery is `git -C <dir> status` + manual
commit.

M1 — exec.CommandContext with 10s timeout on git invocations. If git
hangs (NFS stall, gpg-sign prompt, broken pre-commit hook waiting on
stdin), the per-zone mutex would otherwise be held forever and queue
all subsequent UPDATEs. gitCommandTimeout caps the hang.

M2 deferred. Dropping the separate `git add` cleanly requires either
`-a` (wrong scope: auto-stages all tracked modifications) or `--include`
(still needs prior staging). The race window between add and commit is
theoretical for our setup (single-writer plugin + occasional `git
status`). M1's timeout already mitigates the worst hang case.

New tests:
- TestZoneFile_CheckUnchanged_DetectsExternalModification (H1)
2026-05-22 21:22:11 -06:00

435 lines
14 KiB
Go

package rfc2136
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/miekg/dns"
)
// gitCommandTimeout bounds the git work done on behalf of one commit call.
// commit creates a single context with this timeout and hands it to both
// `git add` and `git commit`, so the two invocations share the budget
// rather than getting 10s each. If git hangs (NFS stall, gpg-sign prompt,
// broken pre-commit hook waiting on stdin, etc.) we must not block the
// caller — which holds the per-zone mutex — forever. 10s is generous for
// a local-disk repo.
const gitCommandTimeout = 10 * time.Second
// fileSnapshot is the fingerprint of the zone file taken when it was
// last read: its modification time and its size in bytes. Comparing the
// fingerprint with a fresh stat just before writing reveals whether
// something else (rsync push, manual edit, `git checkout`) touched the
// file in the meantime, in which case the UPDATE is refused instead of
// clobbering that change.
type fileSnapshot struct {
	mtime time.Time
	size  int64
}

// matches reports whether info describes a file with the same size and
// the same modification instant as the snapshot. Only these two
// attributes are compared; inode identity is not considered.
func (s fileSnapshot) matches(info os.FileInfo) bool {
	if s.size != info.Size() {
		return false
	}
	return info.ModTime().Equal(s.mtime)
}
// zoneFile is a file-backed authority for a single DNS zone. Replaces
// the Phase-1.3 in-memory recordStore.
//
// On every UPDATE, the file is read fully into memory as parsed RRs
// (loadRRs), the requested adds/deletes are applied to that slice, the
// SOA serial is bumped (bumpSerial: YYMMDD*10000+NNNN), and the file is
// rewritten via an atomic temp-file rename (writeAtomic). CoreDNS's
// `auto` plugin notices the mtime change within its reload interval
// (~30s) and re-serves the zone. HE eventually pulls on its SOA refresh.
//
// Concurrency: per-zone mutex serializes RFC 2136 UPDATEs against
// each other and against the plugin's own reads. The methods in this
// file do not take the lock themselves; callers are expected to hold it.
// The mutex does NOT protect against external editors (e.g. a human
// running an editor while the plugin is mid-write). checkUnchanged
// gives best-effort detection of such edits by comparing the file's
// mtime and size with the snapshot from loadRRs, but a change landing
// between that check and the rename is still overwritten. The typical
// mitigation is to do manual edits when no UPDATEs are in flight (or
// just accept the rare race — the worst case is one lost manual edit,
// easily restored from git).
type zoneFile struct {
	// mu is the per-zone lock described above. Because of it a zoneFile
	// must be used through a pointer and never copied.
	mu sync.Mutex
	// Path is the absolute path to the zone file on disk.
	Path string
	// Origin is the canonical (lowercase, trailing dot) zone apex.
	Origin string
	// AutoCommit, when true, runs `git add <path> && git commit ...`
	// after every successful write. openZoneFile sets it to true (per
	// the chosen architecture: every dynamic update should leave a git
	// trail); the zero value of the struct leaves it false.
	AutoCommit bool
	// GitAuthorName and GitAuthorEmail are passed to `git commit`
	// via -c user.name and -c user.email so the commits are
	// attributable without depending on the system git config.
	GitAuthorName  string
	GitAuthorEmail string
}
// canon converts a DNS name to the form used for every comparison in
// this package: fully qualified (trailing dot appended when missing)
// and lowercased. Names handed over by miekg/dns may lack the trailing
// dot, so all lookups route their names through here first.
func canon(name string) string {
	fqdn := dns.Fqdn(name)
	return strings.ToLower(fqdn)
}
// openZoneFile builds a zoneFile handle for the zone rooted at origin
// and stored at path. Nothing is read from disk here; each operation
// loads the file itself, so a handle never carries stale content.
//
// The handle starts with AutoCommit enabled and the plugin's default
// git author identity; origin is stored in canonical form.
func openZoneFile(path, origin string) *zoneFile {
	zf := new(zoneFile)
	zf.Path = path
	zf.Origin = canon(origin)
	zf.AutoCommit = true
	zf.GitAuthorName = "coredns-rfc2136"
	zf.GitAuthorEmail = "rfc2136@coredns"
	return zf
}
// loadRRs opens the zone file at z.Path and parses it with miekg/dns's
// zone parser, using z.Origin as the origin and 3600s as the default
// TTL for records that do not carry one.
//
// Returns (rrs, snapshot, error). The snapshot holds the mtime and size
// obtained from the open file handle before parsing starts, so a later
// checkUnchanged can tell whether the file was modified after this
// read. Open and stat failures return a zero snapshot; a parse error or
// a file that yields no records returns the captured snapshot together
// with the error and a nil slice.
//
// NOTE(review): SetIncludeAllowed is not called on the parser, so
// $INCLUDE directives are presumably rejected — confirm against the
// miekg/dns version in use.
func (z *zoneFile) loadRRs() ([]dns.RR, fileSnapshot, error) {
	var none fileSnapshot

	fh, err := os.Open(z.Path)
	if err != nil {
		return nil, none, fmt.Errorf("open %s: %w", z.Path, err)
	}
	defer fh.Close()

	// Fingerprint first, parse second: a modification that happens
	// while we are reading then shows up as a mismatch later.
	st, err := fh.Stat()
	if err != nil {
		return nil, none, fmt.Errorf("stat %s: %w", z.Path, err)
	}
	snap := fileSnapshot{mtime: st.ModTime(), size: st.Size()}

	zp := dns.NewZoneParser(fh, z.Origin, z.Path)
	zp.SetDefaultTTL(3600)

	var rrs []dns.RR
	for {
		rr, more := zp.Next()
		if !more {
			break
		}
		rrs = append(rrs, rr)
	}

	switch perr := zp.Err(); {
	case perr != nil:
		return nil, snap, fmt.Errorf("parse %s: %w", z.Path, perr)
	case len(rrs) == 0:
		return nil, snap, fmt.Errorf("%s: zero RRs parsed", z.Path)
	}
	return rrs, snap, nil
}
// checkUnchanged stats z.Path and compares the result with snap.
// It returns nil when mtime and size are both unchanged. A stat failure
// or any difference yields an error; the mismatch error lists the
// snapshot value followed by the live value for both attributes. The
// caller is expected to refuse the UPDATE on error rather than
// overwrite the external change.
func (z *zoneFile) checkUnchanged(snap fileSnapshot) error {
	live, err := os.Stat(z.Path)
	if err != nil {
		return fmt.Errorf("stat %s: %w", z.Path, err)
	}
	if snap.matches(live) {
		return nil
	}
	was := snap.mtime.Format(time.RFC3339Nano)
	now := live.ModTime().Format(time.RFC3339Nano)
	return fmt.Errorf("concurrent modification detected on %s: mtime %s/%s size %d/%d",
		z.Path, was, now, snap.size, live.Size())
}
// lookupIn collects the records in rrs whose owner equals name and
// whose type equals rtype. Owner names on both sides are canonicalised
// before comparing. The function is stateless and works on whatever
// slice it is given, so a caller can run several queries against one
// loaded copy of the zone. The result is nil when nothing matches.
func lookupIn(rrs []dns.RR, name string, rtype uint16) []dns.RR {
	want := canon(name)
	var found []dns.RR
	for i := range rrs {
		h := rrs[i].Header()
		if h.Rrtype != rtype || canon(h.Name) != want {
			continue
		}
		found = append(found, rrs[i])
	}
	return found
}
// nameExistsIn reports whether at least one record in rrs is owned by
// name, comparing canonical forms. The record type is irrelevant.
func nameExistsIn(rrs []dns.RR, name string) bool {
	owner := canon(name)
	for i := 0; i < len(rrs); i++ {
		if canon(rrs[i].Header().Name) == owner {
			return true
		}
	}
	return false
}
// removeRRsetFrom drops the whole RRset identified by (name, rtype) and
// returns the remaining records in their original order. The result is
// built in fresh storage, so the input slice is left untouched.
func removeRRsetFrom(rrs []dns.RR, name string, rtype uint16) []dns.RR {
	want := canon(name)
	// Zero-capacity reslice: the first append allocates a new array.
	kept := rrs[:0:0]
	for _, rr := range rrs {
		h := rr.Header()
		if inSet := h.Rrtype == rtype && canon(h.Name) == want; !inSet {
			kept = append(kept, rr)
		}
	}
	return kept
}
// removeNameFrom drops every record owned by name, whatever its type,
// and returns the remaining records in their original order. The result
// is built in fresh storage, so the input slice is left untouched.
func removeNameFrom(rrs []dns.RR, name string) []dns.RR {
	owner := canon(name)
	// Zero-capacity reslice: the first append allocates a new array.
	kept := rrs[:0:0]
	for _, rr := range rrs {
		if canon(rr.Header().Name) != owner {
			kept = append(kept, rr)
		}
	}
	return kept
}
// removeRRFrom returns rrs minus the first RR that has the same owner,
// type and rdata as target. At most one record is removed; the others
// keep their order, and the input slice is not modified.
//
// TTL and class are deliberately ignored, and owner names are compared
// case-insensitively. An RFC 2136 "delete an RR from an RRset" update
// carries TTL 0 and class NONE, so comparing the full presentation
// string (which includes both) would fail to find the stored record
// unless the caller had rewritten the header to match exactly.
func removeRRFrom(rrs []dns.RR, target dns.RR) []dns.RR {
	// dns.IsDuplicate ignores TTL but not class, so compare against a
	// private copy whose class is aligned with each candidate. Copying
	// keeps the caller's RR header untouched.
	probe := dns.Copy(target)

	kept := rrs[:0:0]
	removed := false
	for _, rr := range rrs {
		if !removed {
			probe.Header().Class = rr.Header().Class
			if dns.IsDuplicate(rr, probe) {
				removed = true
				continue
			}
		}
		kept = append(kept, rr)
	}
	return kept
}
// addRRTo adds rr to rrs with the de-dupe semantics of RFC 2136
// §3.4.2.2: when the zone already holds a record with the same owner,
// class, type and rdata, that record is replaced by rr (so a changed
// TTL takes effect) instead of a second copy being appended. Otherwise
// rr is appended.
//
// Duplicates are detected with dns.IsDuplicate, which ignores the TTL
// and compares names case-insensitively. Comparing presentation strings
// would treat "same rdata, different TTL" as a new record and leave the
// zone with duplicate rdata.
//
// On replacement the element is overwritten in place and the same slice
// is returned; callers should always use the returned slice.
func addRRTo(rrs []dns.RR, rr dns.RR) []dns.RR {
	for i, existing := range rrs {
		if dns.IsDuplicate(existing, rr) {
			rrs[i] = rr
			return rrs
		}
	}
	return append(rrs, rr)
}
// serialCounterMul is the multiplier between the date prefix and the
// counter in our SOA-serial encoding. The format is YYMMDD*10000 + NNNN,
// leaving four decimal digits for the counter; bumpSerial uses
// NNNN ∈ [0001, 9999], i.e. up to 9999 serials per encoded day.
//
// The 2-digit year keeps the value within uint32 (the RFC 1035 ceiling
// for SOA serials) for now: for 2026-05-22 the maximum serial
// 2,605,229,999 is well below 2^32-1 = 4,294,967,295. This only holds
// while YY <= 42: 421231*10000+9999 = 4,212,319,999 still fits, but
// 430101*10000 = 4,301,010,000 does not. A 4-digit year (e.g.,
// 20260522*10000) would overflow uint32 immediately.
const serialCounterMul = 10000
// bumpSerial advances the SOA's serial in CalVer YYMMDD*10000+NNNN form.
//
// Behaviour:
//   - If cur encodes today (or a future-encoded date from a prior NNNN
//     rollover), increment NNNN. On NNNN=9999, roll forward to the next
//     encoded day with NNNN=0001. The encoded date drifts ahead of wall
//     time during heavy churn and catches back up on quiet days; serial
//     numbers stay strictly monotonic, which is the only DNS hard
//     requirement.
//   - Otherwise (older serial; including legacy YYYYMMDDNN-format serials
//     left over from before this format change), jump to today*10000+1.
//     Legacy serials migrate automatically here: a value like 2026052299
//     (~2.026B) is numerically smaller than today's new-format minimum
//     2605220001 (~2.605B), so it falls to this branch and gets rewritten
//     in-place. The new value is strictly greater, so AXFR receivers (HE
//     et al.) treat it as a normal forward bump and pull cleanly.
//
// The SOA is found by type (the first *dns.SOA in rrs is used; there
// should be exactly one) and mutated in place through its pointer, so
// the caller's slice reflects the new serial. Nothing but an error is
// returned: nil on success, non-nil when rrs contains no SOA or the
// encoded date cannot be re-parsed during a rollover.
//
// "Today" is derived from now in UTC. The date comparison is a plain
// string comparison of the six-digit prefixes.
//
// NOTE(review): the uint32 conversions truncate silently. With a
// two-digit year the today-based candidate exceeds 2^32-1 from
// 2043-01-01 on (430101*10000); the truncated candidate is then small,
// so the "don't regress" branch would presumably take over and the
// serial would advance by one per update — confirm before relying on
// the date encoding that far out.
func bumpSerial(rrs []dns.RR, now time.Time) error {
	var soa *dns.SOA
	for _, rr := range rrs {
		if s, ok := rr.(*dns.SOA); ok {
			soa = s
			break
		}
	}
	if soa == nil {
		return fmt.Errorf("zone has no SOA record")
	}
	today := now.UTC().Format("060102") // YYMMDD
	// Zero-pad to ten digits so the date prefix and the counter always
	// sit at fixed offsets, whatever the magnitude of the serial.
	cur := fmt.Sprintf("%010d", soa.Serial)
	// Try the new-format read: cur[:6] is YYMMDD, cur[6:10] is NNNN.
	// We only honour this when the encoded date is today or later — an
	// older encoded date means a normal new-day reset (or a legacy
	// serial that happens to look like a valid YYMMDD prefix but is in
	// the past, which is the same handling: jump to today).
	if curDate := cur[:6]; isValidYYMMDD(curDate) && curDate >= today {
		nnnn := atoi(cur[6:10])
		if nnnn < 9999 {
			soa.Serial = uint32(parseUint(curDate)*serialCounterMul + uint64(nnnn+1))
			return nil
		}
		// NNNN=9999: roll to next encoded day, NNNN=0001.
		d, err := time.Parse("060102", curDate)
		if err != nil {
			return fmt.Errorf("serial date %q unparseable: %w", curDate, err)
		}
		next := d.AddDate(0, 0, 1).Format("060102")
		soa.Serial = uint32(parseUint(next)*serialCounterMul + 1)
		return nil
	}
	// Older or unparseable: jump to today*10000+1. Migration path for
	// legacy YYYYMMDDNN serials lives here.
	candidate := uint32(parseUint(today)*serialCounterMul + 1)
	if candidate <= soa.Serial {
		// Defensive: don't regress. If something has somehow
		// provisioned a serial >= today's new-format candidate (e.g.,
		// far-future serial from a hand-edit), just +1 to advance.
		soa.Serial++
		return nil
	}
	soa.Serial = candidate
	return nil
}
// isValidYYMMDD reports whether s is exactly six characters long and
// parses as a calendar date in YYMMDD form. Any two-digit year is
// accepted; the month and day must form a real date.
func isValidYYMMDD(s string) bool {
	const layout = "060102"
	if len(s) == len(layout) {
		_, err := time.Parse(layout, s)
		return err == nil
	}
	return false
}
// atoi converts a string of decimal digits to an int without any
// validation or error reporting. It is meant for substrings already
// known to be all digits, such as the four-digit counter sliced out of
// a zero-padded serial. The empty string yields 0.
func atoi(s string) int {
	total := 0
	for _, r := range s {
		digit := int(r - '0')
		total = total*10 + digit
	}
	return total
}
// parseUint converts a string of decimal digits to a uint64. It does no
// validation and reports no errors, so it is only suitable for inputs
// produced inside this package (formatted dates, zero-padded serials).
// The empty string yields 0.
func parseUint(s string) uint64 {
	var acc uint64
	for _, r := range s {
		acc *= 10
		acc += uint64(r - '0')
	}
	return acc
}
// writeAtomic serializes rrs to a temp file in the same directory as
// z.Path, syncs it, then renames it over the destination. POSIX
// guarantees atomic rename on local filesystems, so a partial write can
// never leave a corrupt zone file on disk. If any step before the
// rename fails, the temp file is closed and removed and the existing
// zone file is left as it was.
//
// Permissions: os.CreateTemp always creates the file with mode 0600, so
// renaming it into place would silently replace whatever mode the zone
// file had (typically 0644) and could lock out other readers. When the
// destination already exists its permission bits are copied onto the
// temp file before the rename. This is best-effort: if the destination
// cannot be stat'ed, or the chmod is rejected, the write proceeds with
// the temp file's default mode.
//
// Format: a three-line header (write timestamp in UTC, zone name,
// $ORIGIN), then one RR per line in rr.String() presentation form.
// Comments and multi-line SOA formatting from the original file are
// NOT preserved (v1 limitation; sophisticated comment preservation can
// land in v2). The header makes it obvious in `git log` what touched
// the file.
//
// now is used only for the header timestamp. The returned error wraps
// the failing step (create temp, write, sync, close, rename).
func (z *zoneFile) writeAtomic(rrs []dns.RR, now time.Time) error {
	tmp, err := os.CreateTemp(filepath.Dir(z.Path), ".rfc2136-*.zone")
	if err != nil {
		return fmt.Errorf("create temp: %w", err)
	}
	tmpPath := tmp.Name()

	// Single cleanup path for every early return: close the handle if
	// it is still open and remove the temp file unless the rename has
	// already consumed it. Errors are ignored because we are already
	// returning a more relevant one.
	closed, renamed := false, false
	defer func() {
		if !closed {
			_ = tmp.Close()
		}
		if !renamed {
			_ = os.Remove(tmpPath)
		}
	}()

	if info, statErr := os.Stat(z.Path); statErr == nil {
		// Best-effort by design: some filesystems reject chmod, and
		// that must not turn a previously working write into a failure.
		_ = tmp.Chmod(info.Mode().Perm())
	}

	// Assemble the whole file in memory and hand it to the kernel in
	// one write instead of one syscall per record.
	var buf strings.Builder
	fmt.Fprintf(&buf, "; Auto-written by coredns-rfc2136 on %s\n; Zone: %s\n$ORIGIN %s\n",
		now.UTC().Format(time.RFC3339), z.Origin, z.Origin)
	for _, rr := range rrs {
		buf.WriteString(rr.String())
		buf.WriteByte('\n')
	}
	if _, err := tmp.WriteString(buf.String()); err != nil {
		return fmt.Errorf("write zone data: %w", err)
	}

	// Flush file contents to stable storage before the rename makes
	// them visible under the real name.
	if err := tmp.Sync(); err != nil {
		return fmt.Errorf("sync: %w", err)
	}
	closed = true // Close is attempted exactly once, below.
	if err := tmp.Close(); err != nil {
		return fmt.Errorf("close: %w", err)
	}
	if err := os.Rename(tmpPath, z.Path); err != nil {
		return fmt.Errorf("rename %s -> %s: %w", tmpPath, z.Path, err)
	}
	renamed = true
	return nil
}
// commit stages and commits the zone file via git, using message as the
// commit message. Returns nil without doing anything if AutoCommit is
// false. Returns an error, including git's trimmed combined output, if
// either `git add` or `git commit` fails; the caller decides whether to
// roll back the file write.
//
// git is run with -C <directory of the zone file> and walks upward from
// there to find the repository. The author identity is passed with
// -c user.name / -c user.email so it does not depend on the system git
// configuration.
//
// Both invocations share one context with a hard timeout
// (gitCommandTimeout). If git hangs (NFS stall, gpg-sign prompt,
// pre-commit hook waiting on stdin), we kill it rather than block the
// caller's per-zone mutex indefinitely. ACME storms must not be able
// to wedge the plugin via git getting stuck.
//
// Killing the git process alone is not enough: CombinedOutput keeps
// reading until every process holding the output pipe has closed it,
// and a hook or gpg helper spawned by git inherits that pipe. WaitDelay
// makes os/exec close the pipe shortly after the context expires, so
// the timeout really bounds how long this function can block.
func (z *zoneFile) commit(message string) error {
	if !z.AutoCommit {
		return nil
	}

	// Grace period between the context expiring and os/exec forcibly
	// closing the command's output pipe.
	const pipeCloseGrace = 2 * time.Second

	dir := filepath.Dir(z.Path)
	ctx, cancel := context.WithTimeout(context.Background(), gitCommandTimeout)
	defer cancel()

	// runGit executes `git -C dir <args...>` under the shared context and
	// labels any failure with step ("add" or "commit").
	runGit := func(step string, args ...string) error {
		cmd := exec.CommandContext(ctx, "git", append([]string{"-C", dir}, args...)...)
		cmd.WaitDelay = pipeCloseGrace
		out, err := cmd.CombinedOutput()
		if err != nil {
			return fmt.Errorf("git %s failed: %w: %s", step, err, strings.TrimSpace(string(out)))
		}
		return nil
	}

	// `git add` first; if file is already in the index, no harm done.
	if err := runGit("add", "add", "--", z.Path); err != nil {
		return err
	}
	return runGit("commit",
		"-c", "user.name="+z.GitAuthorName,
		"-c", "user.email="+z.GitAuthorEmail,
		"commit", "-q", "-m", message, "--", z.Path,
	)
}