coredns-rfc2136/update.go
Ryan Malloy 93ed180d8f H1/H2/M1: atomicity at the file boundary
H1 — Concurrent-modification detection. loadRRs now returns a
fileSnapshot capturing (mtime, size) at read time. handleUpdate calls
zf.checkUnchanged(snap) immediately before writeAtomic. If anything
modified the file between load and write — rsync push, manual edit,
`git checkout` — the UPDATE is refused with SERVFAIL. Caddy retries
with a fresh load. Protects against the CLAUDE.md-documented rsync
workflow racing the plugin.

H2 — Git commit-failure policy. The previous code logged at WARN and
continued, breaking the documented "file + git both updated" contract.
Now logs at ERROR with structured fields (zone, path, error, recovery
command) so operators discover the divergence. We do NOT roll back the
file write: by the time the commit fails, the auto plugin may have
already noticed the new mtime and reloaded; rolling back creates more
races than it solves. Recovery is `git -C <dir> status` + manual
commit.

M1 — exec.CommandContext with 10s timeout on git invocations. If git
hangs (NFS stall, gpg-sign prompt, broken pre-commit hook waiting on
stdin), the per-zone mutex would otherwise be held forever and queue
all subsequent UPDATEs. gitCommandTimeout caps the hang.

M2 deferred. Dropping the separate `git add` cleanly requires either
`-a` (wrong scope: auto-stages all tracked modifications) or `--include`
(still needs prior staging). The race window between add and commit is
theoretical for our setup (single-writer plugin + occasional `git
status`). M1's timeout already mitigates the worst hang case.

New tests:
- TestZoneFile_CheckUnchanged_DetectsExternalModification (H1)
2026-05-22 21:22:11 -06:00

326 lines
11 KiB
Go

package rfc2136
import (
"fmt"
"path/filepath"
"strings"
"time"
"github.com/miekg/dns"
)
// handleUpdate implements the RFC 2136 UPDATE opcode against the
// on-disk zone file.
//
// The UPDATE sections are read from the dns.Msg fields they occupy on
// the wire: Zone from r.Question, Prerequisite from r.Answer, and
// Update from r.Ns.
//
// Sequence per UPDATE message:
// 1. Validate the Zone section (RFC 2136 §2.3): must be exactly one
// SOA-typed record whose name is a zone we manage.
// 2. Acquire the zone file's mutex.
// 3. Load the file's RRs into memory.
// 4. Check each prerequisite (§3.2) against the loaded RRs. First
// failure short-circuits with the spec's rcode.
// 5. Apply each update RR (§3.4.2) to the in-memory slice.
// 6. Bump the SOA serial (CalVer YYYYMMDDNN).
// 6b. Verify the file on disk is unchanged since step 3.
// 7. Atomic write to disk (temp file + rename).
// 8. Optionally `git add && git commit` for audit trail.
//
// Steps 3-8 run with the zone-file mutex held: the unlock is deferred,
// so the git commit in step 8 is inside the critical section too. If 8
// fails we log but don't roll back (the on-disk state is
// authoritative; lost commits can be re-staged via `git add` later).
//
// Every exit path goes through updateResp, so the returned error is
// always nil and the returned int is the rcode that was set on the
// response.
//
// SECURITY CONTRACT — the `verified` parameter:
//
// handleUpdate mutates zone files on disk. The caller MUST set
// verified=true only after successfully validating the message's TSIG
// signature against a configured key. ServeDNS does this. A
// verified=false invocation is treated as an unauthenticated attempt
// and refused — preserving the security boundary even if a future
// internal caller (NOTIFY relay, admin RPC, refactor) reaches this
// function without going through the wire-level TSIG check.
//
// Tests that exercise post-auth logic pass verified=true. Tests that
// exercise auth rejection pass verified=false.
//
// This is defense-in-depth: ServeDNS already verifies; we re-assert at
// the function boundary so the security property survives refactors.
func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool) (int, error) {
resp := new(dns.Msg)
resp.SetReply(r)
if verified {
// Only sign responses we authorize. Signing rejections leaks
// attestation that the named key exists on this server (see M9
// in the Hamilton review). Unauthorized callers get an
// unsigned Refused.
signResponseIfSigned(resp, r)
} else {
log.Warningf("handleUpdate refused: caller did not assert TSIG verification — possible internal bypass attempt")
return p.updateResp(w, resp, dns.RcodeRefused)
}
// 1. Validate the Zone section. For an UPDATE message the Zone
// section occupies the slot a query uses for Question.
if len(r.Question) != 1 {
log.Debugf("UPDATE rejected: expected 1 Zone record, got %d", len(r.Question))
return p.updateResp(w, resp, dns.RcodeFormatError)
}
zoneQ := r.Question[0]
if zoneQ.Qtype != dns.TypeSOA {
log.Debugf("UPDATE rejected: Zone section type=%d, want SOA", zoneQ.Qtype)
return p.updateResp(w, resp, dns.RcodeFormatError)
}
zone := p.findZone(zoneQ.Name)
if zone == "" {
log.Debugf("UPDATE rejected: zone %q not authoritative", zoneQ.Name)
return p.updateResp(w, resp, dns.RcodeNotAuth)
}
// A zone that findZone accepts but that has no entry in p.zones is
// reported as SERVFAIL and logged at ERROR rather than as NOTAUTH.
zf, ok := p.zones[zone]
if !ok {
log.Errorf("UPDATE rejected: no zone file handle for %q (setup bug?)", zone)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 2. Take the per-zone lock. The deferred unlock keeps it held for
// everything below, including the git commit in step 8.
zf.mu.Lock()
defer zf.mu.Unlock()
// 3. Load the current zone contents (with file-identity snapshot).
rrs, snap, err := zf.loadRRs()
if err != nil {
log.Errorf("UPDATE failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 4. Check prerequisites. They are evaluated against the zone as
// loaded, before any update RR is applied.
for _, rr := range r.Answer {
rcode := checkPrereq(zone, rrs, rr)
if rcode != dns.RcodeSuccess {
log.Debugf("UPDATE prereq failed: %s → rcode=%d", rr.String(), rcode)
return p.updateResp(w, resp, rcode)
}
}
// 5. Apply updates to a working slice. `updated` starts out as the
// same slice value as `rrs` — no copy is made here — so whether the
// loaded slice stays untouched depends on applyUpdate's helpers.
// That is harmless on failure: the early return below abandons both
// slices without writing anything to disk, so a partially applied
// UPDATE never reaches the file.
updated := rrs
changed := false
for _, rr := range r.Ns {
next, rcode, modified := applyUpdate(zone, p.TTL, updated, rr)
if rcode != dns.RcodeSuccess {
return p.updateResp(w, resp, rcode)
}
updated = next
if modified {
changed = true
}
}
if !changed {
// UPDATE was a valid no-op (e.g. only contained adds for RRs
// that were already present, deduped away). Return NOERROR
// without rewriting the file.
return p.updateResp(w, resp, dns.RcodeSuccess)
}
// 6. Bump SOA serial. The same `now` is passed to writeAtomic in
// step 7.
now := time.Now()
if err := bumpSerial(updated, now); err != nil {
log.Errorf("UPDATE failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 6b. Concurrent-modification check (Hamilton H1). Before clobbering
// the on-disk file, verify nothing changed it out from under us
// between loadRRs and now. The per-zone mutex serializes us against
// other in-process UPDATEs, but external editors (rsync push,
// manual edit, `git checkout`) can race in any time. If the file
// changed, refuse with SERVFAIL so Caddy retries on a fresh load.
if err := zf.checkUnchanged(snap); err != nil {
log.Warningf("UPDATE refused: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 7. Atomic write.
if err := zf.writeAtomic(updated, now); err != nil {
log.Errorf("UPDATE write failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 8. Auto-commit. Failure to commit means the file is correct but
// the git audit trail diverges (Hamilton H2). We log at ERROR with
// structured detail so operators discover the divergence; recovery
// is `git -C <zonesDir> status` + `git add` + manual commit. We do
// NOT roll back the file write — by the time the commit fails, the
// auto plugin may have already noticed the new mtime, and rolling
// back creates more races than it solves.
msg := summarizeUpdate(zone, r.Ns)
if err := zf.commit(msg); err != nil {
log.Errorf("git auto-commit failed; zone file is correct but audit trail diverged: zone=%s path=%s err=%v — recover with `git -C %s status` + manual commit",
zone, zf.Path, err, filepath.Dir(zf.Path))
}
// A failed commit does not change the reply: the file write already
// succeeded, so the client still gets NOERROR.
log.Infof("UPDATE applied: zone=%s prereqs=%d updates=%d msg=%q",
zone, len(r.Answer), len(r.Ns), msg)
return p.updateResp(w, resp, dns.RcodeSuccess)
}
// updateResp stamps rcode onto resp, sends resp through w, and returns
// (rcode, nil) so callers can use it directly as their return value
// for ServeDNS.
//
// The WriteMsg error is explicitly discarded: a failed send is neither
// logged nor reflected in the returned pair.
//
// NOTE(review): CoreDNS's plugin.ClientWrite treats SERVFAIL, REFUSED,
// FORMERR and NOTIMP return codes as "no reply written yet" — confirm
// that ServeDNS does not hand those rcodes straight back up the plugin
// chain after this function has already written a response.
func (p *RFC2136) updateResp(w dns.ResponseWriter, resp *dns.Msg, rcode int) (int, error) {
resp.Rcode = rcode
_ = w.WriteMsg(resp)
return rcode, nil
}
// findZone returns the longest configured zone that contains qname, or
// "" if qname is outside all configured zones.
//
// qname is canonicalized with canon before matching; the entries of
// p.Zones are compared as-is (presumably canonicalized at setup —
// confirm). A zone contains qname when qname equals the zone or ends
// in "."+zone.
//
// The root zone "." is special-cased: the generic suffix test would
// look for "..", which no name ends with, so a configured root zone
// could only ever match the name "." itself. Empty entries never
// match.
func (p *RFC2136) findZone(qname string) string {
	qname = canon(qname)
	var best string
	for _, z := range p.Zones {
		var match bool
		switch z {
		case "":
			// Not a zone; leave match false.
		case ".":
			// Every fully-qualified name lives under the root.
			match = strings.HasSuffix(qname, ".")
		default:
			match = qname == z || strings.HasSuffix(qname, "."+z)
		}
		// Longest match wins, so a more specific zone beats its parent.
		if match && len(z) > len(best) {
			best = z
		}
	}
	return best
}
// checkPrereq tests a single Prerequisite-section record against the
// loaded zone contents (RFC 2136 §3.2).
//
// It returns dns.RcodeSuccess when the prerequisite holds, otherwise
// the rcode for the failed condition:
//
//   - owner name outside zone            → NOTZONE
//   - CLASS=ANY,  TYPE=ANY, name absent  → NXDOMAIN
//   - CLASS=ANY,  TYPE=t,   RRset absent → NXRRSET
//   - CLASS=NONE, TYPE=ANY, name present → YXDOMAIN
//   - CLASS=NONE, TYPE=t,   RRset present → YXRRSET
//
// Any other class is logged at debug level and treated as satisfied;
// the rdata is not compared.
func checkPrereq(zone string, rrs []dns.RR, rr dns.RR) int {
	h := rr.Header()
	owner := canon(h.Name)
	if !inZone(owner, zone) {
		return dns.RcodeNotZone
	}

	// TYPE=ANY prerequisites are about the whole name; any other type
	// is about one RRset at that name.
	wholeName := h.Rrtype == dns.TypeANY

	switch {
	case h.Class == dns.ClassANY && wholeName:
		// "Name is in use."
		if nameExistsIn(rrs, owner) {
			return dns.RcodeSuccess
		}
		return dns.RcodeNameError

	case h.Class == dns.ClassANY:
		// "RRset exists (value independent)."
		if len(lookupIn(rrs, owner, h.Rrtype)) > 0 {
			return dns.RcodeSuccess
		}
		return dns.RcodeNXRrset

	case h.Class == dns.ClassNONE && wholeName:
		// "Name is not in use."
		if nameExistsIn(rrs, owner) {
			return dns.RcodeYXDomain
		}
		return dns.RcodeSuccess

	case h.Class == dns.ClassNONE:
		// "RRset does not exist."
		if len(lookupIn(rrs, owner, h.Rrtype)) != 0 {
			return dns.RcodeYXRrset
		}
		return dns.RcodeSuccess
	}

	// CLASS = zone class with rdata: exact value-match prerequisites
	// (§3.2.5). Not used by Caddy/caddy-dns/rfc2136, so they pass
	// unconditionally for now; v2 can implement value-prereq if a real
	// caller needs it.
	log.Debugf("prereq with rdata-match semantics not implemented; treating as satisfied")
	return dns.RcodeSuccess
}
// applyUpdate processes one Update-section record per RFC 2136 §3.4.2.
//
// The record's class selects the operation:
//
//   - CLASS=ANY,  TYPE=ANY: delete every RR at the name
//   - CLASS=ANY,  TYPE=t:   delete the RRset of type t at the name
//   - CLASS=NONE:           delete the one RR matching rr
//   - anything else:        add rr (a zero TTL is replaced by defaultTTL,
//     written into rr's own header)
//
// Apex protection: wiping the apex name, and deleting or adding an
// apex SOA or NS by any of the routes above, is answered with REFUSED.
// Names outside zone get NOTZONE.
//
// It returns the resulting slice, the rcode (Success unless rejected),
// and whether the slice was modified. On rejection the input slice is
// returned as-is with modified=false.
//
// NOTE(review): "modified" is derived purely from a change in slice
// length. An add that replaced an existing RR without changing the
// length would be reported as unmodified — confirm addRRTo never does
// that.
func applyUpdate(zone string, defaultTTL uint32, rrs []dns.RR, rr dns.RR) ([]dns.RR, int, bool) {
	h := rr.Header()
	owner := canon(h.Name)
	if !inZone(owner, zone) {
		return rrs, dns.RcodeNotZone, false
	}

	atApex := isApex(owner, zone)
	// SOA and NS at the apex belong to the zone-file owner, not to
	// dynamic updates.
	apexBedrock := atApex && (h.Rrtype == dns.TypeSOA || h.Rrtype == dns.TypeNS)
	sizeBefore := len(rrs)

	var result []dns.RR
	switch {
	case h.Class == dns.ClassANY && h.Rrtype == dns.TypeANY:
		// Whole-name wipe. At the apex that would destroy the SOA + NS
		// bedrock, so it is refused outright.
		if atApex {
			log.Debugf("apex wipe refused: %s", owner)
			return rrs, dns.RcodeRefused, false
		}
		result = removeNameFrom(rrs, owner)

	case h.Class == dns.ClassANY:
		// RRset delete.
		if apexBedrock {
			log.Debugf("apex %s removal refused", dns.TypeToString[h.Rrtype])
			return rrs, dns.RcodeRefused, false
		}
		result = removeRRsetFrom(rrs, owner, h.Rrtype)

	case h.Class == dns.ClassNONE:
		// Exact-RR delete.
		if apexBedrock {
			return rrs, dns.RcodeRefused, false
		}
		result = removeRRFrom(rrs, rr)

	default:
		// Add.
		if apexBedrock {
			log.Debugf("apex %s add refused", dns.TypeToString[h.Rrtype])
			return rrs, dns.RcodeRefused, false
		}
		if h.Ttl == 0 {
			h.Ttl = defaultTTL
		}
		result = addRRTo(rrs, rr)
	}

	return result, dns.RcodeSuccess, len(result) != sizeBefore
}
// summarizeUpdate builds the one-line git commit message for an
// UPDATE. A single-record update is described in detail via oneLineOp;
// any other count is reported as "<n> operations".
func summarizeUpdate(zone string, updates []dns.RR) string {
	if n := len(updates); n != 1 {
		return fmt.Sprintf("rfc2136 %s: %d operations", zone, n)
	}
	op := oneLineOp(updates[0])
	return fmt.Sprintf("rfc2136 %s: %s", zone, op)
}
// oneLineOp returns a short human-readable description of a single
// update RR for inclusion in commit messages.
//
// The wording follows the record's class:
//
//   - CLASS=ANY,  TYPE=ANY: "delete all <name>"
//   - CLASS=ANY,  TYPE=t:   "delete <type> <name>"
//   - CLASS=NONE:           "delete-rr <type> <name>"
//   - anything else:        "add <type> <name>"
//
// <name> is the canonical owner name without its trailing dot.
func oneLineOp(rr dns.RR) string {
	hdr := rr.Header()
	name := strings.TrimSuffix(canon(hdr.Name), ".")
	// Type.String falls back to "TYPE<n>" for RR types missing from
	// dns.TypeToString; a bare map lookup would yield "" and leave a
	// hole in the commit message ("add  host").
	ttype := dns.Type(hdr.Rrtype).String()
	switch hdr.Class {
	case dns.ClassANY:
		if hdr.Rrtype == dns.TypeANY {
			return fmt.Sprintf("delete all %s", name)
		}
		return fmt.Sprintf("delete %s %s", ttype, name)
	case dns.ClassNONE:
		return fmt.Sprintf("delete-rr %s %s", ttype, name)
	default:
		return fmt.Sprintf("add %s %s", ttype, name)
	}
}
// inZone reports whether name is within zone, i.e. name is the zone
// apex or a descendant of it. The comparison is a plain string match,
// so both arguments must already be in the same canonical form
// (fully-qualified, consistent case).
//
// Two zone values need care because the generic "."+zone suffix test
// gets them wrong:
//   - "" is not a zone. The suffix test would reduce to "ends in a
//     dot" and accept every fully-qualified name, so it is rejected.
//   - "." is the root. The suffix test would look for "..", which no
//     name ends with, so every fully-qualified name is accepted here.
func inZone(name, zone string) bool {
	switch zone {
	case "":
		return false
	case ".":
		return strings.HasSuffix(name, ".")
	}
	return name == zone || strings.HasSuffix(name, "."+zone)
}
// isApex reports whether name IS the zone's apex, i.e. the two strings
// are identical. No normalization is applied here, so callers must
// pass both in the same canonical form.
func isApex(name, zone string) bool {
return name == zone
}