coredns-rfc2136/update.go
Ryan Malloy 7367401734 Send DNS NOTIFY to secondaries after every UPDATE
Per RFC 1996, a master that mutates a zone SHOULD notify its
secondaries so they can immediately AXFR rather than wait for their
next SOA-refresh poll. Without this, propagation lag from UPDATE to
public DNS is bounded by the secondary's refresh interval (300s for
us) — which is borderline for ACME validation timing.

New Corefile directive:
    notify <host[:port]> [<host[:port]>...]

Targets accept bare hostnames (port 53 default), host:port, or
[ipv6]:port. The same list applies to every zone in the rfc2136
block.

Implementation: fire-and-forget UDP per target, each in its own
goroutine, capped by a 2s timeout. The UPDATE response to the client
is never held pending NOTIFY acks (RFC 1996 §4 explicitly decouples
them). Failures log at DEBUG only — a briefly-unreachable secondary
is normal and would otherwise spam logs.

Retires the external scripts/notify-secondaries.py workflow for any
deployment that wires the directive: secondaries now hear about
changes within seconds of the UPDATE landing, no cron or manual
invocation needed.

New tests:
- TestSendNotify_DeliversToTarget — packet arrives, opcode + zone correct
- TestSendNotify_NoTargets_NoCrash — empty list short-circuits
- TestSendNotify_BadTarget_LogsButDoesNotBlock — fire-and-forget timing
- TestNotifyOne_AppendsDefaultPort — host vs host:port normalization
2026-05-23 00:54:45 -06:00

389 lines
13 KiB
Go

package rfc2136
import (
"fmt"
"path/filepath"
"strings"
"time"
"github.com/miekg/dns"
)
// handleUpdate implements the RFC 2136 UPDATE opcode against the
// on-disk zone file.
//
// Parameters:
//   - w: the response writer the reply is sent on.
//   - r: the UPDATE message. The Zone section is read from r.Question,
//     the Prerequisite section from r.Answer and the Update section
//     from r.Ns.
//   - verified: see SECURITY CONTRACT below.
//
// Returns the rcode that was written to the client and a nil error;
// every exit path goes through updateResp.
//
// Sequence per UPDATE message:
// 0. Refuse with an unsigned REFUSED unless verified is true.
// 1. Validate the Zone section (RFC 2136 §2.3): must be exactly one
// SOA-typed record whose name is a zone we manage.
// 2. Acquire the zone file's mutex.
// 3. Load the file's RRs into memory, together with a file-identity
// snapshot.
// 4. Check each prerequisite (§3.2) against the loaded RRs. First
// failure short-circuits with the spec's rcode.
// 5. Apply each update RR (§3.4.2) to the in-memory slice. The first
// rejected update short-circuits with its rcode; if no update changed
// anything, reply NOERROR and stop here.
// 6. Bump the SOA serial (CalVer YYYYMMDDNN).
// 6b. Verify the file has not changed since the snapshot in step 3.
// 7. Atomic write to disk (temp file + rename).
// 8. Optionally `git add && git commit` for audit trail.
// 9. Dispatch NOTIFY to the configured targets.
//
// The mutex taken in step 2 is released by a deferred unlock, so it is
// held for everything that follows: steps 3-9 and the write of the
// response itself. If 8 fails we log but don't roll back (the on-disk
// state is authoritative; lost commits can be re-staged via `git add`
// later).
//
// SECURITY CONTRACT — the `verified` parameter:
//
// handleUpdate mutates zone files on disk. The caller MUST set
// verified=true only after successfully validating the message's TSIG
// signature against a configured key. ServeDNS does this. A
// verified=false invocation is treated as an unauthenticated attempt
// and refused — preserving the security boundary even if a future
// internal caller (NOTIFY relay, admin RPC, refactor) reaches this
// function without going through the wire-level TSIG check.
//
// Tests that exercise post-auth logic pass verified=true. Tests that
// exercise auth rejection pass verified=false.
//
// This is defense-in-depth: ServeDNS already verifies; we re-assert at
// the function boundary so the security property survives refactors.
func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool) (int, error) {
resp := new(dns.Msg)
resp.SetReply(r)
if verified {
// Only sign responses we authorize. Signing rejections leaks
// attestation that the named key exists on this server (see M9
// in the Hamilton review). Unauthorized callers get an
// unsigned Refused.
signResponseIfSigned(resp, r)
} else {
log.Warningf("handleUpdate refused: caller did not assert TSIG verification — possible internal bypass attempt")
return p.updateResp(w, resp, dns.RcodeRefused)
}
// 1. Validate the Zone section: exactly one entry, of type SOA,
// naming a zone this plugin is configured for.
if len(r.Question) != 1 {
log.Debugf("UPDATE rejected: expected 1 Zone record, got %d", len(r.Question))
return p.updateResp(w, resp, dns.RcodeFormatError)
}
zoneQ := r.Question[0]
if zoneQ.Qtype != dns.TypeSOA {
log.Debugf("UPDATE rejected: Zone section type=%d, want SOA", zoneQ.Qtype)
return p.updateResp(w, resp, dns.RcodeFormatError)
}
// findZone returns "" when the name is outside every configured zone.
zone := p.findZone(zoneQ.Name)
if zone == "" {
log.Debugf("UPDATE rejected: zone %q not authoritative", zoneQ.Name)
return p.updateResp(w, resp, dns.RcodeNotAuth)
}
// A configured zone without a file handle is an internal
// inconsistency, hence SERVFAIL rather than NOTAUTH.
zf, ok := p.zones[zone]
if !ok {
log.Errorf("UPDATE rejected: no zone file handle for %q (setup bug?)", zone)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 2. Acquire the zone file's mutex. The unlock is deferred, so the
// lock is held until this function returns.
zf.mu.Lock()
defer zf.mu.Unlock()
// 3. Load the current zone contents (with file-identity snapshot).
rrs, snap, err := zf.loadRRs()
if err != nil {
log.Errorf("UPDATE failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 4. Check prerequisites (carried in r.Answer). The first one that
// is not satisfied ends the request with its rcode.
for _, rr := range r.Answer {
rcode := checkPrereq(zone, rrs, rr)
if rcode != dns.RcodeSuccess {
log.Debugf("UPDATE prereq failed: %s → rcode=%d", rr.String(), rcode)
return p.updateResp(w, resp, rcode)
}
}
// 5. Apply updates (carried in r.Ns) one at a time, threading the
// slice returned by applyUpdate into the next call. A rejected
// update returns before anything is written to disk, so a partial
// application never reaches the zone file.
//
// NOTE(review): `updated := rrs` copies only the slice header, so
// `updated` is independent of `rrs` only if applyUpdate's helpers
// return new slices rather than editing the backing array — confirm.
// Nothing below reads `rrs` again, so this is harmless either way.
updated := rrs
changed := false
for _, rr := range r.Ns {
next, rcode, modified := applyUpdate(zone, p.TTL, updated, rr)
if rcode != dns.RcodeSuccess {
return p.updateResp(w, resp, rcode)
}
updated = next
if modified {
changed = true
}
}
if !changed {
// UPDATE was a valid no-op (e.g. only contained adds for RRs
// that were already present, deduped away per RFC 2136
// §3.4.2.2). Return NOERROR without rewriting the file or
// bumping the SOA serial.
//
// H7 — Policy decision documented:
//
// We DO NOT bump the SOA serial on no-op UPDATEs. Rationale:
// - DNS-wise, nothing changed. Forcing downstream secondaries
// (HE) to do an AXFR pull just to re-fetch identical content
// wastes bandwidth and is not what RFC 2136 implies.
// - The wire-visible cert-issuance chain for ACME does not
// depend on the second-UPDATE's serial bump — once the first
// UPDATE landed, the SOA already advanced and the auto plugin
// reloaded; subsequent identical UPDATEs are spurious and
// should be silent.
// - Caddy's caddy-dns/rfc2136 client treats NOERROR-no-bump as
// "yes I have your record" — which is the truthful answer.
//
// If a caller wants to force a serial bump for some reason, they
// can send a touch-UPDATE that adds-then-deletes a throwaway
// record. That's an explicit, intentional pattern and is
// supported.
return p.updateResp(w, resp, dns.RcodeSuccess)
}
// 6. Bump SOA serial. The same timestamp is reused for the write in
// step 7.
now := time.Now()
if err := bumpSerial(updated, now); err != nil {
log.Errorf("UPDATE failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 6b. Concurrent-modification check (Hamilton H1). Before clobbering
// the on-disk file, verify nothing changed it out from under us
// between loadRRs and now. The per-zone mutex serializes us against
// other in-process UPDATEs, but external editors (rsync push,
// manual edit, `git checkout`) can race in any time. If the file
// changed, refuse with SERVFAIL so Caddy retries on a fresh load.
if err := zf.checkUnchanged(snap); err != nil {
log.Warningf("UPDATE refused: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 7. Atomic write.
if err := zf.writeAtomic(updated, now); err != nil {
log.Errorf("UPDATE write failed: %v", err)
return p.updateResp(w, resp, dns.RcodeServerFailure)
}
// 8. Auto-commit. Failure to commit means the file is correct but
// the git audit trail diverges (Hamilton H2). We log at ERROR with
// structured detail so operators discover the divergence; recovery
// is `git -C <zonesDir> status` + `git add` + manual commit. We do
// NOT roll back the file write — by the time the commit fails, the
// auto plugin may have already noticed the new mtime, and rolling
// back creates more races than it solves. A commit failure therefore
// does not change the rcode: the client still gets NOERROR.
msg := summarizeUpdate(zone, r.Ns)
if err := zf.commit(msg); err != nil {
log.Errorf("git auto-commit failed; zone file is correct but audit trail diverged: zone=%s path=%s err=%v — recover with `git -C %s status` + manual commit",
zone, zf.Path, err, filepath.Dir(zf.Path))
}
log.Infof("UPDATE applied: zone=%s prereqs=%d updates=%d msg=%q",
zone, len(r.Answer), len(r.Ns), msg)
// 9. Fire NOTIFY to configured secondaries (RFC 1996). Non-blocking:
// each target gets its own goroutine, capped by notifyTimeout. The
// UPDATE response to the client is not held on these acks — RFC
// 1996 §4 explicitly decouples them.
sendNotify(zone, p.NotifyTargets)
return p.updateResp(w, resp, dns.RcodeSuccess)
}
// updateResp stamps rcode onto resp, writes resp to w, and returns the
// rcode/err pair for ServeDNS. The returned error is always nil, so the
// rcode handed back is the one that was put on the wire.
func (p *RFC2136) updateResp(w dns.ResponseWriter, resp *dns.Msg, rcode int) (int, error) {
resp.Rcode = rcode
// NOTE(review): the WriteMsg error is discarded, so a reply that
// failed to send is invisible to both the caller and the logs —
// confirm this best-effort behaviour is intended.
_ = w.WriteMsg(resp)
return rcode, nil
}
// findZone picks, among the configured zones, the longest one that
// contains qname (qname is canonicalized first). A zone contains a name
// when the two are equal or the name ends in "."+zone. The result is ""
// when no configured zone matches.
func (p *RFC2136) findZone(qname string) string {
	name := canon(qname)
	longest := ""
	for _, candidate := range p.Zones {
		// A candidate that is not strictly longer can never replace
		// the current winner, so skip the match test for it.
		if len(candidate) <= len(longest) {
			continue
		}
		if name != candidate && !strings.HasSuffix(name, "."+candidate) {
			continue
		}
		longest = candidate
	}
	return longest
}
// checkPrereq tests a single Prerequisite-section record against the
// zone contents in rrs and reports the rcode to answer with (§3.2):
//
//   - owner name outside zone            → NOTZONE
//   - CLASS ANY,  TYPE ANY  ("name in use")       → NXDOMAIN if the name has no RRs
//   - CLASS ANY,  other type ("RRset exists")     → NXRRSET if that RRset is empty
//   - CLASS NONE, TYPE ANY  ("name not in use")   → YXDOMAIN if the name has RRs
//   - CLASS NONE, other type ("RRset absent")     → YXRRSET if that RRset exists
//   - any other class (value-dependent match)     → not implemented, always satisfied
//
// dns.RcodeSuccess means the prerequisite holds.
func checkPrereq(zone string, rrs []dns.RR, rr dns.RR) int {
	h := rr.Header()
	owner := canon(h.Name)
	if !inZone(owner, zone) {
		return dns.RcodeNotZone
	}

	anyType := h.Rrtype == dns.TypeANY
	switch {
	case h.Class == dns.ClassANY && anyType:
		if nameExistsIn(rrs, owner) {
			return dns.RcodeSuccess
		}
		return dns.RcodeNameError

	case h.Class == dns.ClassANY:
		if len(lookupIn(rrs, owner, h.Rrtype)) > 0 {
			return dns.RcodeSuccess
		}
		return dns.RcodeNXRrset

	case h.Class == dns.ClassNONE && anyType:
		if !nameExistsIn(rrs, owner) {
			return dns.RcodeSuccess
		}
		return dns.RcodeYXDomain

	case h.Class == dns.ClassNONE:
		if len(lookupIn(rrs, owner, h.Rrtype)) == 0 {
			return dns.RcodeSuccess
		}
		return dns.RcodeYXRrset

	default:
		// Zone-class prerequisite carrying rdata: an exact value match
		// (§3.2.5). Caddy/caddy-dns/rfc2136 never sends these, so they
		// are accepted unchecked for now; a v2 can implement the
		// comparison if a real caller needs it.
		log.Debugf("prereq with rdata-match semantics not implemented; treating as satisfied")
		return dns.RcodeSuccess
	}
}
// applyUpdate applies one Update-section record to rrs per §3.4.2.
//
// The record's class selects the operation:
//   - CLASS ANY,  TYPE ANY   → delete every RR at the name
//   - CLASS ANY,  other type → delete that RRset
//   - CLASS NONE             → delete the one matching RR
//   - anything else          → add the RR (TTL 0 is replaced by defaultTTL)
//
// Guard rails: names outside zone yield NOTZONE; wiping the apex, or
// adding/removing SOA or NS at the apex, yields REFUSED. On any
// rejection rrs is returned untouched with modified=false.
//
// Returns the resulting slice, the rcode, and whether the slice's
// length changed (used by the caller to skip no-op file rewrites).
//
// NOTE(review): "modified" is derived purely from a length change. If
// addRRTo can replace an existing RR in place, such an update would be
// reported as unmodified — confirm against addRRTo.
func applyUpdate(zone string, defaultTTL uint32, rrs []dns.RR, rr dns.RR) ([]dns.RR, int, bool) {
	h := rr.Header()
	owner := canon(h.Name)
	if !inZone(owner, zone) {
		return rrs, dns.RcodeNotZone, false
	}

	atApex := isApex(owner, zone)
	// SOA and NS at the apex are the zone's bedrock and belong to the
	// zone-file owner, not to dynamic updates.
	bedrock := atApex && (h.Rrtype == dns.TypeSOA || h.Rrtype == dns.TypeNS)
	sizeBefore := len(rrs)

	var result []dns.RR
	switch {
	case h.Class == dns.ClassANY && h.Rrtype == dns.TypeANY:
		// Whole-name wipe; at the apex this would take SOA + NS with it.
		if atApex {
			log.Debugf("apex wipe refused: %s", owner)
			return rrs, dns.RcodeRefused, false
		}
		result = removeNameFrom(rrs, owner)

	case h.Class == dns.ClassANY:
		if bedrock {
			log.Debugf("apex %s removal refused", dns.TypeToString[h.Rrtype])
			return rrs, dns.RcodeRefused, false
		}
		result = removeRRsetFrom(rrs, owner, h.Rrtype)

	case h.Class == dns.ClassNONE:
		// Exact-RR delete; apex SOA/NS stay protected here too.
		if bedrock {
			return rrs, dns.RcodeRefused, false
		}
		result = removeRRFrom(rrs, rr)

	default:
		if bedrock {
			log.Debugf("apex %s add refused", dns.TypeToString[h.Rrtype])
			return rrs, dns.RcodeRefused, false
		}
		// Hamilton L2: the RR still belongs to whoever parsed the
		// UPDATE message, so never rewrite its header in place. When a
		// default TTL has to be filled in, do it on a private copy.
		toAdd := rr
		if h.Ttl == 0 {
			toAdd = dns.Copy(rr)
			toAdd.Header().Ttl = defaultTTL
		}
		result = addRRTo(rrs, toAdd)
	}

	return result, dns.RcodeSuccess, len(result) != sizeBefore
}
// summarizeUpdate builds the one-line git commit message for an UPDATE:
// a description of the operation when there is exactly one update RR,
// otherwise just the operation count.
//
// The result always passes through sanitizeForCommitMessage (Hamilton
// M7). TSIG only authenticates the sender; RR names in the payload are
// still attacker-controlled and must not be able to inject control
// characters into git log, log aggregators, or any downstream renderer
// that interprets ANSI/newlines.
func summarizeUpdate(zone string, updates []dns.RR) string {
	if len(updates) != 1 {
		return sanitizeForCommitMessage(fmt.Sprintf("rfc2136 %s: %d operations", zone, len(updates)))
	}
	return sanitizeForCommitMessage(fmt.Sprintf("rfc2136 %s: %s", zone, oneLineOp(updates[0])))
}
// sanitizeForCommitMessage returns s with every control character
// replaced by a printable escape, so attacker-influenced text cannot
// smuggle line breaks or terminal escape sequences into git log or any
// downstream renderer.
//
// Escapes emitted:
//   - `\n`, `\r`, `\t` for newline, carriage return and tab;
//   - `\xNN` for the remaining C0 controls, DEL (0x7f) and the C1
//     controls U+0080–U+009F. The C1 range matters: U+009B is the
//     single-code-point CSI that some terminals honour exactly like
//     ESC '[', and U+0085 (NEL) is a line break;
//   - `\u2028` / `\u2029` for the Unicode line and paragraph
//     separators, which some renderers treat as line breaks.
//
// Every other rune is copied through unchanged. Backslashes themselves
// are not escaped, so the output is meant for display, not for
// round-tripping back to the original string.
func sanitizeForCommitMessage(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for _, r := range s {
		switch {
		case r == '\n':
			b.WriteString("\\n")
		case r == '\r':
			b.WriteString("\\r")
		case r == '\t':
			b.WriteString("\\t")
		case r < 0x20, r >= 0x7f && r <= 0x9f:
			// Other C0 controls, DEL, and the whole C1 block: emit \xNN.
			fmt.Fprintf(&b, "\\x%02x", r)
		case r == 0x2028, r == 0x2029:
			// Unicode LINE SEPARATOR / PARAGRAPH SEPARATOR.
			fmt.Fprintf(&b, "\\u%04x", r)
		default:
			b.WriteRune(r)
		}
	}
	return b.String()
}
// oneLineOp renders a single update RR as a short phrase for commit
// messages. The owner name is canonicalized and shown without its
// trailing dot; the verb is chosen from the RR's class and type:
//
//   - CLASS NONE             → "delete-rr TYPE name"
//   - CLASS ANY,  TYPE ANY   → "delete all name"
//   - CLASS ANY,  other type → "delete TYPE name"
//   - any other class        → "add TYPE name"
func oneLineOp(rr dns.RR) string {
	h := rr.Header()
	owner := strings.TrimSuffix(canon(h.Name), ".")
	typeName := dns.TypeToString[h.Rrtype]

	if h.Class == dns.ClassNONE {
		return fmt.Sprintf("delete-rr %s %s", typeName, owner)
	}
	if h.Class != dns.ClassANY {
		return fmt.Sprintf("add %s %s", typeName, owner)
	}
	if h.Rrtype == dns.TypeANY {
		return fmt.Sprintf("delete all %s", owner)
	}
	return fmt.Sprintf("delete %s %s", typeName, owner)
}
// inZone reports whether name lies within zone: either it is the zone
// name itself, or it ends with "." followed by the zone name. The
// comparison is a plain string match, so both arguments must already be
// in the same canonical form.
func inZone(name, zone string) bool {
	if name == zone {
		return true
	}
	return strings.HasSuffix(name, "."+zone)
}
// isApex reports whether name IS the zone's apex, i.e. the two strings
// are identical. The comparison is exact (no case folding or
// trailing-dot normalization), so both arguments must already be in the
// same form. applyUpdate passes a name that has been through canon();
// zone is presumably canonical as well — TODO confirm against how the
// zone list is populated.
func isApex(name, zone string) bool {
return name == zone
}