Per RFC 1996, a master that mutates a zone SHOULD notify its
secondaries so they can immediately AXFR rather than wait for their
next SOA-refresh poll. Without this, propagation lag from UPDATE to
public DNS is bounded by the secondary's refresh interval (300s for
us) — which is borderline for ACME validation timing.
New Corefile directive:
notify <host[:port]> [<host[:port]>...]
Targets accept bare hostnames (port 53 default), host:port, or
[ipv6]:port. The same list applies to every zone in the rfc2136
block.
Implementation: fire-and-forget UDP per target, each in its own
goroutine, capped by a 2s timeout. The UPDATE response to the client
is never held pending NOTIFY acks (RFC 1996 §4 explicitly decouples
them). Failures log at DEBUG only — a briefly-unreachable secondary
is normal and would otherwise spam logs.
Retires the external scripts/notify-secondaries.py workflow for any
deployment that wires the directive: secondaries now hear about
changes within seconds of the UPDATE landing, no cron or manual
invocation needed.
New tests:
- TestSendNotify_DeliversToTarget — packet arrives, opcode + zone correct
- TestSendNotify_NoTargets_NoCrash — empty list short-circuits
- TestSendNotify_BadTarget_LogsButDoesNotBlock — fire-and-forget timing
- TestNotifyOne_AppendsDefaultPort — host vs host:port normalization
389 lines
13 KiB
Go
389 lines
13 KiB
Go
package rfc2136
|
|
|
|
import (
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/miekg/dns"
|
|
)
|
|
|
|
// handleUpdate implements the RFC 2136 UPDATE opcode against the
|
|
// on-disk zone file.
|
|
//
|
|
// Sequence per UPDATE message:
|
|
// 1. Validate the Zone section (RFC 2136 §2.3): must be exactly one
|
|
// SOA-typed record whose name is a zone we manage.
|
|
// 2. Acquire the zone file's mutex.
|
|
// 3. Load the file's RRs into memory.
|
|
// 4. Check each prerequisite (§3.2) against the loaded RRs. First
|
|
// failure short-circuits with the spec's rcode.
|
|
// 5. Apply each update RR (§3.4.2) to the in-memory slice.
|
|
// 6. Bump the SOA serial (CalVer YYYYMMDDNN).
|
|
// 7. Atomic write to disk (temp file + rename).
|
|
// 8. Optionally `git add && git commit` for audit trail.
|
|
//
|
|
// Steps 3-7 happen under the zone-file mutex. If 8 fails we log but
|
|
// don't roll back (the on-disk state is authoritative; lost commits
|
|
// can be re-staged via `git add` later).
|
|
//
|
|
// SECURITY CONTRACT — the `verified` parameter:
|
|
//
|
|
// handleUpdate mutates zone files on disk. The caller MUST set
|
|
// verified=true only after successfully validating the message's TSIG
|
|
// signature against a configured key. ServeDNS does this. A
|
|
// verified=false invocation is treated as an unauthenticated attempt
|
|
// and refused — preserving the security boundary even if a future
|
|
// internal caller (NOTIFY relay, admin RPC, refactor) reaches this
|
|
// function without going through the wire-level TSIG check.
|
|
//
|
|
// Tests that exercise post-auth logic pass verified=true. Tests that
|
|
// exercise auth rejection pass verified=false.
|
|
//
|
|
// This is defense-in-depth: ServeDNS already verifies; we re-assert at
|
|
// the function boundary so the security property survives refactors.
|
|
func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool) (int, error) {
|
|
resp := new(dns.Msg)
|
|
resp.SetReply(r)
|
|
if verified {
|
|
// Only sign responses we authorize. Signing rejections leaks
|
|
// attestation that the named key exists on this server (see M9
|
|
// in the Hamilton review). Unauthorized callers get an
|
|
// unsigned Refused.
|
|
signResponseIfSigned(resp, r)
|
|
} else {
|
|
log.Warningf("handleUpdate refused: caller did not assert TSIG verification — possible internal bypass attempt")
|
|
return p.updateResp(w, resp, dns.RcodeRefused)
|
|
}
|
|
|
|
// 1. Validate the Zone section.
|
|
if len(r.Question) != 1 {
|
|
log.Debugf("UPDATE rejected: expected 1 Zone record, got %d", len(r.Question))
|
|
return p.updateResp(w, resp, dns.RcodeFormatError)
|
|
}
|
|
zoneQ := r.Question[0]
|
|
if zoneQ.Qtype != dns.TypeSOA {
|
|
log.Debugf("UPDATE rejected: Zone section type=%d, want SOA", zoneQ.Qtype)
|
|
return p.updateResp(w, resp, dns.RcodeFormatError)
|
|
}
|
|
zone := p.findZone(zoneQ.Name)
|
|
if zone == "" {
|
|
log.Debugf("UPDATE rejected: zone %q not authoritative", zoneQ.Name)
|
|
return p.updateResp(w, resp, dns.RcodeNotAuth)
|
|
}
|
|
zf, ok := p.zones[zone]
|
|
if !ok {
|
|
log.Errorf("UPDATE rejected: no zone file handle for %q (setup bug?)", zone)
|
|
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
|
}
|
|
|
|
zf.mu.Lock()
|
|
defer zf.mu.Unlock()
|
|
|
|
// 3. Load the current zone contents (with file-identity snapshot).
|
|
rrs, snap, err := zf.loadRRs()
|
|
if err != nil {
|
|
log.Errorf("UPDATE failed: %v", err)
|
|
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
|
}
|
|
|
|
// 4. Check prerequisites.
|
|
for _, rr := range r.Answer {
|
|
rcode := checkPrereq(zone, rrs, rr)
|
|
if rcode != dns.RcodeSuccess {
|
|
log.Debugf("UPDATE prereq failed: %s → rcode=%d", rr.String(), rcode)
|
|
return p.updateResp(w, resp, rcode)
|
|
}
|
|
}
|
|
|
|
// 5. Apply updates. Build a fresh RR slice rather than mutating in
|
|
// place — that way a partial application can't leave the slice in
|
|
// a half-modified state if an early update fails.
|
|
updated := rrs
|
|
changed := false
|
|
for _, rr := range r.Ns {
|
|
next, rcode, modified := applyUpdate(zone, p.TTL, updated, rr)
|
|
if rcode != dns.RcodeSuccess {
|
|
return p.updateResp(w, resp, rcode)
|
|
}
|
|
updated = next
|
|
if modified {
|
|
changed = true
|
|
}
|
|
}
|
|
|
|
if !changed {
|
|
// UPDATE was a valid no-op (e.g. only contained adds for RRs
|
|
// that were already present, deduped away per RFC 2136
|
|
// §3.4.2.2). Return NOERROR without rewriting the file or
|
|
// bumping the SOA serial.
|
|
//
|
|
// H7 — Policy decision documented:
|
|
//
|
|
// We DO NOT bump the SOA serial on no-op UPDATEs. Rationale:
|
|
// - DNS-wise, nothing changed. Forcing downstream secondaries
|
|
// (HE) to do an AXFR pull just to re-fetch identical content
|
|
// wastes bandwidth and is not what RFC 2136 implies.
|
|
// - The wire-visible cert-issuance chain for ACME does not
|
|
// depend on the second-UPDATE's serial bump — once the first
|
|
// UPDATE landed, the SOA already advanced and the auto plugin
|
|
// reloaded; subsequent identical UPDATEs are spurious and
|
|
// should be silent.
|
|
// - Caddy's caddy-dns/rfc2136 client treats NOERROR-no-bump as
|
|
// "yes I have your record" — which is the truthful answer.
|
|
//
|
|
// If a caller wants to force a serial bump for some reason, they
|
|
// can send a touch-UPDATE that adds-then-deletes a throwaway
|
|
// record. That's an explicit, intentional pattern and is
|
|
// supported.
|
|
return p.updateResp(w, resp, dns.RcodeSuccess)
|
|
}
|
|
|
|
// 6. Bump SOA serial.
|
|
now := time.Now()
|
|
if err := bumpSerial(updated, now); err != nil {
|
|
log.Errorf("UPDATE failed: %v", err)
|
|
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
|
}
|
|
|
|
// 6b. Concurrent-modification check (Hamilton H1). Before clobbering
|
|
// the on-disk file, verify nothing changed it out from under us
|
|
// between loadRRs and now. The per-zone mutex serializes us against
|
|
// other in-process UPDATEs, but external editors (rsync push,
|
|
// manual edit, `git checkout`) can race in any time. If the file
|
|
// changed, refuse with SERVFAIL so Caddy retries on a fresh load.
|
|
if err := zf.checkUnchanged(snap); err != nil {
|
|
log.Warningf("UPDATE refused: %v", err)
|
|
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
|
}
|
|
|
|
// 7. Atomic write.
|
|
if err := zf.writeAtomic(updated, now); err != nil {
|
|
log.Errorf("UPDATE write failed: %v", err)
|
|
return p.updateResp(w, resp, dns.RcodeServerFailure)
|
|
}
|
|
|
|
// 8. Auto-commit. Failure to commit means the file is correct but
|
|
// the git audit trail diverges (Hamilton H2). We log at ERROR with
|
|
// structured detail so operators discover the divergence; recovery
|
|
// is `git -C <zonesDir> status` + `git add` + manual commit. We do
|
|
// NOT roll back the file write — by the time the commit fails, the
|
|
// auto plugin may have already noticed the new mtime, and rolling
|
|
// back creates more races than it solves.
|
|
msg := summarizeUpdate(zone, r.Ns)
|
|
if err := zf.commit(msg); err != nil {
|
|
log.Errorf("git auto-commit failed; zone file is correct but audit trail diverged: zone=%s path=%s err=%v — recover with `git -C %s status` + manual commit",
|
|
zone, zf.Path, err, filepath.Dir(zf.Path))
|
|
}
|
|
|
|
log.Infof("UPDATE applied: zone=%s prereqs=%d updates=%d msg=%q",
|
|
zone, len(r.Answer), len(r.Ns), msg)
|
|
|
|
// Fire NOTIFY to configured secondaries (RFC 1996). Non-blocking:
|
|
// each target gets its own goroutine, capped by notifyTimeout. The
|
|
// UPDATE response to the client is not held on these acks — RFC
|
|
// 1996 §4 explicitly decouples them.
|
|
sendNotify(zone, p.NotifyTargets)
|
|
|
|
return p.updateResp(w, resp, dns.RcodeSuccess)
|
|
}
|
|
|
|
// updateResp writes the response and returns the rcode/err pair for ServeDNS.
|
|
func (p *RFC2136) updateResp(w dns.ResponseWriter, resp *dns.Msg, rcode int) (int, error) {
|
|
resp.Rcode = rcode
|
|
_ = w.WriteMsg(resp)
|
|
return rcode, nil
|
|
}
|
|
|
|
// findZone returns the longest matching configured zone for qname, or
|
|
// "" if qname is outside all configured zones.
|
|
func (p *RFC2136) findZone(qname string) string {
|
|
qname = canon(qname)
|
|
var best string
|
|
for _, z := range p.Zones {
|
|
if qname == z || strings.HasSuffix(qname, "."+z) {
|
|
if len(z) > len(best) {
|
|
best = z
|
|
}
|
|
}
|
|
}
|
|
return best
|
|
}
|
|
|
|
// checkPrereq evaluates one record from the Prerequisite section
|
|
// against the loaded RR slice. Returns dns.RcodeSuccess if satisfied,
|
|
// or the spec rcode otherwise (§3.2).
|
|
func checkPrereq(zone string, rrs []dns.RR, rr dns.RR) int {
|
|
hdr := rr.Header()
|
|
name := canon(hdr.Name)
|
|
if !inZone(name, zone) {
|
|
return dns.RcodeNotZone
|
|
}
|
|
switch hdr.Class {
|
|
case dns.ClassANY:
|
|
if hdr.Rrtype == dns.TypeANY {
|
|
if !nameExistsIn(rrs, name) {
|
|
return dns.RcodeNameError
|
|
}
|
|
return dns.RcodeSuccess
|
|
}
|
|
if len(lookupIn(rrs, name, hdr.Rrtype)) == 0 {
|
|
return dns.RcodeNXRrset
|
|
}
|
|
return dns.RcodeSuccess
|
|
|
|
case dns.ClassNONE:
|
|
if hdr.Rrtype == dns.TypeANY {
|
|
if nameExistsIn(rrs, name) {
|
|
return dns.RcodeYXDomain
|
|
}
|
|
return dns.RcodeSuccess
|
|
}
|
|
if len(lookupIn(rrs, name, hdr.Rrtype)) > 0 {
|
|
return dns.RcodeYXRrset
|
|
}
|
|
return dns.RcodeSuccess
|
|
|
|
default:
|
|
// CLASS = zone class with rdata. Exact value-match prereqs
|
|
// (§3.2.5). Not used by Caddy/caddy-dns/rfc2136; treating as
|
|
// satisfied for now. v2 can implement value-prereq if a real
|
|
// caller needs it.
|
|
log.Debugf("prereq with rdata-match semantics not implemented; treating as satisfied")
|
|
return dns.RcodeSuccess
|
|
}
|
|
}
|
|
|
|
// applyUpdate handles one record in the Update section per §3.4.2.
|
|
// Returns the (possibly mutated) RR slice, an rcode (Success unless
|
|
// the update was rejected), and a flag indicating whether the slice
|
|
// was actually modified (to avoid no-op file rewrites).
|
|
func applyUpdate(zone string, defaultTTL uint32, rrs []dns.RR, rr dns.RR) ([]dns.RR, int, bool) {
|
|
hdr := rr.Header()
|
|
name := canon(hdr.Name)
|
|
if !inZone(name, zone) {
|
|
return rrs, dns.RcodeNotZone, false
|
|
}
|
|
|
|
switch hdr.Class {
|
|
case dns.ClassANY:
|
|
if hdr.Rrtype == dns.TypeANY {
|
|
// Wipe the whole name. Refuse apex wipes — that would
|
|
// destroy SOA + NS bedrock.
|
|
if isApex(name, zone) {
|
|
log.Debugf("apex wipe refused: %s", name)
|
|
return rrs, dns.RcodeRefused, false
|
|
}
|
|
before := len(rrs)
|
|
rrs = removeNameFrom(rrs, name)
|
|
return rrs, dns.RcodeSuccess, len(rrs) != before
|
|
}
|
|
// Apex SOA/NS removal refused for the same reason.
|
|
if isApex(name, zone) && (hdr.Rrtype == dns.TypeSOA || hdr.Rrtype == dns.TypeNS) {
|
|
log.Debugf("apex %s removal refused", dns.TypeToString[hdr.Rrtype])
|
|
return rrs, dns.RcodeRefused, false
|
|
}
|
|
before := len(rrs)
|
|
rrs = removeRRsetFrom(rrs, name, hdr.Rrtype)
|
|
return rrs, dns.RcodeSuccess, len(rrs) != before
|
|
|
|
case dns.ClassNONE:
|
|
// Refuse to delete apex SOA/NS by exact-RR match.
|
|
if isApex(name, zone) && (hdr.Rrtype == dns.TypeSOA || hdr.Rrtype == dns.TypeNS) {
|
|
return rrs, dns.RcodeRefused, false
|
|
}
|
|
before := len(rrs)
|
|
rrs = removeRRFrom(rrs, rr)
|
|
return rrs, dns.RcodeSuccess, len(rrs) != before
|
|
|
|
default:
|
|
// Apex SOA/NS adds refused — those are managed by the zone-file
|
|
// owner, not by dynamic updates.
|
|
if isApex(name, zone) && (hdr.Rrtype == dns.TypeSOA || hdr.Rrtype == dns.TypeNS) {
|
|
log.Debugf("apex %s add refused", dns.TypeToString[hdr.Rrtype])
|
|
return rrs, dns.RcodeRefused, false
|
|
}
|
|
// Hamilton L2: don't mutate the caller's RR header. miekg/dns
|
|
// parses the UPDATE message into RRs the caller still owns;
|
|
// silently rewriting their TTL is a hygiene smell. Copy first,
|
|
// then apply our default if needed.
|
|
if hdr.Ttl == 0 {
|
|
rr = dns.Copy(rr)
|
|
rr.Header().Ttl = defaultTTL
|
|
}
|
|
before := len(rrs)
|
|
rrs = addRRTo(rrs, rr)
|
|
return rrs, dns.RcodeSuccess, len(rrs) != before
|
|
}
|
|
}
|
|
|
|
// summarizeUpdate produces a one-line commit message describing the
|
|
// UPDATE for git history. The output is sanitized — Hamilton M7 — to
|
|
// prevent attacker-controlled RR names (TSIG just authenticates the
|
|
// sender; the payload is still attacker-controlled) from injecting
|
|
// control characters into git log, log aggregators, or any downstream
|
|
// renderer that interprets ANSI/newlines.
|
|
func summarizeUpdate(zone string, updates []dns.RR) string {
|
|
var msg string
|
|
if len(updates) == 1 {
|
|
msg = fmt.Sprintf("rfc2136 %s: %s", zone, oneLineOp(updates[0]))
|
|
} else {
|
|
msg = fmt.Sprintf("rfc2136 %s: %d operations", zone, len(updates))
|
|
}
|
|
return sanitizeForCommitMessage(msg)
|
|
}
|
|
|
|
// sanitizeForCommitMessage replaces control characters in s with a
// printable escape form, so attacker-influenced text cannot inject
// newlines or terminal escape sequences into git log or downstream
// renderers.
//
// Escaping rules:
//   - newline, carriage return, tab → the two-character sequences
//     \n, \r, \t
//   - other C0 controls (U+0000–U+001F), DEL (U+007F), and the C1
//     controls (U+0080–U+009F) → \xNN. C1 is included because it
//     contains single-character forms of terminal controls such as
//     CSI (U+009B), which some terminals honour just like ESC '['.
//   - the Unicode line and paragraph separators (U+2028, U+2029)
//     → \u2028, \u2029, since some renderers break lines on them
//
// Every other rune is copied through unchanged. Invalid UTF-8 bytes
// come out as U+FFFD, as with any range over a string. Backslashes
// already present in s are not escaped, so the output is meant for
// display, not for round-tripping.
func sanitizeForCommitMessage(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for _, r := range s {
		switch {
		case r == '\n':
			b.WriteString("\\n")
		case r == '\r':
			b.WriteString("\\r")
		case r == '\t':
			b.WriteString("\\t")
		case r < 0x20 || (r >= 0x7f && r <= 0x9f):
			// Remaining C0 controls, DEL, and the C1 block.
			fmt.Fprintf(&b, "\\x%02x", r)
		case r == '\u2028' || r == '\u2029':
			fmt.Fprintf(&b, "\\u%04x", r)
		default:
			b.WriteRune(r)
		}
	}
	return b.String()
}
|
|
|
|
// oneLineOp returns a short human-readable description of a single
|
|
// update RR for inclusion in commit messages.
|
|
func oneLineOp(rr dns.RR) string {
|
|
hdr := rr.Header()
|
|
name := strings.TrimSuffix(canon(hdr.Name), ".")
|
|
ttype := dns.TypeToString[hdr.Rrtype]
|
|
switch hdr.Class {
|
|
case dns.ClassANY:
|
|
if hdr.Rrtype == dns.TypeANY {
|
|
return fmt.Sprintf("delete all %s", name)
|
|
}
|
|
return fmt.Sprintf("delete %s %s", ttype, name)
|
|
case dns.ClassNONE:
|
|
return fmt.Sprintf("delete-rr %s %s", ttype, name)
|
|
default:
|
|
return fmt.Sprintf("add %s %s", ttype, name)
|
|
}
|
|
}
|
|
|
|
// inZone reports whether name is within zone: either name is the zone
// apex itself, or name ends in "."+zone. The comparison is a plain
// string match, so both arguments are expected to be in the same
// canonical, absolute form (trailing dot, consistent case).
//
// The root zone "." contains every name and is handled explicitly;
// the generic suffix test would look for "..", which never matches.
func inZone(name, zone string) bool {
	if zone == "." {
		return true
	}
	return name == zone || strings.HasSuffix(name, "."+zone)
}
|
|
|
|
// isApex reports whether name is exactly the zone's apex, i.e. the
// zone name itself rather than something beneath it. It is a plain
// string comparison, so both arguments must already be in the same
// canonical form.
func isApex(name, zone string) bool {
	return zone == name
}
|