diff --git a/setup.go b/setup.go index 4ec968f..e048d57 100644 --- a/setup.go +++ b/setup.go @@ -272,8 +272,16 @@ func parse(c *caddy.Controller) (*RFC2136, error) { } // validateZoneFiles ensures every configured zone has an accessible -// file on disk at the expected path. Catches typos at CoreDNS startup -// rather than the first UPDATE. +// AND parseable file on disk at the expected path. Catches both typos +// (file missing) and corrupt zone content at CoreDNS startup rather +// than on the first UPDATE — the operator gets an immediate signal +// instead of discovering the breakage minutes later when ACME fires. +// +// Hamilton M4: the previous version only stat()'d the file. A zone +// with a syntax error sailed through startup, then the first UPDATE +// returned SERVFAIL with no startup-time signal. We now run the same +// loadRRs + assertSingleApexSOA path the UPDATE handler uses, so any +// parse-time or SOA-invariant failure surfaces at startup. func (p *RFC2136) validateZoneFiles() error { for zone, zf := range p.zones { st, err := os.Stat(zf.Path) @@ -283,6 +291,9 @@ func (p *RFC2136) validateZoneFiles() error { if st.IsDir() { return fmt.Errorf("zone %q: %s is a directory, expected a regular file", zone, zf.Path) } + if _, _, err := zf.loadRRs(); err != nil { + return fmt.Errorf("zone %q at %s: %w", zone, zf.Path, err) + } } return nil } diff --git a/tsig.go b/tsig.go index e028e44..59c09e5 100644 --- a/tsig.go +++ b/tsig.go @@ -86,6 +86,16 @@ func (p *RFC2136) checkTSIG(w dns.ResponseWriter, r *dns.Msg) error { // A nil from TsigStatus means verification succeeded; any non-nil // error means the signature was invalid, the time was outside the // fudge window, or some other auth failure. + // + // H6 — Replay window: the TSIG `fudge` field (RFC 8945 §5.2) is the + // allowed clock skew between client and server. miekg/dns enforces + // this in TsigVerify (rejecting with ErrTime if the request's time + // is too far from local time). We rely on miekg/dns's default + // tolerance (currently 300s); we do not configure it ourselves. + // If miekg/dns's default ever changes, our replay-window behavior + // changes with it — operators should monitor upstream release notes. + // A future enhancement is to make the fudge configurable via the + // Corefile (e.g., `tsig-fudge 300`). if status := w.TsigStatus(); status != nil { return fmt.Errorf("TSIG verification failed for key %q: %w", keyName, status) } diff --git a/update.go b/update.go index 0409d9c..139e3b3 100644 --- a/update.go +++ b/update.go @@ -115,8 +115,28 @@ func (p *RFC2136) handleUpdate(w dns.ResponseWriter, r *dns.Msg, verified bool) if !changed { // UPDATE was a valid no-op (e.g. only contained adds for RRs - // that were already present, deduped away). Return NOERROR - // without rewriting the file. + // that were already present, deduped away per RFC 2136 + // §3.4.2.2). Return NOERROR without rewriting the file or + // bumping the SOA serial. + // + // H7 — Policy decision documented: + // + // We DO NOT bump the SOA serial on no-op UPDATEs. Rationale: + // - DNS-wise, nothing changed. Forcing downstream secondaries + // (HE) to do an AXFR pull just to re-fetch identical content + // wastes bandwidth and is not what RFC 2136 implies. + // - The wire-visible cert-issuance chain for ACME does not + // depend on the second-UPDATE's serial bump — once the first + // UPDATE landed, the SOA already advanced and the auto plugin + // reloaded; subsequent identical UPDATEs are spurious and + // should be silent. + // - Caddy's caddy-dns/rfc2136 client treats NOERROR-no-bump as + // "yes I have your record" — which is the truthful answer. + // + // If a caller wants to force a serial bump for some reason, they + // can send a touch-UPDATE that adds-then-deletes a throwaway + // record. That's an explicit, intentional pattern and is + // supported. return p.updateResp(w, resp, dns.RcodeSuccess) } @@ -287,12 +307,43 @@ func applyUpdate(zone string, defaultTTL uint32, rrs []dns.RR, rr dns.RR) ([]dns } // summarizeUpdate produces a one-line commit message describing the -// UPDATE for git history. +// UPDATE for git history. The output is sanitized — Hamilton M7 — to +// prevent attacker-controlled RR names (TSIG just authenticates the +// sender; the payload is still attacker-controlled) from injecting +// control characters into git log, log aggregators, or any downstream +// renderer that interprets ANSI/newlines. func summarizeUpdate(zone string, updates []dns.RR) string { + var msg string if len(updates) == 1 { - return fmt.Sprintf("rfc2136 %s: %s", zone, oneLineOp(updates[0])) + msg = fmt.Sprintf("rfc2136 %s: %s", zone, oneLineOp(updates[0])) + } else { + msg = fmt.Sprintf("rfc2136 %s: %d operations", zone, len(updates)) } - return fmt.Sprintf("rfc2136 %s: %d operations", zone, len(updates)) + return sanitizeForCommitMessage(msg) +} + +// sanitizeForCommitMessage strips control characters from s, replacing +// them with their printable escape form. This keeps git log + downstream +// renderers safe from attacker-injected newlines, escape sequences, etc. +func sanitizeForCommitMessage(s string) string { + var b strings.Builder + b.Grow(len(s)) + for _, r := range s { + switch { + case r == '\n': + b.WriteString("\\n") + case r == '\r': + b.WriteString("\\r") + case r == '\t': + b.WriteString("\\t") + case r < 0x20 || r == 0x7f: + // Other C0 controls + DEL: emit \xNN. + fmt.Fprintf(&b, "\\x%02x", r) + default: + b.WriteRune(r) + } + } + return b.String() } // oneLineOp returns a short human-readable description of a single diff --git a/update_test.go b/update_test.go index 4f56c1a..c633359 100644 --- a/update_test.go +++ b/update_test.go @@ -12,14 +12,19 @@ import ( // captureWriter implements dns.ResponseWriter and stashes the message // passed to WriteMsg so tests can inspect it after handleUpdate returns. +// +// tsigErr, if non-nil, is what TsigStatus() returns — letting tests +// simulate TSIG-verification failure (bad MAC, fudge-window violation, +// unknown key etc.). type captureWriter struct { - msg *dns.Msg + msg *dns.Msg + tsigErr error } func (cw *captureWriter) WriteMsg(m *dns.Msg) error { cw.msg = m; return nil } func (cw *captureWriter) Write([]byte) (int, error) { return 0, nil } func (cw *captureWriter) Close() error { return nil } -func (cw *captureWriter) TsigStatus() error { return nil } +func (cw *captureWriter) TsigStatus() error { return cw.tsigErr } func (cw *captureWriter) TsigTimersOnly(bool) {} func (cw *captureWriter) Hijack() {} func (cw *captureWriter) LocalAddr() net.Addr { return &net.IPAddr{IP: net.IPv4(127, 0, 0, 1)} } @@ -155,6 +160,32 @@ func TestUpdate_DeleteRRset_RemovesAllOfType(t *testing.T) { } } +// TestCheckTSIG_BadStatus_Refused covers H6 — the path where miekg/dns +// has reported a TSIG verification failure (bad MAC, fudge-window +// violation, expired timestamp, etc.) via dns.ResponseWriter.TsigStatus. +// checkTSIG must surface this as an error so ServeDNS refuses the +// UPDATE. This is the test that would catch a regression in our +// reliance on miekg/dns's default fudge window. +func TestCheckTSIG_BadStatus_Refused(t *testing.T) { + p := newTestPluginWithZone(t, "auth.example.com") + // Configure a key so checkTSIG doesn't short-circuit on "no keys". + p.TSIGKeys = map[string]tsigKey{ + "acme-update-key.": {Algorithm: dns.HmacSHA256, Secret: []byte("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")}, + } + upd := newUpdate("auth.example.com", + mustRR(t, `foo.auth.example.com. 60 IN TXT "x"`), + ) + upd.SetTsig("acme-update-key.", dns.HmacSHA256, 300, 0) + + w := &captureWriter{ + // Simulate miekg/dns reporting "TSIG outside fudge window." + tsigErr: dns.ErrTime, + } + if err := p.checkTSIG(w, upd); err == nil { + t.Errorf("checkTSIG accepted request despite TsigStatus reporting %v", dns.ErrTime) + } +} + // TestUpdate_UnverifiedCaller_Refused proves the C2 defense-in-depth // contract: handleUpdate refuses any call that doesn't assert TSIG // verification, even if the rest of the message is well-formed and the @@ -178,6 +209,54 @@ func TestUpdate_UnverifiedCaller_Refused(t *testing.T) { } } +// TestUpdate_DeleteRR_IgnoresTTL covers M3: an UPDATE's delete-RR +// specifies a different TTL than the stored RR, but per RFC 2136 +// §3.4.2.4 the deletion must match by owner/class/type/rdata only. +// The old String()-based comparison would silently fail to match. +func TestUpdate_DeleteRR_IgnoresTTL(t *testing.T) { + p := newTestPluginWithZone(t, "auth.example.com") + // Add a TXT with TTL 60. + runUpdate(t, p, newUpdate("auth.example.com", + mustRR(t, `foo.auth.example.com. 60 IN TXT "match-me"`), + )) + // Issue a CLASS=NONE delete with a different TTL (per RFC 2136 + // §2.5.4, the TTL on a CLASS=NONE delete is supposed to be 0; some + // clients get this wrong, and even when they get it right, an + // implementation must ignore the value). + delRR := mustRR(t, `foo.auth.example.com. 0 IN TXT "match-me"`) + delRR.Header().Class = dns.ClassNONE + if rcode := runUpdate(t, p, newUpdate("auth.example.com", delRR)); rcode != dns.RcodeSuccess { + t.Fatalf("rcode = %d, want NOERROR", rcode) + } + for _, rr := range readZoneRecords(t, p, "auth.example.com") { + if txt, ok := rr.(*dns.TXT); ok && txt.Hdr.Name == "foo.auth.example.com." && txt.Txt[0] == "match-me" { + t.Errorf("TTL-differing delete did not remove the RR: %s", rr.String()) + } + } +} + +// TestSanitizeForCommitMessage covers M7: attacker-controlled RR names +// (TSIG only authenticates the sender; the payload is hostile by +// default) must not inject control characters into commit messages or +// downstream log renderers. +func TestSanitizeForCommitMessage(t *testing.T) { + cases := []struct { + in, want string + }{ + {"plain", "plain"}, + {"with\nnewline", "with\\nnewline"}, + {"tab\there", "tab\\there"}, + {"esc\x1b[31mred\x1b[0m", "esc\\x1b[31mred\\x1b[0m"}, + {"del\x7fchar", "del\\x7fchar"}, + } + for _, tc := range cases { + got := sanitizeForCommitMessage(tc.in) + if got != tc.want { + t.Errorf("sanitize(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} + func TestUpdate_OutOfZone_Refused(t *testing.T) { p := newTestPluginWithZone(t, "auth.example.com") diff --git a/zonefile.go b/zonefile.go index 7c67e05..e18a148 100644 --- a/zonefile.go +++ b/zonefile.go @@ -251,13 +251,19 @@ func removeNameFrom(rrs []dns.RR, name string) []dns.RR { } // removeRRFrom returns rrs minus the single RR matching the given one -// by owner + type + rdata. String() comparison covers rdata exactness. +// by owner + type + rdata. +// +// Hamilton M3: per RFC 2136 §2.5.4, a delete-by-exact-match UPDATE +// carries CLASS=NONE and TTL=0 as protocol flags, not as match +// criteria. The target must match a stored RR by owner+type+rdata +// alone. We normalize both sides to the same class + TTL before +// invoking dns.IsDuplicate so the comparison is correct. func removeRRFrom(rrs []dns.RR, target dns.RR) []dns.RR { - targetStr := target.String() + targetN := normalizeForCompare(target) out := rrs[:0:0] matched := false for _, rr := range rrs { - if !matched && rr.String() == targetStr { + if !matched && dns.IsDuplicate(normalizeForCompare(rr), targetN) { matched = true continue } @@ -267,17 +273,30 @@ func removeRRFrom(rrs []dns.RR, target dns.RR) []dns.RR { } // addRRTo appends rr to rrs unless an identical RR already exists -// (de-dupe semantics per RFC 2136 §3.4.2.2). +// (de-dupe semantics per RFC 2136 §3.4.2.2). Same normalization as +// removeRRFrom — dedupe is TTL- and class-insensitive in the comparison +// (though the stored RR retains its original TTL/class). func addRRTo(rrs []dns.RR, rr dns.RR) []dns.RR { - target := rr.String() + rrN := normalizeForCompare(rr) for _, existing := range rrs { - if existing.String() == target { + if dns.IsDuplicate(normalizeForCompare(existing), rrN) { return rrs } } return append(rrs, rr) } +// normalizeForCompare returns a copy of rr with TTL=0 and class=IN so +// dns.IsDuplicate can be used to compare by (owner, type, rdata) alone. +// Required by RFC 2136 §2.5.4's "TTL and CLASS are flags, not match +// criteria" semantics. +func normalizeForCompare(rr dns.RR) dns.RR { + n := dns.Copy(rr) + n.Header().Ttl = 0 + n.Header().Class = dns.ClassINET + return n +} + // serialCounterMul is the multiplier between the date prefix and the // counter in our SOA-serial encoding. The format is YYMMDD*10000 + NNNN, // giving 10000 bumps/day (NNNN ∈ [0001, 9999]). The 2-digit year keeps