Hamilton M8: a compromised TSIG key — or a misconfigured client retrying forever — must not be able to drive unbounded UPDATE traffic. Each UPDATE costs disk IOPS, a git commit, and a slot in the SOA serial counter (now 9999/day per zone). Without a cap, a few hours of runaway traffic could exhaust the SOA serial counter and brick the zone for the day.

Implementation: per-key token bucket in ratelimit.go. Default 100 tokens / 60 seconds. New keys start full so legitimate clients see no delay at boot. Refill is continuous, capped at the burst value.

Configurable in Corefile:

    rate-limit off                      # disable entirely
    rate-limit <burst> <period-secs>    # e.g., rate-limit 200 60

Enforcement runs in ServeDNS after TSIG verification — a request that fails auth doesn't consume a token (and a forged TSIG can't be used to deny service to a real key holder, since such a request never reaches the rate check).

100/min is well above ACME's needs: a worst-case full-renewal storm across our ~84 zones emits maybe 200 UPDATEs total over several minutes. Anything beyond is suspicious by definition.

New tests cover: first-call allowed, burst exhaustion, refill behavior, per-key isolation, refill-cap (no idle-accumulation overflow).
115 lines
4.3 KiB
Go
115 lines
4.3 KiB
Go
// Package rfc2136 is a CoreDNS plugin that accepts dynamic DNS updates
// per RFC 2136 (UPDATE opcode), authenticated via TSIG, and applies
// them to on-disk zone files. This is the right shape for stacks where
// the operator wants to keep zones in flat files (perhaps under git,
// with HE pulling AXFR), but also wants programmatic updates from
// clients like Caddy's caddy-dns/rfc2136 module.
//
// The plugin does NOT serve any queries — that's the job of the
// `auto`/`file` plugin running alongside it. This plugin's only
// responsibility is the UPDATE opcode path: verify TSIG, dissect the
// UPDATE, write the zone file, bump the SOA serial, optionally
// auto-commit to git. CoreDNS's auto plugin notices the mtime change
// and re-serves the zone within its reload interval.
//
// See the plan at
// ~/.claude/plans/dood-does-coredns-offer-enumerated-piglet.md
// for the architectural rationale.
package rfc2136
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/coredns/coredns/plugin"
|
|
"github.com/miekg/dns"
|
|
)
|
|
|
|
// DefaultTTL is the TTL, in seconds, given to dynamically added records
// when the UPDATE message carries TTL=0. The 60s value suits short-lived
// ACME challenge records and stops stale answers from lingering in
// resolver caches.
const DefaultTTL = uint32(60)
|
|
|
|
// RFC2136 is the plugin handler. One instance per Corefile server block.
|
|
type RFC2136 struct {
|
|
// Next is the downstream plugin in the chain — queries always
|
|
// pass through; only UPDATE opcode is intercepted.
|
|
Next plugin.Handler
|
|
|
|
// Zones is the set of canonical (dot-terminated, lowercase) zone
|
|
// names this instance accepts UPDATEs for. UPDATEs for any other
|
|
// zone are rejected with NOTAUTH.
|
|
Zones []string
|
|
|
|
// TSIGKeys is keyed by canonical key name (lowercased, trailing
|
|
// dot). Empty means TSIG is disabled — UPDATEs are refused
|
|
// unconditionally as a safety default.
|
|
TSIGKeys map[string]tsigKey
|
|
|
|
// TTL is applied to dynamically-injected records that don't carry
|
|
// an explicit TTL in the UPDATE message.
|
|
TTL uint32
|
|
|
|
// ZonesDir is the directory where <zone>.zone files live (matching
|
|
// the mount path inside the CoreDNS container). The plugin reads
|
|
// and writes files at <ZonesDir>/<zone>.zone.
|
|
ZonesDir string
|
|
|
|
// AutoCommit governs whether the plugin auto-commits zone-file
|
|
// changes to git after every successful UPDATE.
|
|
AutoCommit bool
|
|
|
|
// zones holds per-zone file handlers, keyed by canonical zone name.
|
|
// Populated in setup; mutexes live inside each zoneFile.
|
|
zones map[string]*zoneFile
|
|
|
|
// rateLimit caps UPDATE traffic per TSIG key (Hamilton M8). nil
|
|
// disables rate limiting (test mode, or insecure deployments).
|
|
// Populated in setup() once TSIG keys are known.
|
|
rateLimit *rateLimiter
|
|
}
|
|
|
|
// Name implements plugin.Handler.
|
|
func (p *RFC2136) Name() string { return "rfc2136" }
|
|
|
|
// ServeDNS implements plugin.Handler.
|
|
//
|
|
// Dispatch:
|
|
//
|
|
// UPDATE opcode → verify TSIG, then apply via the UPDATE handler.
|
|
// Anything else → pass through to Next (the auto plugin handles
|
|
// queries against the zone files we maintain).
|
|
func (p *RFC2136) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
|
|
if r.Opcode == dns.OpcodeUpdate {
|
|
if err := p.checkTSIG(w, r); err != nil {
|
|
log.Warningf("UPDATE rejected: %v", err)
|
|
resp := new(dns.Msg)
|
|
resp.SetRcode(r, dns.RcodeRefused)
|
|
// Do NOT sign rejection responses. Signing a refusal with
|
|
// the named key would attest the key exists on this server
|
|
// (Hamilton M9). Unsigned Refused is the right shape — the
|
|
// client sees "no TSIG on reply" and treats that as auth
|
|
// failure, which is correct because auth DID fail.
|
|
_ = w.WriteMsg(resp)
|
|
return dns.RcodeRefused, nil
|
|
}
|
|
// Hamilton M8: per-key rate limit. TSIG just authenticates the
|
|
// sender — it doesn't prove the sender's behavior is sane. A
|
|
// compromised key or a runaway client must not be able to
|
|
// exhaust disk/git/serial-counter resources.
|
|
if p.rateLimit != nil {
|
|
if tsig := r.IsTsig(); tsig != nil && !p.rateLimit.allow(strings.ToLower(tsig.Hdr.Name), time.Now()) {
|
|
log.Warningf("UPDATE rate-limited for key %q", tsig.Hdr.Name)
|
|
resp := new(dns.Msg)
|
|
resp.SetRcode(r, dns.RcodeRefused)
|
|
_ = w.WriteMsg(resp)
|
|
return dns.RcodeRefused, nil
|
|
}
|
|
}
|
|
return p.handleUpdate(w, r, true)
|
|
}
|
|
return plugin.NextOrFailure(p.Name(), p.Next, ctx, w, r)
|
|
}
|