Ryan Malloy 7367401734 Send DNS NOTIFY to secondaries after every UPDATE
Per RFC 1996, a master that mutates a zone SHOULD notify its
secondaries so they can immediately AXFR rather than wait for their
next SOA-refresh poll. Without this, propagation lag from UPDATE to
public DNS is bounded by the secondary's refresh interval (300s for
us) — which is borderline for ACME validation timing.

New Corefile directive:
    notify <host[:port]> [<host[:port]>...]

Targets accept bare hostnames (port 53 default), host:port, or
[ipv6]:port. The same list applies to every zone in the rfc2136
block.

Implementation: fire-and-forget UDP per target, each in its own
goroutine, capped by a 2s timeout. The UPDATE response to the client
is never held pending NOTIFY acks (RFC 1996 §4 explicitly decouples
them). Failures log at DEBUG only — a briefly-unreachable secondary
is normal and would otherwise spam logs.

Retires the external scripts/notify-secondaries.py workflow for any
deployment that wires the directive: secondaries now hear about
changes within seconds of the UPDATE landing, no cron or manual
invocation needed.

New tests:
- TestSendNotify_DeliversToTarget — packet arrives, opcode + zone correct
- TestSendNotify_NoTargets_NoCrash — empty list short-circuits
- TestSendNotify_BadTarget_LogsButDoesNotBlock — fire-and-forget timing
- TestNotifyOne_AppendsDefaultPort — host vs host:port normalization
2026-05-23 00:54:45 -06:00

344 lines
11 KiB
Go

package rfc2136
import (
"encoding/base64"
"fmt"
"os"
"path/filepath"
"strconv"
"time"
"github.com/coredns/caddy"
"github.com/coredns/coredns/core/dnsserver"
"github.com/coredns/coredns/plugin"
clog "github.com/coredns/coredns/plugin/pkg/log"
"github.com/miekg/dns"
)
// log is the package-level logger for this plugin. It is created with
// the plugin name "rfc2136" so that every message it emits is
// attributed to this plugin in the CoreDNS log output.
//
// NOTE(review): CoreDNS documents the NewWithPlugin prefix as
// `plugin/rfc2136: ` rather than `[rfc2136]` — confirm against the
// plugin/pkg/log version this module builds with.
var log = clog.NewWithPlugin("rfc2136")
// init registers the plugin with CoreDNS under the name "rfc2136" and
// replaces miekg/dns's package-level message filter so that DNS UPDATE
// requests can reach the plugin chain at all.
//
// SECURITY/OPERATIONAL NOTE — the filter swap is a process-global
// mutation.
//
// Why it is needed: miekg/dns's default MsgAcceptFunc answers every
// UPDATE-opcode message with NOTIMP at the wire layer, before any
// plugin sees it (acceptfunc.go: "Don't allow dynamic updates, because
// then the sections can contain a whole bunch of RRs"). CoreDNS 1.14.3
// constructs its dns.Server instances itself and exposes no per-server
// MsgAcceptFunc to plugins (see coredns/core/dnsserver/server.go:159 —
// the dns.Server struct is hardcoded), so accepting UPDATE anywhere
// means overriding the package-level default.
//
// What operators should understand:
//
//  1. The override is PROCESS-WIDE. Every CoreDNS server block in this
//     binary accepts UPDATE opcodes at the wire layer, not only the
//     block(s) where `rfc2136` is configured. Other blocks pass the
//     UPDATE through their own plugin chains; no plugin there handles
//     it, so the request falls off the end of the chain and CoreDNS
//     returns FormatError. No state is mutated, but the gatekeeping
//     has moved from the wire layer into the plugin chain.
//  2. The real security boundary is TSIG verification. It happens in
//     this plugin's ServeDNS (checkTSIG) AND again inside handleUpdate
//     (assertAuthenticated). This is defense in depth: every
//     state-mutating path re-verifies, so a future refactor that adds
//     a new caller cannot accidentally skip authentication.
//  3. Removing `rfc2136` from the Corefile and reloading via SIGUSR1
//     does NOT restore the original global. Restart the process to
//     revert fully.
//
// Mitigations in place: (a) this loud comment, (b) a startup INFO log
// listing the zones this plugin owns, (c) the TSIG re-check inside
// handleUpdate. The architecturally clean fix is upstream CoreDNS
// support for a per-Config MsgAcceptFunc; once that lands, delete the
// override below together with this note.
func init() {
	plugin.Register("rfc2136", setup)

	// Process-wide: see the function comment for scope and consequences.
	dns.DefaultMsgAcceptFunc = msgAcceptFunc
}
// msgAcceptFunc is the wire-level message filter installed in place of
// miekg/dns's default. It applies the same header checks as the default
// and additionally lets OpcodeUpdate through.
//
// Decision table, evaluated in order on the raw header dh:
//
//   - QR bit set (a response)              -> MsgIgnore
//   - opcode not QUERY, NOTIFY or UPDATE   -> MsgRejectNotImplemented
//   - Qdcount != 1                         -> MsgReject
//   - non-UPDATE with Ancount or Nscount
//     greater than 1                       -> MsgReject
//   - Arcount > 2                          -> MsgReject
//   - otherwise                            -> MsgAccept
//
// The Ancount/Nscount cap is skipped for UPDATE because, per RFC 2136,
// those header slots count the Prerequisite and Update sections, which
// may legitimately carry many RRs.
func msgAcceptFunc(dh dns.Header) dns.MsgAcceptAction {
	// 0x8000 is the QR flag: responses are dropped without a reply,
	// whatever their opcode (same as the default behaviour).
	if dh.Bits&0x8000 != 0 {
		return dns.MsgIgnore
	}

	// The opcode occupies the four bits just below QR.
	op := int(dh.Bits>>11) & 0xF
	isUpdate := op == dns.OpcodeUpdate
	if !isUpdate && op != dns.OpcodeQuery && op != dns.OpcodeNotify {
		return dns.MsgRejectNotImplemented
	}

	switch {
	case dh.Qdcount != 1:
		return dns.MsgReject
	case !isUpdate && (dh.Ancount > 1 || dh.Nscount > 1):
		// Queries and notifies keep the default "at most one RR" cap on
		// the answer and authority sections.
		return dns.MsgReject
	case dh.Arcount > 2:
		return dns.MsgReject
	}
	return dns.MsgAccept
}
// setup is the entry point CoreDNS calls for an `rfc2136` directive in
// the Corefile. In order, it:
//
//   - parses the directive block into an *RFC2136 handler,
//   - validates the zone file of every declared zone,
//   - copies the configured TSIG secrets (base64-encoded) into the
//     server config so miekg/dns verifies incoming signatures itself,
//   - links the handler into the plugin chain,
//   - logs the effective configuration and the process-wide
//     MsgAcceptFunc override.
//
// Any parse or validation failure is returned wrapped via plugin.Error.
func setup(c *caddy.Controller) error {
	handler, err := parse(c)
	if err != nil {
		return plugin.Error("rfc2136", err)
	}
	if err = handler.validateZoneFiles(); err != nil {
		return plugin.Error("rfc2136", err)
	}

	config := dnsserver.GetConfig(c)

	// Hand the TSIG keys to the underlying dns.Server. Signature
	// checking is then automatic and the UPDATE handler only has to
	// inspect dns.ResponseWriter.TsigStatus(). The secret map is created
	// lazily, and only when there is at least one key to store.
	if len(handler.TSIGKeys) > 0 && config.TsigSecret == nil {
		config.TsigSecret = make(map[string]string)
	}
	for keyName, k := range handler.TSIGKeys {
		config.TsigSecret[keyName] = base64.StdEncoding.EncodeToString(k.Secret)
	}

	config.AddPlugin(func(next plugin.Handler) plugin.Handler {
		handler.Next = next
		return handler
	})

	log.Infof("ready: zones=%v keys=%d ttl=%d dir=%q auto-commit=%t",
		handler.Zones, len(handler.TSIGKeys), handler.TTL, handler.ZonesDir, handler.AutoCommit)

	// Make the global MsgAcceptFunc override visible for operator audit:
	// sibling server blocks in this process will also see UPDATE opcodes
	// traverse their chains, and the operator should know that.
	log.Infof("dns.DefaultMsgAcceptFunc was overridden process-wide to permit OpcodeUpdate; state mutation requires TSIG verification on this plugin's zones=%v only", handler.Zones)
	return nil
}
// parse reads the `rfc2136 <zone> [<zone>...] { ... }` block(s) from the
// controller and returns a configured *RFC2136, or a parse error that
// carries Corefile position information.
//
// Grammar:
//
//	rfc2136 <zone> [<zone>...] {
//	    zones-dir   <path>                              ; required
//	    tsig-key    <name> <algorithm> <base64-secret>  ; may repeat
//	    ttl         <seconds>                           ; default DefaultTTL
//	    auto-commit <true|yes|on|false|no|off>          ; default true
//	    git-author  <name> <email>                      ; optional
//	    notify      <host[:port]> [<host[:port]>...]    ; may repeat, targets accumulate
//	    rate-limit  <burst> <period-seconds>            ; default defaultRateBurst/defaultRatePeriod
//	    rate-limit  off                                 ; also "false" / "no": disable limiting
//	}
//
// Errors are returned when:
//   - no zone argument is given, or a zone argument cannot be
//     normalized into a zone name,
//   - a directive has the wrong number of arguments or an invalid value,
//   - a tsig-key name is declared twice,
//   - an unknown directive appears,
//   - zones-dir is missing.
//
// On success every normalized zone has a zoneFile handle in p.zones,
// located at <zones-dir>/<zone-without-trailing-dot>.zone.
func parse(c *caddy.Controller) (*RFC2136, error) {
	p := &RFC2136{
		TSIGKeys:   make(map[string]tsigKey),
		TTL:        DefaultTTL,
		AutoCommit: true,
	}

	// Git author override from the optional `git-author` directive. It
	// is block-wide, not per-zone: the same name/email is copied onto
	// every zone's zoneFile at the end of parsing. An empty value leaves
	// the zoneFile's own setting untouched.
	var gitAuthorName, gitAuthorEmail string

	// Rate-limit config (Hamilton M8). Defaults are
	// defaultRateBurst/defaultRatePeriod from ratelimit.go; an explicit
	// `rate-limit <burst> <period-seconds>` directive overrides them and
	// `rate-limit off` disables the limiter entirely.
	rateBurst := defaultRateBurst
	ratePeriod := defaultRatePeriod
	rateLimitEnabled := true

	for c.Next() {
		args := c.RemainingArgs()
		if len(args) < 1 {
			return nil, c.ArgErr()
		}
		for _, z := range args {
			// NormalizeExact yields no names when the argument cannot
			// be parsed. Fail loudly instead of silently dropping the
			// zone: otherwise the plugin would start without a zone the
			// operator believes is configured.
			normalized := plugin.Host(z).NormalizeExact()
			if len(normalized) == 0 {
				return nil, c.Errf("invalid zone name %q", z)
			}
			p.Zones = append(p.Zones, normalized...)
		}

		for c.NextBlock() {
			switch c.Val() {
			case "zones-dir":
				dArgs := c.RemainingArgs()
				if len(dArgs) != 1 {
					return nil, c.ArgErr()
				}
				p.ZonesDir = dArgs[0]

			case "tsig-key":
				kArgs := c.RemainingArgs()
				if len(kArgs) != 3 {
					return nil, c.Errf("tsig-key requires 3 args (name algorithm secret), got %d", len(kArgs))
				}
				keyName := canonicalKeyName(kArgs[0])
				algo, err := parseTSIGAlgorithm(kArgs[1])
				if err != nil {
					return nil, c.Err(err.Error())
				}
				secret, err := decodeTSIGSecret(kArgs[2])
				if err != nil {
					return nil, c.Errf("tsig-key %q: %v", keyName, err)
				}
				// Duplicates are compared on the canonical name.
				if _, exists := p.TSIGKeys[keyName]; exists {
					return nil, c.Errf("duplicate tsig-key %q", keyName)
				}
				p.TSIGKeys[keyName] = tsigKey{Algorithm: algo, Secret: secret}

			case "ttl":
				tArgs := c.RemainingArgs()
				if len(tArgs) != 1 {
					return nil, c.ArgErr()
				}
				ttl, err := strconv.ParseUint(tArgs[0], 10, 32)
				if err != nil {
					return nil, c.Errf("ttl must be a non-negative integer: %v", err)
				}
				p.TTL = uint32(ttl)

			case "auto-commit":
				aArgs := c.RemainingArgs()
				if len(aArgs) != 1 {
					return nil, c.ArgErr()
				}
				switch aArgs[0] {
				case "true", "yes", "on":
					p.AutoCommit = true
				case "false", "no", "off":
					p.AutoCommit = false
				default:
					return nil, c.Errf("auto-commit must be true|false, got %q", aArgs[0])
				}

			case "git-author":
				gArgs := c.RemainingArgs()
				if len(gArgs) != 2 {
					return nil, c.Errf("git-author requires 2 args (name email), got %d", len(gArgs))
				}
				gitAuthorName = gArgs[0]
				gitAuthorEmail = gArgs[1]

			case "notify":
				nArgs := c.RemainingArgs()
				if len(nArgs) < 1 {
					return nil, c.Errf("notify requires at least one secondary (host or host:port)")
				}
				// Repeated notify lines accumulate into one target list.
				p.NotifyTargets = append(p.NotifyTargets, nArgs...)

			case "rate-limit":
				rArgs := c.RemainingArgs()
				switch len(rArgs) {
				case 1:
					// The single-argument form only exists to switch the
					// limiter off.
					switch rArgs[0] {
					case "off", "false", "no":
						rateLimitEnabled = false
					default:
						return nil, c.Errf("rate-limit single-arg form must be 'off'; for limits use 'rate-limit <burst> <period-seconds>'")
					}
				case 2:
					// bitSize 31 keeps both values within a non-negative
					// int32, so the conversions below cannot overflow.
					b, err := strconv.ParseUint(rArgs[0], 10, 31)
					if err != nil || b < 1 {
						return nil, c.Errf("rate-limit burst must be positive integer, got %q", rArgs[0])
					}
					pSec, err := strconv.ParseUint(rArgs[1], 10, 31)
					if err != nil || pSec < 1 {
						return nil, c.Errf("rate-limit period must be positive integer seconds, got %q", rArgs[1])
					}
					rateBurst = int(b)
					ratePeriod = time.Duration(pSec) * time.Second
				default:
					return nil, c.Errf("rate-limit takes 'off' OR '<burst> <period-seconds>', got %d args", len(rArgs))
				}

			default:
				return nil, c.Errf("unknown directive: %s", c.Val())
			}
		}
	}

	if len(p.Zones) == 0 {
		return nil, c.Err("at least one zone must be specified")
	}
	if p.ZonesDir == "" {
		return nil, c.Err("zones-dir is required")
	}

	// Construct rate limiter if enabled; otherwise p.rateLimit keeps its
	// zero value.
	if rateLimitEnabled {
		p.rateLimit = newRateLimiter(rateBurst, ratePeriod)
	}

	// Build zoneFile handles for each declared zone.
	p.zones = make(map[string]*zoneFile, len(p.Zones))
	for _, z := range p.Zones {
		// Trailing dot → filename. supported.systems. → supported.systems.zone
		stem := z
		if l := len(stem); l > 0 && stem[l-1] == '.' {
			stem = stem[:l-1]
		}
		path := filepath.Join(p.ZonesDir, stem+".zone")
		zf := openZoneFile(path, z)
		zf.AutoCommit = p.AutoCommit
		if gitAuthorName != "" {
			zf.GitAuthorName = gitAuthorName
		}
		if gitAuthorEmail != "" {
			zf.GitAuthorEmail = gitAuthorEmail
		}
		p.zones[z] = zf
	}
	return p, nil
}
// validateZoneFiles checks, for every zone in p.zones, that the zone
// file exists at its expected path, is not a directory, and loads
// successfully through zf.loadRRs. It returns the first failure it
// meets, wrapped with the zone name and path, or nil when every zone
// passes.
//
// Running this at CoreDNS startup surfaces both typos (missing file)
// and corrupt zone content immediately, instead of minutes later when
// the first UPDATE — typically an ACME challenge — arrives.
//
// Hamilton M4: an earlier version only stat()'d the file, so a zone
// with a syntax error passed startup and the first UPDATE then failed
// with SERVFAIL. Loading the records here closes that gap.
// NOTE(review): the original note says this shares the UPDATE
// handler's loadRRs + assertSingleApexSOA path; only loadRRs is called
// directly here, so the SOA check presumably lives inside it — confirm.
func (p *RFC2136) validateZoneFiles() error {
	for name, file := range p.zones {
		info, statErr := os.Stat(file.Path)
		switch {
		case statErr != nil:
			return fmt.Errorf("zone %q: file not accessible at %s: %w", name, file.Path, statErr)
		case info.IsDir():
			return fmt.Errorf("zone %q: %s is a directory, expected a regular file", name, file.Path)
		}

		// Only the error matters here; the loaded records are discarded.
		if _, _, loadErr := file.loadRRs(); loadErr != nil {
			return fmt.Errorf("zone %q at %s: %w", name, file.Path, loadErr)
		}
	}
	return nil
}