Per RFC 1996, a master that mutates a zone SHOULD notify its
secondaries so they can start a zone transfer (AXFR) immediately
rather than wait for their next SOA-refresh poll. Without NOTIFY, the
lag between an UPDATE and its appearance in public DNS can be as long
as the secondary's refresh interval (300s for us), which is borderline
for ACME validation timing.
New Corefile directive:
notify <host[:port]> [<host[:port]>...]
Each target may be a bare hostname (port 53 is assumed), host:port, or
[ipv6]:port. The directive may be repeated; targets accumulate. The
same target list is used for every zone declared in the rfc2136 block.
Implementation: fire-and-forget UDP per target, each in its own
goroutine, capped by a 2s timeout. The UPDATE response to the client
is never held pending NOTIFY acks (RFC 1996 §4 explicitly decouples
them). Failures log at DEBUG only — a briefly-unreachable secondary
is normal and would otherwise spam logs.
Retires the external scripts/notify-secondaries.py workflow for any
deployment that wires the directive: secondaries now hear about
changes within seconds of the UPDATE landing, no cron or manual
invocation needed.
New tests:
- TestSendNotify_DeliversToTarget — packet arrives, opcode + zone correct
- TestSendNotify_NoTargets_NoCrash — empty list short-circuits
- TestSendNotify_BadTarget_LogsButDoesNotBlock — fire-and-forget timing
- TestNotifyOne_AppendsDefaultPort — host vs host:port normalization
344 lines
11 KiB
Go
344 lines
11 KiB
Go
package rfc2136
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/coredns/caddy"
|
|
"github.com/coredns/coredns/core/dnsserver"
|
|
"github.com/coredns/coredns/plugin"
|
|
clog "github.com/coredns/coredns/plugin/pkg/log"
|
|
"github.com/miekg/dns"
|
|
)
|
|
|
|
// log is the package logger, scoped so messages are prefixed `[rfc2136]`.
|
|
var log = clog.NewWithPlugin("rfc2136")
|
|
|
|
func init() {
|
|
plugin.Register("rfc2136", setup)
|
|
|
|
// SECURITY/OPERATIONAL NOTE — process-global mutation:
|
|
//
|
|
// miekg/dns's default MsgAcceptFunc rejects UPDATE opcode messages
|
|
// with NOTIMP at the wire layer, before any plugin sees them
|
|
// (acceptfunc.go: "Don't allow dynamic updates, because then the
|
|
// sections can contain a whole bunch of RRs"). CoreDNS 1.14.3
|
|
// constructs its dns.Server instances without exposing a
|
|
// per-server MsgAcceptFunc to plugins (see
|
|
// coredns/core/dnsserver/server.go:159 — the dns.Server struct is
|
|
// hardcoded), so to accept UPDATE opcodes anywhere we must
|
|
// override the package-level default. We do that here.
|
|
//
|
|
// Consequences operators should understand:
|
|
//
|
|
// 1. The override is PROCESS-WIDE. Every CoreDNS server block in
|
|
// this binary will accept UPDATE opcodes at the wire layer,
|
|
// not just the one(s) where `rfc2136` is configured. Other
|
|
// blocks will pass the UPDATE through their plugin chains;
|
|
// since no plugin in those chains handles UPDATE, the request
|
|
// falls off the end of the chain and CoreDNS returns
|
|
// FormatError. No state is mutated — but the wire-layer
|
|
// gatekeeping moved into the plugin chain.
|
|
//
|
|
// 2. The actual security boundary is TSIG verification, which
|
|
// happens in this plugin's ServeDNS (checkTSIG) AND inside
|
|
// handleUpdate (assertAuthenticated). Defense in depth: any
|
|
// state-mutating path re-verifies, so a future refactor that
|
|
// adds a new caller cannot accidentally skip auth.
|
|
//
|
|
// 3. If you remove `rfc2136` from your Corefile and reload via
|
|
// SIGUSR1, this global is NOT restored. Restart the process
|
|
// to fully revert.
|
|
//
|
|
// The mitigation matrix is: (a) loud comment here, (b) startup
|
|
// INFO log listing the zones this plugin owns, (c) TSIG re-check
|
|
// inside handleUpdate. The architecturally clean fix would be
|
|
// upstream support in CoreDNS for per-Config MsgAcceptFunc — when
|
|
// that lands, delete this whole stanza.
|
|
dns.DefaultMsgAcceptFunc = msgAcceptFunc
|
|
}
|
|
|
|
// msgAcceptFunc mirrors miekg/dns's defaultMsgAcceptFunc but additionally
|
|
// allows OpcodeUpdate. For UPDATE messages, the conservative Ancount/
|
|
// Nscount limits in the default function don't apply -- per RFC 2136
|
|
// those sections (Prerequisite / Update) can carry many RRs.
|
|
func msgAcceptFunc(dh dns.Header) dns.MsgAcceptAction {
|
|
// Responses are silently ignored regardless of opcode (default behaviour).
|
|
if isResponse := dh.Bits&0x8000 != 0; isResponse {
|
|
return dns.MsgIgnore
|
|
}
|
|
|
|
opcode := int(dh.Bits>>11) & 0xF
|
|
switch opcode {
|
|
case dns.OpcodeQuery, dns.OpcodeNotify, dns.OpcodeUpdate:
|
|
// allowed
|
|
default:
|
|
return dns.MsgRejectNotImplemented
|
|
}
|
|
|
|
if dh.Qdcount != 1 {
|
|
return dns.MsgReject
|
|
}
|
|
|
|
// UPDATE messages legitimately carry multiple RRs in the
|
|
// Prerequisite (Ancount) and Update (Nscount) sections -- skip the
|
|
// "exactly 1" check that the default function applies for queries.
|
|
if opcode != dns.OpcodeUpdate {
|
|
if dh.Ancount > 1 {
|
|
return dns.MsgReject
|
|
}
|
|
if dh.Nscount > 1 {
|
|
return dns.MsgReject
|
|
}
|
|
}
|
|
|
|
if dh.Arcount > 2 {
|
|
return dns.MsgReject
|
|
}
|
|
return dns.MsgAccept
|
|
}
|
|
|
|
// setup is invoked by the CoreDNS plugin registry once per Corefile
|
|
// `rfc2136` directive. It parses the directive, validates that each
|
|
// declared zone has a corresponding file in zones-dir, registers
|
|
// TSIG keys with the underlying dns.Server, and links the handler
|
|
// into the plugin chain.
|
|
func setup(c *caddy.Controller) error {
|
|
p, err := parse(c)
|
|
if err != nil {
|
|
return plugin.Error("rfc2136", err)
|
|
}
|
|
if err := p.validateZoneFiles(); err != nil {
|
|
return plugin.Error("rfc2136", err)
|
|
}
|
|
|
|
cfg := dnsserver.GetConfig(c)
|
|
|
|
// Register TSIG keys with the underlying dns.Server so miekg/dns
|
|
// auto-verifies incoming signatures. We then just inspect the
|
|
// result via dns.ResponseWriter.TsigStatus() in our UPDATE handler.
|
|
if len(p.TSIGKeys) > 0 {
|
|
if cfg.TsigSecret == nil {
|
|
cfg.TsigSecret = make(map[string]string)
|
|
}
|
|
for name, key := range p.TSIGKeys {
|
|
cfg.TsigSecret[name] = base64.StdEncoding.EncodeToString(key.Secret)
|
|
}
|
|
}
|
|
|
|
cfg.AddPlugin(func(next plugin.Handler) plugin.Handler {
|
|
p.Next = next
|
|
return p
|
|
})
|
|
|
|
log.Infof("ready: zones=%v keys=%d ttl=%d dir=%q auto-commit=%t",
|
|
p.Zones, len(p.TSIGKeys), p.TTL, p.ZonesDir, p.AutoCommit)
|
|
// Surface the global MsgAcceptFunc override for operator audit —
|
|
// if a sibling server block on this process doesn't expect UPDATE
|
|
// opcodes to traverse it, the operator should know they will.
|
|
log.Infof("dns.DefaultMsgAcceptFunc was overridden process-wide to permit OpcodeUpdate; state mutation requires TSIG verification on this plugin's zones=%v only", p.Zones)
|
|
return nil
|
|
}
|
|
|
|
// parse reads a single `rfc2136 <zone> [<zone>...] { ... }` block.
|
|
//
|
|
// Grammar:
|
|
//
|
|
// rfc2136 <zone> [<zone>...] {
|
|
// zones-dir <path> ; required
|
|
// tsig-key <name> <algorithm> <base64-secret> ; may repeat
|
|
// ttl <seconds> ; default 60
|
|
// auto-commit <true|false> ; default true
|
|
// git-author <name> <email> ; optional
|
|
// }
|
|
func parse(c *caddy.Controller) (*RFC2136, error) {
|
|
p := &RFC2136{
|
|
TSIGKeys: make(map[string]tsigKey),
|
|
TTL: DefaultTTL,
|
|
AutoCommit: true,
|
|
}
|
|
|
|
// Per-zone git author overrides. Defaults are applied later.
|
|
var gitAuthorName, gitAuthorEmail string
|
|
|
|
// Rate-limit config (Hamilton M8). Defaults are
|
|
// defaultRateBurst/defaultRatePeriod from ratelimit.go; an explicit
|
|
// `rate-limit <burst> <period-seconds>` directive overrides.
|
|
rateBurst := defaultRateBurst
|
|
ratePeriod := defaultRatePeriod
|
|
rateLimitEnabled := true
|
|
|
|
for c.Next() {
|
|
args := c.RemainingArgs()
|
|
if len(args) < 1 {
|
|
return nil, c.ArgErr()
|
|
}
|
|
for _, z := range args {
|
|
p.Zones = append(p.Zones, plugin.Host(z).NormalizeExact()...)
|
|
}
|
|
|
|
for c.NextBlock() {
|
|
switch c.Val() {
|
|
|
|
case "zones-dir":
|
|
dArgs := c.RemainingArgs()
|
|
if len(dArgs) != 1 {
|
|
return nil, c.ArgErr()
|
|
}
|
|
p.ZonesDir = dArgs[0]
|
|
|
|
case "tsig-key":
|
|
kArgs := c.RemainingArgs()
|
|
if len(kArgs) != 3 {
|
|
return nil, c.Errf("tsig-key requires 3 args (name algorithm secret), got %d", len(kArgs))
|
|
}
|
|
keyName := canonicalKeyName(kArgs[0])
|
|
algo, err := parseTSIGAlgorithm(kArgs[1])
|
|
if err != nil {
|
|
return nil, c.Err(err.Error())
|
|
}
|
|
secret, err := decodeTSIGSecret(kArgs[2])
|
|
if err != nil {
|
|
return nil, c.Errf("tsig-key %q: %v", keyName, err)
|
|
}
|
|
if _, exists := p.TSIGKeys[keyName]; exists {
|
|
return nil, c.Errf("duplicate tsig-key %q", keyName)
|
|
}
|
|
p.TSIGKeys[keyName] = tsigKey{Algorithm: algo, Secret: secret}
|
|
|
|
case "ttl":
|
|
tArgs := c.RemainingArgs()
|
|
if len(tArgs) != 1 {
|
|
return nil, c.ArgErr()
|
|
}
|
|
ttl, err := strconv.ParseUint(tArgs[0], 10, 32)
|
|
if err != nil {
|
|
return nil, c.Errf("ttl must be a non-negative integer: %v", err)
|
|
}
|
|
p.TTL = uint32(ttl)
|
|
|
|
case "auto-commit":
|
|
aArgs := c.RemainingArgs()
|
|
if len(aArgs) != 1 {
|
|
return nil, c.ArgErr()
|
|
}
|
|
switch aArgs[0] {
|
|
case "true", "yes", "on":
|
|
p.AutoCommit = true
|
|
case "false", "no", "off":
|
|
p.AutoCommit = false
|
|
default:
|
|
return nil, c.Errf("auto-commit must be true|false, got %q", aArgs[0])
|
|
}
|
|
|
|
case "git-author":
|
|
gArgs := c.RemainingArgs()
|
|
if len(gArgs) != 2 {
|
|
return nil, c.Errf("git-author requires 2 args (name email), got %d", len(gArgs))
|
|
}
|
|
gitAuthorName = gArgs[0]
|
|
gitAuthorEmail = gArgs[1]
|
|
|
|
case "notify":
|
|
nArgs := c.RemainingArgs()
|
|
if len(nArgs) < 1 {
|
|
return nil, c.Errf("notify requires at least one secondary (host or host:port)")
|
|
}
|
|
p.NotifyTargets = append(p.NotifyTargets, nArgs...)
|
|
|
|
case "rate-limit":
|
|
rArgs := c.RemainingArgs()
|
|
switch len(rArgs) {
|
|
case 1:
|
|
if rArgs[0] == "off" || rArgs[0] == "false" || rArgs[0] == "no" {
|
|
rateLimitEnabled = false
|
|
break
|
|
}
|
|
return nil, c.Errf("rate-limit single-arg form must be 'off'; for limits use 'rate-limit <burst> <period-seconds>'")
|
|
case 2:
|
|
b, err := strconv.ParseUint(rArgs[0], 10, 31)
|
|
if err != nil || b < 1 {
|
|
return nil, c.Errf("rate-limit burst must be positive integer, got %q", rArgs[0])
|
|
}
|
|
pSec, err := strconv.ParseUint(rArgs[1], 10, 31)
|
|
if err != nil || pSec < 1 {
|
|
return nil, c.Errf("rate-limit period must be positive integer seconds, got %q", rArgs[1])
|
|
}
|
|
rateBurst = int(b)
|
|
ratePeriod = time.Duration(pSec) * time.Second
|
|
default:
|
|
return nil, c.Errf("rate-limit takes 'off' OR '<burst> <period-seconds>', got %d args", len(rArgs))
|
|
}
|
|
|
|
default:
|
|
return nil, c.Errf("unknown directive: %s", c.Val())
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(p.Zones) == 0 {
|
|
return nil, c.Err("at least one zone must be specified")
|
|
}
|
|
if p.ZonesDir == "" {
|
|
return nil, c.Err("zones-dir is required")
|
|
}
|
|
|
|
// Construct rate limiter if enabled.
|
|
if rateLimitEnabled {
|
|
p.rateLimit = newRateLimiter(rateBurst, ratePeriod)
|
|
}
|
|
|
|
// Build zoneFile handles for each declared zone.
|
|
p.zones = make(map[string]*zoneFile, len(p.Zones))
|
|
for _, z := range p.Zones {
|
|
// Trailing dot → filename. supported.systems. → supported.systems.zone
|
|
stem := z
|
|
if l := len(stem); l > 0 && stem[l-1] == '.' {
|
|
stem = stem[:l-1]
|
|
}
|
|
path := filepath.Join(p.ZonesDir, stem+".zone")
|
|
zf := openZoneFile(path, z)
|
|
zf.AutoCommit = p.AutoCommit
|
|
if gitAuthorName != "" {
|
|
zf.GitAuthorName = gitAuthorName
|
|
}
|
|
if gitAuthorEmail != "" {
|
|
zf.GitAuthorEmail = gitAuthorEmail
|
|
}
|
|
p.zones[z] = zf
|
|
}
|
|
|
|
return p, nil
|
|
}
|
|
|
|
// validateZoneFiles ensures every configured zone has an accessible
|
|
// AND parseable file on disk at the expected path. Catches both typos
|
|
// (file missing) and corrupt zone content at CoreDNS startup rather
|
|
// than on the first UPDATE — the operator gets an immediate signal
|
|
// instead of discovering the breakage minutes later when ACME fires.
|
|
//
|
|
// Hamilton M4: the previous version only stat()'d the file. A zone
|
|
// with a syntax error sailed through startup, then the first UPDATE
|
|
// returned SERVFAIL with no startup-time signal. We now run the same
|
|
// loadRRs + assertSingleApexSOA path the UPDATE handler uses, so any
|
|
// parse-time or SOA-invariant failure surfaces at startup.
|
|
func (p *RFC2136) validateZoneFiles() error {
|
|
for zone, zf := range p.zones {
|
|
st, err := os.Stat(zf.Path)
|
|
if err != nil {
|
|
return fmt.Errorf("zone %q: file not accessible at %s: %w", zone, zf.Path, err)
|
|
}
|
|
if st.IsDir() {
|
|
return fmt.Errorf("zone %q: %s is a directory, expected a regular file", zone, zf.Path)
|
|
}
|
|
if _, _, err := zf.loadRRs(); err != nil {
|
|
return fmt.Errorf("zone %q at %s: %w", zone, zf.Path, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|