coredns-rfc2136/zonefile.go
Ryan Malloy 89993ca207 L1/L2/L4: cleanup + README operational guide
L1 — Replace hand-rolled atoi/parseUint with strconv.ParseUint wrapped
in mustParseUint. Hamilton's reasoning: the comment "strconv adds
overhead we don't need" is the Lauren-Bug shape — we already validated
the input. Until we hadn't, on a path we couldn't predict. Stdlib's
edge-case coverage is the safer default; the wrapper panics on
malformed input so any future regression surfaces in CI, not as a
silent 0 serial.

L2 — applyUpdate no longer mutates the caller's RR header TTL. miekg/
dns parses the UPDATE message into RRs the caller still owns; silently
rewriting hdr.Ttl was a hygiene smell with no current functional
consequence but a clear documentation issue. Now we dns.Copy() the RR
before any header mutation.

L4 — README expanded with an "Operational constraints" section
documenting the contracts and limits operators should understand
before relying on this in production:
  - Single-process atomicity only (with rsync-race mitigation)
  - Process-global MsgAcceptFunc override
  - No-op UPDATE doesn't bump SOA (with touch-UPDATE workaround)
  - SOA invariants enforced strictly (zero, multi, non-apex SOA all
    refused)
  - Serial counter NNNN=9999 rollover semantics
  - TSIG replay window dependency on miekg/dns default
  - Git commit failure logged at ERROR, not rolled back
  - Per-key rate limit knobs

Every constraint maps to a Hamilton review finding; documenting the
contract in operator-facing prose closes the gap between code and
expectation that the review identified.
2026-05-22 21:33:37 -06:00

512 lines
18 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rfc2136
import (
"context"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/miekg/dns"
)
// gitCommandTimeout bounds the git work done for one commit. commit()
// creates a single context with this timeout and runs both `git add`
// and `git commit` under it, so the two invocations share one 10s
// budget rather than getting 10s each. If the process hangs (NFS stall,
// gpg-sign prompt, broken pre-commit hook waiting on stdin, etc.) we
// must not block the caller — which holds the per-zone mutex — forever.
// 10s is generous for a local-disk repo.
const gitCommandTimeout = 10 * time.Second
// fileSnapshot captures the file-identity fingerprint at the moment we
// last read the zone file. We compare it to the live stat just before
// writing back to detect concurrent modification (rsync push, manual
// edit, `git checkout`). If anything changed the file out from under
// us, we refuse the UPDATE — Caddy will retry, and the next loadRRs
// will see the updated state.
//
// The fingerprint is deliberately small: only the two os.FileInfo
// values below are recorded, not the inode or a content hash.
type fileSnapshot struct {
// mtime is the modification time reported by os.FileInfo.ModTime when
// the file was read.
mtime time.Time
// size is the length in bytes reported by os.FileInfo.Size when the
// file was read.
size int64
}
// matches reports whether info still describes the file this snapshot
// was taken from, judged by size and modification time only. An inode
// comparison would be stricter, but size+mtime covers the failure modes
// we care about (rename-over-original edits change both).
func (s fileSnapshot) matches(info os.FileInfo) bool {
	if s.size != info.Size() {
		return false
	}
	// time.Time.Equal compares instants, so a differing location or
	// monotonic reading does not cause a false mismatch.
	return s.mtime.Equal(info.ModTime())
}
// zoneFile is a file-backed authority for a single DNS zone. Replaces
// the Phase-1.3 in-memory recordStore.
//
// On every UPDATE, the file is read fully into memory as parsed RRs,
// the requested adds/deletes are applied to that slice, the SOA serial
// is bumped (CalVer YYMMDD*10000+NNNN style, see bumpSerial), and the
// file is rewritten via an atomic temp-file rename. CoreDNS's `auto`
// plugin notices the mtime change within its reload interval (~30s)
// and re-serves the zone. HE eventually pulls on its SOA refresh.
//
// Concurrency: per-zone mutex serializes RFC 2136 UPDATEs against
// each other and against the plugin's own reads. It does NOT protect
// against external editors (e.g. a human running an editor while the
// plugin is mid-write); that's the operator's responsibility, and
// the typical mitigation is to do manual edits when no UPDATEs are
// in flight (or just accept the rare race — the worst case is one
// lost manual edit, easily restored from git).
type zoneFile struct {
// mu is the per-zone mutex described above.
// NOTE(review): none of the methods in this file lock it themselves —
// presumably the UPDATE handler takes it around load/apply/write/commit;
// confirm against the caller.
mu sync.Mutex
// Path is the absolute path to the zone file on disk.
Path string
// Origin is the canonical (lowercase, trailing dot) zone apex.
Origin string
// AutoCommit, when true, runs `git add <path> && git commit ...`
// after every successful write. openZoneFile sets it to true (per the
// chosen architecture: every dynamic update should leave a git trail);
// the zero value of the struct leaves it false.
AutoCommit bool
// GitAuthorName and GitAuthorEmail are passed to `git commit`
// via -c user.name and -c user.email so the commits are
// attributable without depending on the system git config.
GitAuthorName string
GitAuthorEmail string
}
// canon converts a DNS name to the store's internal form: fully
// qualified (trailing dot) and lowercase. miekg/dns sometimes hands us
// names without the trailing dot; running every name through here once
// at the boundary keeps all lookups and comparisons consistent.
func canon(name string) string {
	fqdn := dns.Fqdn(name)
	return strings.ToLower(fqdn)
}
// openZoneFile builds a zoneFile handle for the zone at path with the
// given origin (canonicalised via canon). It does NOT touch the disk:
// reading and parsing happen lazily in each operation, so the content
// is always fresh and a stale snapshot is never served.
//
// Defaults applied here: AutoCommit is enabled and the git author
// identity is set to the plugin's own name/email.
func openZoneFile(path, origin string) *zoneFile {
	z := new(zoneFile)
	z.Path = path
	z.Origin = canon(origin)
	z.AutoCommit = true
	z.GitAuthorName = "coredns-rfc2136"
	z.GitAuthorEmail = "rfc2136@coredns"
	return z
}
// loadRRs opens the zone file, fingerprints it, and parses it into an
// RR slice with miekg/dns's zone parser (origin z.Origin, default TTL
// 3600). The parser handles $ORIGIN, $TTL, multi-line SOA and comments.
// NOTE(review): miekg/dns only honours $INCLUDE after
// SetIncludeAllowed(true), which is not called here — confirm whether
// includes are meant to be supported.
//
// Returns (rrs, snapshot, error). The snapshot is taken from the open
// file handle before parsing, so a later checkUnchanged can detect
// concurrent modification. On open/stat failure the snapshot is the
// zero value; on parse or integrity failure the snapshot is still
// returned alongside a nil slice.
//
// Hamilton H4 — strict-parse validation: a single malformed line could
// otherwise produce a partial parse where parser.Err() returns nil but
// some records silently went missing. A zone file that has been
// partially eaten by the parser usually loses its SOA along the way, so
// we enforce a post-parse invariant — exactly one SOA, owned by the
// configured origin — which catches both H4 (silent truncation) and H3
// (multi-SOA or wrong-apex SOA) with a single guard.
func (z *zoneFile) loadRRs() ([]dns.RR, fileSnapshot, error) {
	var noSnap fileSnapshot

	f, err := os.Open(z.Path)
	if err != nil {
		return nil, noSnap, fmt.Errorf("open %s: %w", z.Path, err)
	}
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return nil, noSnap, fmt.Errorf("stat %s: %w", z.Path, err)
	}
	snap := fileSnapshot{mtime: info.ModTime(), size: info.Size()}

	zp := dns.NewZoneParser(f, z.Origin, z.Path)
	zp.SetDefaultTTL(3600)

	var rrs []dns.RR
	for {
		rr, more := zp.Next()
		if !more {
			break
		}
		rrs = append(rrs, rr)
	}

	if parseErr := zp.Err(); parseErr != nil {
		return nil, snap, fmt.Errorf("parse %s: %w", z.Path, parseErr)
	}
	if len(rrs) == 0 {
		return nil, snap, fmt.Errorf("%s: zero RRs parsed", z.Path)
	}

	// H3/H4 invariant: exactly one SOA, anchored at the zone origin.
	// Refuse to operate on a zone whose SOA structure is wrong — any
	// subsequent bumpSerial or write would compound the damage.
	if soaErr := assertSingleApexSOA(rrs, z.Origin); soaErr != nil {
		return nil, snap, fmt.Errorf("zone %s integrity check failed: %w", z.Path, soaErr)
	}
	return rrs, snap, nil
}
// assertSingleApexSOA is the H3+H4 zone-integrity invariant: rrs must
// contain exactly one SOA record and its owner (canonicalised) must
// equal the canonicalised origin. It returns nil when that holds and a
// descriptive error for each of the three failure shapes: no SOA,
// SOA at the wrong owner, or more than one SOA.
func assertSingleApexSOA(rrs []dns.RR, origin string) error {
	apex := canon(origin)

	// Collect the owner name of every SOA in file order.
	var owners []string
	for _, rr := range rrs {
		soa, isSOA := rr.(*dns.SOA)
		if !isSOA {
			continue
		}
		owners = append(owners, soa.Hdr.Name)
	}

	if len(owners) == 0 {
		return fmt.Errorf("no SOA record found (expected one at %q)", apex)
	}
	if len(owners) > 1 {
		return fmt.Errorf("multiple SOA records found (%d): %s", len(owners), strings.Join(owners, ", "))
	}
	if canon(owners[0]) != apex {
		return fmt.Errorf("SOA owner is %q, expected zone apex %q", owners[0], apex)
	}
	return nil
}
// checkUnchanged re-stats z.Path and compares the result against snap.
// It returns nil when mtime and size still match, an error wrapping the
// stat failure if the file cannot be stat'ed, and a "concurrent
// modification" error (showing snapshot/live mtime and size) otherwise.
// On a non-nil result the caller should refuse the UPDATE rather than
// clobber the external change.
func (z *zoneFile) checkUnchanged(snap fileSnapshot) error {
	live, err := os.Stat(z.Path)
	if err != nil {
		return fmt.Errorf("stat %s: %w", z.Path, err)
	}
	if snap.matches(live) {
		return nil
	}

	const stamp = time.RFC3339Nano
	return fmt.Errorf("concurrent modification detected on %s: mtime %s/%s size %d/%d",
		z.Path,
		snap.mtime.Format(stamp), live.ModTime().Format(stamp),
		snap.size, live.Size())
}
// lookupIn returns the RRs in rrs whose owner equals name and whose
// type equals rtype, in their original order (nil if none match). Both
// name and each RR's owner are canonicalised before comparison. Taking
// the slice as an argument rather than holding state lets the caller
// batch several operations against one snapshot of the file.
func lookupIn(rrs []dns.RR, name string, rtype uint16) []dns.RR {
	want := canon(name)

	var hits []dns.RR
	for i := range rrs {
		h := rrs[i].Header()
		if h.Rrtype != rtype || canon(h.Name) != want {
			continue
		}
		hits = append(hits, rrs[i])
	}
	return hits
}
// nameExistsIn reports whether at least one RR in rrs is owned by name.
// The comparison is done on canonical (lowercase, trailing-dot) names.
func nameExistsIn(rrs []dns.RR, name string) bool {
	want := canon(name)
	for i := 0; i < len(rrs); i++ {
		if owner := canon(rrs[i].Header().Name); owner == want {
			return true
		}
	}
	return false
}
// removeRRsetFrom returns a new slice holding every RR of rrs except
// those matching both the (canonicalised) owner name and rtype. The
// input slice's backing array is not written to.
func removeRRsetFrom(rrs []dns.RR, name string, rtype uint16) []dns.RR {
	owner := canon(name)

	// Zero-capacity reslice: the first append allocates fresh storage
	// instead of overwriting rrs in place.
	kept := rrs[:0:0]
	for _, rr := range rrs {
		h := rr.Header()
		inSet := canon(h.Name) == owner && h.Rrtype == rtype
		if !inSet {
			kept = append(kept, rr)
		}
	}
	return kept
}
// removeNameFrom returns a new slice holding every RR of rrs whose
// (canonicalised) owner differs from name, i.e. it drops all records of
// every type at that owner. The input slice's backing array is not
// written to.
func removeNameFrom(rrs []dns.RR, name string) []dns.RR {
	owner := canon(name)

	// Zero-capacity reslice forces append to allocate fresh storage.
	kept := rrs[:0:0]
	for i := range rrs {
		if canon(rrs[i].Header().Name) != owner {
			kept = append(kept, rrs[i])
		}
	}
	return kept
}
// removeRRFrom returns a new slice equal to rrs with the first RR that
// matches target (by owner + type + rdata) removed. If nothing matches,
// the result holds every element of rrs. At most one RR is removed even
// if several would match.
//
// Hamilton M3: per RFC 2136 §2.5.4, a delete-by-exact-match UPDATE
// carries CLASS=NONE and TTL=0 as protocol flags, not as match
// criteria. Both sides are therefore passed through
// normalizeForCompare before dns.IsDuplicate so class and TTL cannot
// cause a false mismatch.
func removeRRFrom(rrs []dns.RR, target dns.RR) []dns.RR {
	want := normalizeForCompare(target)

	// Zero-capacity reslice: appends below never write into rrs.
	kept := rrs[:0:0]
	for i, rr := range rrs {
		if dns.IsDuplicate(normalizeForCompare(rr), want) {
			// First hit: skip it and carry the untouched tail over as-is.
			return append(kept, rrs[i+1:]...)
		}
		kept = append(kept, rr)
	}
	return kept
}
// addRRTo returns rrs with rr appended, unless an equivalent RR is
// already present, in which case rrs is returned unchanged (de-dupe
// semantics per RFC 2136 §3.4.2.2). Equivalence uses the same
// normalisation as removeRRFrom — owner, type and rdata only — so the
// check is TTL- and class-insensitive, while the RR that gets stored
// keeps its original TTL and class.
func addRRTo(rrs []dns.RR, rr dns.RR) []dns.RR {
	candidate := normalizeForCompare(rr)

	present := false
	for i := 0; i < len(rrs) && !present; i++ {
		present = dns.IsDuplicate(normalizeForCompare(rrs[i]), candidate)
	}
	if present {
		return rrs
	}
	return append(rrs, rr)
}
// normalizeForCompare returns a copy of rr (via dns.Copy; the argument
// is not modified) whose header has TTL forced to 0 and class forced to
// IN. With those two fields equalised, dns.IsDuplicate effectively
// compares by (owner, type, rdata) alone, which is what RFC 2136
// §2.5.4's "TTL and CLASS are flags, not match criteria" requires.
func normalizeForCompare(rr dns.RR) dns.RR {
	clone := dns.Copy(rr)
	hdr := clone.Header()
	hdr.Ttl, hdr.Class = 0, dns.ClassINET
	return clone
}
// serialCounterMul is the multiplier between the date prefix and the
// counter in our SOA-serial encoding. The format is YYMMDD*10000 + NNNN
// with NNNN ∈ [0001, 9999], i.e. up to 9999 bumps per encoded day. The
// 2-digit year keeps the value within uint32 (the RFC 1035 ceiling for
// SOA serials) for now: for 2026-05-22, max serial 2,605,229,999 is well
// below 2^32-1=4,294,967,295. A 4-digit year (e.g., 20260522*10000)
// would overflow uint32.
//
// The headroom is finite: the largest encodable date is 421231
// (4,212,319,999); 430101*10000 = 4,301,010,000 already exceeds
// 2^32-1, so encoded dates from 2043-01-01 onward do not fit.
const serialCounterMul = 10000
// bumpSerial advances the serial of the first SOA found in rrs, using
// the CalVer form YYMMDD*10000+NNNN. The SOA is mutated in place; rrs
// itself is not reallocated.
//
// Parameters:
//   - rrs: the zone's records; must contain an SOA.
//   - now: wall-clock time; only its UTC date is used.
//
// Returns an error if rrs has no SOA, if the encoded date cannot be
// parsed for a rollover, or if the serial is already math.MaxUint32
// and no larger value exists.
//
// Behaviour:
//   - If the current serial encodes today (or a future-encoded date from
//     a prior NNNN rollover), increment NNNN. On NNNN=9999, roll forward
//     to the next encoded day with NNNN=0001. The encoded date drifts
//     ahead of wall time during heavy churn and catches back up on quiet
//     days; serial numbers stay strictly monotonic, which is the only
//     DNS hard requirement.
//   - Otherwise (older serial; including legacy YYYYMMDDNN-format serials
//     left over from before this format change), jump to today*10000+1.
//     Legacy serials migrate automatically here: a value like 2026052299
//     (~2.026B) is numerically smaller than today's new-format minimum
//     2605220001 (~2.605B), so it takes this branch and is rewritten.
//     The new value is strictly greater, so AXFR receivers (HE et al.)
//     treat it as a normal forward bump and pull cleanly.
//   - If the value chosen above would not be strictly greater than the
//     current serial, or would not fit in uint32 (encoded dates from
//     430101 onward exceed 2^32-1), fall back to a plain +1. Previously
//     the uint32 conversion truncated silently in that case, which could
//     move the serial backwards.
func bumpSerial(rrs []dns.RR, now time.Time) error {
	var soa *dns.SOA
	for _, rr := range rrs {
		if s, ok := rr.(*dns.SOA); ok {
			soa = s
			break
		}
	}
	if soa == nil {
		return fmt.Errorf("zone has no SOA record")
	}

	today := now.UTC().Format("060102") // YYMMDD
	cur := fmt.Sprintf("%010d", soa.Serial)

	// next is computed in uint64 so an encoding that no longer fits in
	// 32 bits is detected below instead of being truncated.
	var next uint64

	// New-format read: cur[:6] is YYMMDD, cur[6:10] is NNNN. Only
	// honoured when the encoded date is today or later — an older
	// encoded date means a normal new-day reset (or a legacy serial that
	// happens to look like a valid, past YYMMDD prefix; same handling).
	if curDate := cur[:6]; isValidYYMMDD(curDate) && curDate >= today {
		nnnn := mustParseUint(cur[6:10])
		if nnnn < 9999 {
			next = mustParseUint(curDate)*serialCounterMul + nnnn + 1
		} else {
			// NNNN=9999: roll to next encoded day, NNNN=0001.
			d, err := time.Parse("060102", curDate)
			if err != nil {
				return fmt.Errorf("serial date %q unparseable: %w", curDate, err)
			}
			nextDay := d.AddDate(0, 0, 1).Format("060102")
			next = mustParseUint(nextDay)*serialCounterMul + 1
		}
	} else {
		// Older or unparseable: jump to today*10000+1. Migration path
		// for legacy YYYYMMDDNN serials lives here.
		next = mustParseUint(today)*serialCounterMul + 1
	}

	if next > uint64(soa.Serial) && next <= math.MaxUint32 {
		soa.Serial = uint32(next)
		return nil
	}

	// H5 — explicit MaxUint32 guard. The failure mode we must prevent is
	// wrap-to-0: if soa.Serial reached MaxUint32 (hand-edit, fuzz, or a
	// future code path), `soa.Serial++` would wrap to 0, and secondaries
	// may then treat the zone as older and stop transferring it. Loud
	// refusal forces the operator to reset the serial manually instead
	// of silently bricking the zone.
	if soa.Serial == math.MaxUint32 {
		return fmt.Errorf("SOA serial at uint32 max (%d) — refusing to wrap to 0; operator must reset zone serial manually (see RFC 1982 §3.2)", soa.Serial)
	}
	// Defensive monotonic advance: the current serial is already at or
	// beyond the value we would have chosen (e.g. a hand-edit set it to
	// a far-future value), or that value does not fit in uint32.
	soa.Serial++
	return nil
}
// isValidYYMMDD reports whether s is exactly six characters long and
// parses as a calendar date in YYMMDD form (time layout "060102"), so
// the month and day must be valid for that year. Any 2-digit year
// (00-99) is accepted.
func isValidYYMMDD(s string) bool {
	const layout = "060102"
	if len(s) == len(layout) {
		if _, err := time.Parse(layout, s); err == nil {
			return true
		}
	}
	return false
}
// mustParseUint converts a base-10, digits-only string to uint64 using
// strconv.ParseUint and panics if the conversion fails. Callers are
// expected to pass only strings already known to be digit substrings
// (e.g. a fixed-width slice of a YYMMDDNNNN-formatted serial, or the
// output of time.Format). Wrapping strconv keeps the panic explicit
// while relying on the stdlib's parsing (Hamilton L1).
func mustParseUint(s string) uint64 {
	n, err := strconv.ParseUint(s, 10, 64)
	if err == nil {
		return n
	}
	// Reaching this point is a programmer error. Panic so the bug
	// surfaces in tests/CI rather than silently producing a 0 serial.
	panic(fmt.Sprintf("mustParseUint(%q): %v", s, err))
}
// writeAtomic serializes rrs to a temp file in the same directory as
// z.Path, syncs it, then renames it over the destination. POSIX
// guarantees atomic rename on local filesystems, so a partial write can
// never leave a corrupt zone file on disk.
//
// Parameters:
//   - rrs: the records to write, one per line via RR.String().
//   - now: timestamp recorded (in UTC, RFC 3339) in the header comment.
//
// Returns an error, wrapped with the failing step, if the temp file
// cannot be created, chmod'ed, written, synced, closed, or renamed. On
// any failure the temp file is removed (best effort) and z.Path is left
// untouched.
//
// Permissions: os.CreateTemp always creates the file with mode 0600.
// Renaming that over z.Path would silently replace whatever mode the
// operator gave the zone file, so when z.Path already exists its
// permission bits are copied onto the temp file first. If z.Path cannot
// be stat'ed (e.g. it does not exist yet) the 0600 default is kept.
//
// Format: a short header (write timestamp, zone name, $ORIGIN) followed
// by one RR per line. Comments and multi-line SOA formatting from the
// original file are NOT preserved (v1 limitation; sophisticated comment
// preservation can land in v2).
func (z *zoneFile) writeAtomic(rrs []dns.RR, now time.Time) error {
	dir := filepath.Dir(z.Path)
	tmp, err := os.CreateTemp(dir, ".rfc2136-*.zone")
	if err != nil {
		return fmt.Errorf("create temp: %w", err)
	}
	tmpPath := tmp.Name()
	// Best-effort cleanup if we fail before the rename.
	defer func() {
		if tmpPath != "" {
			_ = os.Remove(tmpPath)
		}
	}()

	// Carry the existing file's permission bits over to the replacement.
	if info, statErr := os.Stat(z.Path); statErr == nil {
		if err := tmp.Chmod(info.Mode().Perm()); err != nil {
			_ = tmp.Close()
			return fmt.Errorf("chmod temp: %w", err)
		}
	}

	header := fmt.Sprintf("; Auto-written by coredns-rfc2136 on %s\n; Zone: %s\n$ORIGIN %s\n",
		now.UTC().Format(time.RFC3339), z.Origin, z.Origin)
	if _, err := tmp.WriteString(header); err != nil {
		_ = tmp.Close()
		return fmt.Errorf("write header: %w", err)
	}
	for _, rr := range rrs {
		if _, err := tmp.WriteString(rr.String() + "\n"); err != nil {
			_ = tmp.Close()
			return fmt.Errorf("write rr: %w", err)
		}
	}
	// Flush file contents to stable storage before the rename makes them
	// visible under the real name.
	if err := tmp.Sync(); err != nil {
		_ = tmp.Close()
		return fmt.Errorf("sync: %w", err)
	}
	if err := tmp.Close(); err != nil {
		return fmt.Errorf("close: %w", err)
	}
	if err := os.Rename(tmpPath, z.Path); err != nil {
		return fmt.Errorf("rename %s -> %s: %w", tmpPath, z.Path, err)
	}
	tmpPath = "" // suppress cleanup; rename consumed it
	return nil
}
// commit stages and commits the zone file via git, using message as the
// commit message. It is a no-op returning nil when AutoCommit is false.
// Otherwise it returns an error (including git's trimmed combined
// output) if either `git add` or `git commit` fails; the caller decides
// whether to roll back the file write.
//
// git is run with `-C <dir of z.Path>` and walks upward from there to
// find the .git directory. The author identity is supplied with
// -c user.name / -c user.email on the commit invocation.
//
// Both invocations run under one context with a hard timeout
// (gitCommandTimeout). If git hangs (NFS stall, gpg-sign prompt,
// pre-commit hook waiting on stdin), it is killed rather than blocking
// the caller's per-zone mutex indefinitely. ACME storms must not be
// able to wedge the plugin via git getting stuck.
func (z *zoneFile) commit(message string) error {
	if !z.AutoCommit {
		return nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), gitCommandTimeout)
	defer cancel()

	repoDir := filepath.Dir(z.Path)
	// git runs one `git -C repoDir <args...>` invocation and returns its
	// whitespace-trimmed combined stdout/stderr.
	git := func(args ...string) (string, error) {
		argv := append([]string{"-C", repoDir}, args...)
		out, err := exec.CommandContext(ctx, "git", argv...).CombinedOutput()
		return strings.TrimSpace(string(out)), err
	}

	// `git add` first; if the file is already in the index, no harm done.
	if out, err := git("add", "--", z.Path); err != nil {
		return fmt.Errorf("git add failed: %w: %s", err, out)
	}

	out, err := git(
		"-c", "user.name="+z.GitAuthorName,
		"-c", "user.email="+z.GitAuthorEmail,
		"commit", "-q", "-m", message, "--", z.Path,
	)
	if err != nil {
		return fmt.Errorf("git commit failed: %w: %s", err, out)
	}
	return nil
}