M8: per-key UPDATE rate limiting (token bucket)

Hamilton M8: a compromised TSIG key — or a misconfigured client
retrying forever — must not be able to drive unbounded UPDATE traffic.
Each UPDATE costs disk IOPS, a git commit, and a slot in the SOA
serial counter (now 9999/day per zone). Without a cap, a few hours of
runaway traffic could exhaust the SOA serial counter and brick the
zone for the day.

Implementation: per-key token bucket in ratelimit.go. Default 100
tokens / 60 seconds. New keys start full so legitimate clients see no
delay at boot. Refill is continuous, capped at the burst value.

Configurable in Corefile:
  rate-limit off                    # disable entirely ("false" and "no" are also accepted)
  rate-limit <burst> <period-secs>  # e.g., rate-limit 200 60

Enforcement runs in ServeDNS after TSIG verification, so a request that
fails auth never consumes a token — and a forged TSIG can't be used to
deny service to a real key holder, because unauthenticated requests
never reach the rate check.

100/min is well above ACME's needs: a worst-case full-renewal storm
across our ~84 zones emits maybe 200 UPDATEs total over several
minutes. Anything beyond is suspicious by definition.

New tests covering: first-call allowed, burst exhaustion, refill
behavior, per-key isolation, refill-cap (no idle-accumulation
overflow).
This commit is contained in:
Ryan Malloy 2026-05-22 21:31:17 -06:00
parent 6ab2b6af6d
commit 8d1477350a
4 changed files with 222 additions and 0 deletions

View File

@ -19,6 +19,8 @@ package rfc2136
import (
"context"
"strings"
"time"
"github.com/coredns/coredns/plugin"
"github.com/miekg/dns"
@ -62,6 +64,11 @@ type RFC2136 struct {
// zones holds per-zone file handlers, keyed by canonical zone name.
// Populated in setup; mutexes live inside each zoneFile.
zones map[string]*zoneFile
// rateLimit caps UPDATE traffic per TSIG key (Hamilton M8). nil
// disables rate limiting (test mode, or insecure deployments).
// Populated in setup() once TSIG keys are known.
rateLimit *rateLimiter
}
// Name implements plugin.Handler.
@ -88,6 +95,19 @@ func (p *RFC2136) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
_ = w.WriteMsg(resp)
return dns.RcodeRefused, nil
}
// Hamilton M8: per-key rate limit. TSIG just authenticates the
// sender — it doesn't prove the sender's behavior is sane. A
// compromised key or a runaway client must not be able to
// exhaust disk/git/serial-counter resources.
if p.rateLimit != nil {
if tsig := r.IsTsig(); tsig != nil && !p.rateLimit.allow(strings.ToLower(tsig.Hdr.Name), time.Now()) {
log.Warningf("UPDATE rate-limited for key %q", tsig.Hdr.Name)
resp := new(dns.Msg)
resp.SetRcode(r, dns.RcodeRefused)
_ = w.WriteMsg(resp)
return dns.RcodeRefused, nil
}
}
return p.handleUpdate(w, r, true)
}
return plugin.NextOrFailure(p.Name(), p.Next, ctx, w, r)

82
ratelimit.go Normal file
View File

@ -0,0 +1,82 @@
package rfc2136
import (
"sync"
"time"
)
// Per-key token-bucket rate limiting for UPDATE traffic (Hamilton M8).
//
// A compromised TSIG key — or a misconfigured client retrying forever —
// must not be able to drive unbounded UPDATE traffic. Each UPDATE costs
// disk IOPS, a git commit, and a slot in the SOA serial counter
// (9999/day per zone). 100 UPDATEs/minute per key is well above any
// legitimate ACME workflow (a full renewal storm across our ~84 zones
// might emit ~200 UPDATEs total over several minutes); anything beyond
// is suspicious.
const (
	defaultRateBurst  = 100         // bucket capacity, in tokens
	defaultRatePeriod = time.Minute // time for an empty bucket to refill completely
)

// rateLimiter is a goroutine-safe collection of token buckets, one per
// key. The zero value is unusable (its bucket map is nil); construct it
// via newRateLimiter.
type rateLimiter struct {
	mu      sync.Mutex
	buckets map[string]*bucket // guarded by mu
	burst   float64            // bucket capacity, in tokens
	period  time.Duration      // time to refill from empty to burst
}

// bucket is the state tracked for a single key.
type bucket struct {
	tokens     float64   // tokens currently available; may be fractional
	lastRefill time.Time // latest timestamp for which tokens were credited
}

// newRateLimiter returns a limiter whose buckets hold at most burst
// tokens and refill from empty to full over period. A non-positive
// burst or period falls back to defaultRateBurst / defaultRatePeriod.
func newRateLimiter(burst int, period time.Duration) *rateLimiter {
	if burst <= 0 {
		burst = defaultRateBurst
	}
	if period <= 0 {
		period = defaultRatePeriod
	}
	return &rateLimiter{
		buckets: make(map[string]*bucket),
		burst:   float64(burst),
		period:  period,
	}
}

// allow tries to take one token from the bucket for key at time now.
// It returns true when a token was available and has been consumed, and
// false when the bucket is empty.
//
// A key seen for the first time gets a full bucket, so legitimate
// clients see no refill delay at boot. Refill is continuous: tokens
// accrue at burst/period per second and are capped at burst.
//
// Timestamps that do not advance past the bucket's last refill earn
// nothing and leave the refill mark untouched. Callers sample the clock
// before taking the lock, so two concurrent requests can arrive here
// out of order; applying a negative elapsed time would debit tokens and
// could deny a request that still had budget.
func (rl *rateLimiter) allow(key string, now time.Time) bool {
	rl.mu.Lock()
	defer rl.mu.Unlock()

	b, ok := rl.buckets[key]
	if !ok {
		b = &bucket{tokens: rl.burst, lastRefill: now}
		rl.buckets[key] = b
	}

	// Credit tokens only when time has moved forward since the last
	// refill; never rewind lastRefill.
	if elapsed := now.Sub(b.lastRefill); elapsed > 0 {
		b.tokens += elapsed.Seconds() * (rl.burst / rl.period.Seconds())
		if b.tokens > rl.burst {
			b.tokens = rl.burst
		}
		b.lastRefill = now
	}

	if b.tokens < 1 {
		return false
	}
	b.tokens--
	return true
}

83
ratelimit_test.go Normal file
View File

@ -0,0 +1,83 @@
package rfc2136
import (
"testing"
"time"
)
// TestRateLimiter_FirstCallAllowed verifies that the very first request
// for a key the limiter has never seen is granted.
func TestRateLimiter_FirstCallAllowed(t *testing.T) {
	limiter := newRateLimiter(5, time.Minute)
	if allowed := limiter.allow("key-a", time.Now()); !allowed {
		t.Errorf("first call for new key must be allowed")
	}
}
// TestRateLimiter_BurstExhausts verifies that exactly burst requests
// pass at a single instant and the next one is refused.
func TestRateLimiter_BurstExhausts(t *testing.T) {
	const burst = 3
	limiter := newRateLimiter(burst, time.Minute)
	at := time.Now()

	// Every request up to the burst size must go through.
	for attempt := 1; attempt <= burst; attempt++ {
		if limiter.allow("key-a", at) {
			continue
		}
		t.Fatalf("call %d should be allowed (burst=3)", attempt)
	}

	// Same instant, so nothing has been refilled: the bucket is empty.
	if limiter.allow("key-a", at) {
		t.Errorf("4th call exceeded burst; should be denied")
	}
}
// TestRateLimiter_RefillsOverTime verifies that an exhausted bucket
// grants requests again once enough time has passed.
func TestRateLimiter_RefillsOverTime(t *testing.T) {
	// Two tokens refilled over one second, i.e. 2 tokens/sec.
	limiter := newRateLimiter(2, time.Second)
	start := time.Now()

	// Spend the whole burst at a single instant.
	for _, label := range []string{"call 1", "call 2"} {
		if !limiter.allow("k", start) {
			t.Fatal(label)
		}
	}
	if limiter.allow("k", start) {
		t.Fatal("call 3 should be denied; bucket empty")
	}

	// Half a second at 2 tokens/sec earns one token back.
	halfSecondLater := start.Add(500 * time.Millisecond)
	if !limiter.allow("k", halfSecondLater) {
		t.Errorf("expected refill after 500ms")
	}
}
// TestRateLimiter_PerKeyIsolation verifies that draining one key's
// bucket has no effect on a different key.
func TestRateLimiter_PerKeyIsolation(t *testing.T) {
	limiter := newRateLimiter(2, time.Minute)
	at := time.Now()

	// Drain key-a (burst is 2), then confirm it really is empty.
	for spent := 0; spent < 2; spent++ {
		_ = limiter.allow("key-a", at)
	}
	if limiter.allow("key-a", at) {
		t.Fatal("key-a still has tokens; setup wrong")
	}

	// key-b has never been used, so its bucket must be untouched.
	if ok := limiter.allow("key-b", at); !ok {
		t.Errorf("key-b was rate-limited despite no prior use")
	}
}
// TestRateLimiter_DoesNotOverflow checks that a long idle period cannot
// bank more than burst tokens. Without the cap, an attacker could wait
// and then burst well past the configured limit.
func TestRateLimiter_DoesNotOverflow(t *testing.T) {
	const burst = 5
	limiter := newRateLimiter(burst, time.Second)
	start := time.Now()

	// Touch the key once so its bucket exists before the idle gap.
	limiter.allow("k", start)

	// One hour later the bucket should hold exactly burst tokens.
	afterIdle := start.Add(time.Hour)
	for call := 1; call <= burst; call++ {
		if limiter.allow("k", afterIdle) {
			continue
		}
		t.Fatalf("post-idle call %d should be allowed (cap=5)", call)
	}
	if limiter.allow("k", afterIdle) {
		t.Errorf("post-idle call 6 should be denied; cap exceeded")
	}
}

View File

@ -6,6 +6,7 @@ import (
"os"
"path/filepath"
"strconv"
"time"
"github.com/coredns/caddy"
"github.com/coredns/coredns/core/dnsserver"
@ -164,6 +165,13 @@ func parse(c *caddy.Controller) (*RFC2136, error) {
// Per-zone git author overrides. Defaults are applied later.
var gitAuthorName, gitAuthorEmail string
// Rate-limit config (Hamilton M8). Defaults are
// defaultRateBurst/defaultRatePeriod from ratelimit.go; an explicit
// `rate-limit <burst> <period-seconds>` directive overrides.
rateBurst := defaultRateBurst
ratePeriod := defaultRatePeriod
rateLimitEnabled := true
for c.Next() {
args := c.RemainingArgs()
if len(args) < 1 {
@ -235,6 +243,30 @@ func parse(c *caddy.Controller) (*RFC2136, error) {
gitAuthorName = gArgs[0]
gitAuthorEmail = gArgs[1]
case "rate-limit":
rArgs := c.RemainingArgs()
switch len(rArgs) {
case 1:
if rArgs[0] == "off" || rArgs[0] == "false" || rArgs[0] == "no" {
rateLimitEnabled = false
break
}
return nil, c.Errf("rate-limit single-arg form must be 'off'; for limits use 'rate-limit <burst> <period-seconds>'")
case 2:
b, err := strconv.ParseUint(rArgs[0], 10, 31)
if err != nil || b < 1 {
return nil, c.Errf("rate-limit burst must be positive integer, got %q", rArgs[0])
}
pSec, err := strconv.ParseUint(rArgs[1], 10, 31)
if err != nil || pSec < 1 {
return nil, c.Errf("rate-limit period must be positive integer seconds, got %q", rArgs[1])
}
rateBurst = int(b)
ratePeriod = time.Duration(pSec) * time.Second
default:
return nil, c.Errf("rate-limit takes 'off' OR '<burst> <period-seconds>', got %d args", len(rArgs))
}
default:
return nil, c.Errf("unknown directive: %s", c.Val())
}
@ -248,6 +280,11 @@ func parse(c *caddy.Controller) (*RFC2136, error) {
return nil, c.Err("zones-dir is required")
}
// Construct rate limiter if enabled.
if rateLimitEnabled {
p.rateLimit = newRateLimiter(rateBurst, ratePeriod)
}
// Build zoneFile handles for each declared zone.
p.zones = make(map[string]*zoneFile, len(p.Zones))
for _, z := range p.Zones {