From d4a5ce9f8262d5fc031cf23e89f8953af76a4178 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Mon, 18 May 2026 16:57:54 -0600 Subject: [PATCH] coredns: script-based NOTIFY to ns1.he.net on every prep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hurricane Electric requires asymmetric transfer config: - AXFR pull from 216.218.133.2 (slave.dns.he.net / ns4.he.net) - NOTIFY destination 216.218.130.2 (ns1.he.net) CoreDNS's transfer plugin uses a single bidirectional `to` list for both, which is fine in principle but in practice trips a confirmed bug: any `to` with more than one specific IPv4 address silently kills server-block listener startup (no error, zones load, but :53 never binds). Reproduced on 1.11.3 + 1.12.2 even with a minimal fresh `docker run`. Workaround: - Corefile keeps `transfer { to * }` (open AXFR; firewall does the real source-IP filtering on TCP/53) - scripts/notify-he.py crafts and sends NOTIFY messages directly to 216.218.130.2 (only). Pure-stdlib Python — no dependencies. - Makefile `prep` target runs notify-he.py after prepare-zones.sh so every zone-bump fires NOTIFY automatically. Verified end-to-end: HE acks NOTIFY (rcode=0) for the 10 zones it hosts as secondaries; the remaining 81 return REFUSED (rcode=5) because HE doesn't have them configured yet. Note: HE's free slave service acks NOTIFY but only actually re-pulls AXFR on its hourly poll cycle (observed behavior — they're poll-based by design). NOTIFY is still useful long-term in case HE changes that behavior; harmless either way. --- Corefile | 30 +++++----- Makefile | 3 +- scripts/notify-he.py | 135 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 18 deletions(-) create mode 100644 scripts/notify-he.py diff --git a/Corefile b/Corefile index 1a8ae67..406eda5 100644 --- a/Corefile +++ b/Corefile @@ -1,29 +1,25 @@ # Shared zone-loading + recursive-forwarding config. 
-# CoreDNS snippets are textually expanded by `import`, so we keep anything -# that's not transport-specific (TLS) in here. (common) { auto { directory /zones (.*)\.zone {1} reload 30s } - # AXFR authorization is `to *` at this layer, with HE-only filtering - # done by the FortiWiFi firewall (source IP restriction on the - # TCP/53 DNAT rule). Reasons we don't filter at CoreDNS: + # AXFR is open to everyone here. The FortiWiFi firewall does the + # real source-IP filtering (only 216.218.133.2 / slave.dns.he.net + # can reach our public :53/tcp). # - # 1. CoreDNS plugin quirk: `to ` (any form — single, - # multi-line, space-separated) silently fails to start server - # blocks. Reproduced on 1.11.3 and 1.12.2. Only `to *` works. - # 2. Docker port publishing with userland-proxy rewrites source - # IPs to the bridge gateway, so IP filtering wouldn't see HE's - # real address anyway (without network_mode: host). - # 3. Filtering at the perimeter (FortiWiFi) is correct-layered - # defense: bad packets don't reach the host at all. + # Why not narrow the `to` list to HE's IPs? CoreDNS's transfer + # plugin has a confirmed bug: any `to` with more than one specific + # IPv4 address silently breaks listener startup (no error logged, + # zones load, but .:53 / tls://.:853 / https://.:443 never bind). + # Reproduced in 1.11.3 and 1.12.2, even in a minimal fresh + # `docker run` — not a compose state issue. Single-IP works, but + # we need asymmetric config (AXFR from .133.2, NOTIFY to .130.2) + # which the single-line `to` directive can't express. # - # Required FortiWiFi rule: - # VIP "coredns-tcp" — src in {216.218.130.2, 216.218.131.2, - # 216.218.132.2, 216.218.133.2, 216.66.1.2} — - # dst WAN:53/tcp → 172.16.1.15:5353/tcp + # NOTIFY is sent externally by scripts/notify-he.py (invoked from + # `make prep`) so we can target ns1.he.net specifically. 
transfer { to * } diff --git a/Makefile b/Makefile index def0086..0fb98bc 100644 --- a/Makefile +++ b/Makefile @@ -12,8 +12,9 @@ export help: ## Show this help @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " \033[36m%-14s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) -prep: ## Re-inject SOA records into all zones (writes zones-prepared/) +prep: ## Re-inject SOA + bump serial, then NOTIFY HE (auto-fires AXFR) @./scripts/prepare-zones.sh + @./scripts/notify-he.py --quiet || echo " (NOTIFY had failures; HE will still re-poll on SOA refresh)" certs: ## Generate self-signed dev cert (only useful if not using Caddy ACME) @./scripts/generate-certs.sh diff --git a/scripts/notify-he.py b/scripts/notify-he.py new file mode 100644 index 0000000..cb39b79 --- /dev/null +++ b/scripts/notify-he.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Send DNS NOTIFY messages (RFC 1996) to Hurricane Electric's secondary +nameservers, telling them to re-poll our zones immediately rather than +waiting for the next SOA-refresh cycle (up to 1 hour). + +This replicates what CoreDNS's `transfer { to ADDRESS }` directive would do +natively, but as an external script because that directive silently +breaks server-block startup on CoreDNS 1.11.3 + 1.12.2 in our config. + +Called automatically from `make prep`. No dependencies beyond Python 3 +stdlib — we craft the 12-byte DNS header + question section by hand. + +NOTIFY semantics: + - QR=0 (query), Opcode=4 (NOTIFY), AA=1 (we're authoritative) + - QDCOUNT=1, question = ZONE SOA IN + - Slave responds with NOERROR + similar header, then issues AXFR/SOA + queries to see if the zone has actually changed. 
# Script body of scripts/notify-he.py (everything after the module docstring),
# restored to normal Python layout: the diff "+" markers are dropped and the
# indentation lost when the patch text was collapsed is put back.
from __future__ import annotations

import os
import random
import socket
import struct
import sys
from pathlib import Path

# NOTIFY targets. Only ns1.he.net is listed: it is the NOTIFY-accepting
# endpoint, and HE's slave cluster replicates internally, so one NOTIFY
# here wakes the whole pool.
HE_NAMESERVERS = [
    "216.218.130.2",  # ns1.he.net
]

DNS_PORT = 53
TIMEOUT_SECONDS = 5

# DNS wire-format constants (RFC 1035 section 4.1, RFC 1996).
_OPCODE_NOTIFY = 4
_FLAG_QR = 1 << 15          # set in responses, clear in queries
_FLAG_AA = 1 << 10          # authoritative answer
_QTYPE_SOA = 6
_QCLASS_IN = 1
_MAX_LABEL_OCTETS = 63
_MAX_NAME_OCTETS = 255      # whole encoded name, including length octets
_HEADER = struct.Struct(">HHHHHH")  # ID, flags, QD/AN/NS/AR counts = 12 bytes


def encode_name(name: str) -> bytes:
    """Encode a domain name as length-prefixed labels plus a null terminator.

    Trailing dots are ignored, so ``"example.com"`` and ``"example.com."``
    encode identically. The root name (``"."`` or ``""``) encodes to the
    single byte ``b"\\x00"``.

    Raises:
        ValueError: if a label is empty (``"a..b"``), a label exceeds 63
            octets, the encoded name exceeds 255 octets, or the name holds
            non-ASCII characters (``UnicodeEncodeError`` is a ``ValueError``
            subclass).
    """
    stripped = name.rstrip(".")
    if not stripped:
        # Root zone: "".split(".") would yield one empty label and produce
        # two null bytes, which is not a valid encoded name.
        return b"\x00"

    encoded_labels = []
    for label in stripped.split("."):
        if not label:
            # A zero-length label in the middle would terminate the name
            # early on the wire and leave trailing garbage in the packet.
            raise ValueError(f"empty DNS label in name: {name!r}")
        if len(label) > _MAX_LABEL_OCTETS:
            raise ValueError(f"DNS label too long: {label}")
        raw = label.encode("ascii")
        encoded_labels.append(bytes([len(raw)]) + raw)

    wire = b"".join(encoded_labels) + b"\x00"
    if len(wire) > _MAX_NAME_OCTETS:
        raise ValueError(f"DNS name too long: {name}")
    return wire


def build_notify(zone: str) -> bytes:
    """Build a DNS NOTIFY message for the given zone.

    The message is a 12-byte header followed by one question
    (zone name, type SOA, class IN). The transaction ID is random.

    Raises:
        ValueError: propagated from ``encode_name`` for an invalid zone name.
    """
    txid = random.randint(0, 0xFFFF)
    # Flag bits, MSB first: QR(1) Opcode(4) AA TC RD RA Z(3) RCODE(4).
    # QR=0, Opcode=4 (NOTIFY), AA=1, everything else 0  ->  0x2400.
    flags = (_OPCODE_NOTIFY << 11) | _FLAG_AA
    header = _HEADER.pack(
        txid,
        flags,
        1,  # QDCOUNT
        0,  # ANCOUNT
        0,  # NSCOUNT
        0,  # ARCOUNT
    )
    question = encode_name(zone) + struct.pack(">HH", _QTYPE_SOA, _QCLASS_IN)
    return header + question


def _check_notify_response(query: bytes, data: bytes) -> tuple[bool, str]:
    """Classify a reply datagram against the NOTIFY query that was sent."""
    if len(data) < _HEADER.size:
        return False, "short response"
    if data[:2] != query[:2]:
        # Not an answer to the packet we just sent (stale or spoofed).
        return False, "txid mismatch"
    (rflags,) = struct.unpack_from(">H", data, 2)
    if not rflags & _FLAG_QR:
        return False, "not a response"
    opcode = (rflags >> 11) & 0xF
    rcode = rflags & 0xF
    if opcode != _OPCODE_NOTIFY:
        return False, f"opcode={opcode}"
    if rcode != 0:
        return False, f"rcode={rcode}"
    return True, "ack"


def send_notify(zone: str, server: str) -> tuple[bool, str]:
    """Send NOTIFY for zone to server over UDP and wait for the reply.

    Returns ``(ok, status_str)``. ``ok`` is True only when the reply carries
    the same transaction ID as the query, has QR=1, opcode NOTIFY and
    rcode 0. Invalid zone names, timeouts and socket errors are reported as
    ``(False, reason)`` rather than raised, so one bad zone cannot abort a
    whole run.
    """
    try:
        pkt = build_notify(zone)
    except ValueError as e:
        return False, f"bad zone name: {e}"

    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.settimeout(TIMEOUT_SECONDS)
            # connect() on a UDP socket makes the kernel discard datagrams
            # from any other address, so only the target can answer.
            s.connect((server, DNS_PORT))
            s.send(pkt)
            data = s.recv(512)
    except socket.timeout:
        # Must precede OSError: socket.timeout is an OSError subclass.
        return False, "timeout"
    except OSError as e:
        return False, f"err: {e}"

    return _check_notify_response(pkt, data)


def discover_zones(prepared_dir: Path) -> list[str]:
    """Return sorted zone names from prepared zone filenames (foo.zone -> foo).

    Only regular files (or symlinks to them) are considered; a directory
    that happens to end in ``.zone`` is skipped.
    """
    return sorted(f.stem for f in prepared_dir.glob("*.zone") if f.is_file())


def main() -> int:
    """Send NOTIFY for every prepared zone to every HE nameserver.

    The zone directory comes from the ``DST_DIR`` environment variable
    (default ``zones-prepared``). Passing ``--quiet`` suppresses per-zone
    lines; the summary line is always printed.

    Returns:
        0 if every NOTIFY was acknowledged, 2 if any failed, 1 if the
        prepared directory is missing or contains no zones.
    """
    prepared = Path(os.environ.get("DST_DIR", "zones-prepared"))
    if not prepared.is_dir():
        print(f"ERROR: prepared dir {prepared} not found", file=sys.stderr)
        return 1

    zones = discover_zones(prepared)
    if not zones:
        print(f"ERROR: no zones in {prepared}", file=sys.stderr)
        return 1

    quiet = "--quiet" in sys.argv
    successes = failures = 0
    for zone in zones:
        zone_oks = []
        for ns in HE_NAMESERVERS:
            ok, status = send_notify(zone, ns)
            if ok:
                zone_oks.append(ns)
                successes += 1
                continue
            failures += 1
            if not quiet:
                print(f" ✗ {zone:35s} → {ns:15s} {status}")
        if zone_oks and not quiet:
            print(f" ✓ {zone:35s} → {len(zone_oks)}/{len(HE_NAMESERVERS)} HE ns")

    print(
        f"NOTIFY summary: {successes} acks, {failures} fails "
        f"across {len(zones)} zones × {len(HE_NAMESERVERS)} nameservers"
    )
    return 0 if failures == 0 else 2


if __name__ == "__main__":
    sys.exit(main())