app: wire HA health prober into scheduled tasks

Run the prober on a ticker in scheduledTasks. Enabled by default (10s interval, 5s timeout). No-op when no HA routes exist.

Fixes #2129
Fixes #2902
2026-04-25 17:15:33 +02:00 · 2026-04-15 13:41:30 +00:00
parent 90e65ccd63
commit 8a97dd134b
3 changed files with 69 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -60,8 +60,19 @@ internet is a security-sensitive choice. `autogroup:danger-all` can only be used
 - **CLI**: `headscale nodes register` is deprecated in favour of `headscale auth register --auth-id <id> --user <user>` [#1850](https://github.com/juanfont/headscale/pull/1850)
  - The old command continues to work but will be removed in a future release

+### HA subnet router health probing
+
+Headscale now actively probes HA subnet routers to detect nodes that are connected but not
+forwarding traffic. The control plane periodically pings HA subnet routers via the Noise
+control channel and fails over to a healthy standby if the primary stops responding. This is
+enabled by default (`node.routes.ha.probe_interval: 10s`, `node.routes.ha.probe_timeout: 5s`) and only
+active when HA routes exist (2+ nodes advertising the same prefix). Set `probe_interval` to
+`0` to disable. This complements the existing disconnect-based failover, catching "zombie
+connected" routers that maintain their control session but cannot route packets.
+
 ### Changes

+- **Debug endpoints**: Add node connectivity ping page for verifying control-plane reachability [#3183](https://github.com/juanfont/headscale/pull/3183)
 - **OIDC registration**: Add a confirmation page before completing node registration, showing the device hostname and machine key fingerprint [#3180](https://github.com/juanfont/headscale/pull/3180)
 - **Debug endpoints**: Omit secret fields (`Pass`, `ClientSecret`, `APIKey`) from `/debug/config` JSON output [#3180](https://github.com/juanfont/headscale/pull/3180)
 - **Debug endpoints**: Route `statsviz` through `tsweb.Protected` [#3180](https://github.com/juanfont/headscale/pull/3180)
--- a/config-example.yaml
+++ b/config-example.yaml
@@ -165,6 +165,31 @@ node:
    # Time before an inactive ephemeral node is deleted.
    inactivity_timeout: 30m

+  # HA subnet router health probing.
+  #
+  # A subnet router can hold its control-plane session open yet be unable to
+  # forward traffic ("zombie connected"). The normal disconnect-based failover
+  # never fires because the Noise session is still alive.
+  #
+  # When HA routes exist (2+ nodes advertising the same prefix), headscale
+  # pings each HA node every probe_interval via the Noise channel. If a node
+  # fails to respond within probe_timeout it is marked unhealthy and the
+  # primary role moves to the next healthy node. A node that later responds
+  # is marked healthy again but does NOT reclaim primary (avoids flapping).
+  #
+  # Worst-case detection time is probe_interval + probe_timeout (15s default).
+  # No-op when no HA routes exist. Set probe_interval to 0 to disable.
+  # probe_timeout must be less than probe_interval.
+  routes:
+    ha:
+      # How often to ping HA subnet routers. Set to 0 to disable probing.
+      # Must be >= 2s when enabled.
+      probe_interval: 10s
+
+      # How long to wait for a ping response before marking a node unhealthy.
+      # Must be >= 1s and less than probe_interval.
+      probe_timeout: 5s
+
 database:
  # Database type. Available options: sqlite, postgres
  # Please note that using Postgres is highly discouraged as it is only supported for legacy reasons.
--- a/hscontrol/app.go
+++ b/hscontrol/app.go
@@ -276,6 +276,31 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
 		extraRecordsUpdate = make(chan []tailcfg.DNSRecord)
 	}

+	var (
+		haProber     *state.HAHealthProber
+		haHealthChan <-chan time.Time
+	)
+	if h.cfg.Node.Routes.HA.ProbeInterval > 0 {
+		haProber = state.NewHAHealthProber(
+			h.state,
+			h.cfg.Node.Routes.HA,
+			h.cfg.ServerURL,
+			h.mapBatcher.IsConnected,
+		)
+
+		haTicker := time.NewTicker(h.cfg.Node.Routes.HA.ProbeInterval)
+		defer haTicker.Stop()
+
+		haHealthChan = haTicker.C
+
+		log.Info().
+			Dur("interval", h.cfg.Node.Routes.HA.ProbeInterval).
+			Dur("timeout", h.cfg.Node.Routes.HA.ProbeTimeout).
+			Msg("HA subnet router health probing enabled")
+	} else {
+		haHealthChan = make(<-chan time.Time)
+	}
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -332,6 +357,9 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
 			h.cfg.TailcfgDNSConfig.ExtraRecords = records

 			h.Change(change.ExtraRecords())
+
+		case <-haHealthChan:
+			haProber.ProbeOnce(ctx, h.Change)
 		}
 	}
 }
@@ -1113,6 +1141,11 @@ func (h *Headscale) StartBatcherForTest(tb testing.TB) {
 	tb.Cleanup(func() { h.mapBatcher.Close() })
 }

+// MapBatcher returns the map response batcher held by h. It is exposed for use in tests.
+func (h *Headscale) MapBatcher() *mapper.Batcher {
+	return h.mapBatcher
+}
+
 // StartEphemeralGCForTest starts the ephemeral node garbage collector.
 // It registers a cleanup function on tb to stop the collector.
 // It panics when called outside of tests.