app: wire HA health prober into scheduled tasks

Run the prober on a ticker in scheduledTasks. Enabled by default (10s interval, 5s timeout). No-op when no HA routes exist. Fixes #2129. Fixes #2902.
2026-04-25 17:15:33 +02:00 · 2026-04-15 13:41:30 +00:00
parent 90e65ccd63
commit 8a97dd134b
3 changed files with 69 additions and 0 deletions
--- a/config-example.yaml
+++ b/config-example.yaml
@@ -165,6 +165,31 @@ node:
    # Time before an inactive ephemeral node is deleted.
    inactivity_timeout: 30m

+  # HA subnet router health probing.
+  #
+  # A subnet router can hold its control-plane session open yet be unable to
+  # forward traffic ("zombie connected"). The normal disconnect-based failover
+  # never fires because the Noise session is still alive.
+  #
+  # When HA routes exist (2+ nodes advertising the same prefix), headscale
+  # pings each HA node every probe_interval via the Noise channel. If a node
+  # fails to respond within probe_timeout, it is marked unhealthy and the
+  # primary role moves to the next healthy node. A node that later responds
+  # is marked healthy again but does NOT reclaim primary (avoids flapping).
+  #
+  # Worst-case detection time is probe_interval + probe_timeout (15s default).
+  # No-op when no HA routes exist. Set probe_interval to 0 to disable.
+  # When probing is enabled, probe_timeout must be less than probe_interval.
+  routes:
+    ha:
+      # How often to ping HA subnet routers. Set to 0 to disable probing.
+      # Must be >= 2s when enabled.
+      probe_interval: 10s
+
+      # How long to wait for a ping response before marking a node unhealthy.
+      # Must be >= 1s and less than probe_interval.
+      probe_timeout: 5s
+
 database:
  # Database type. Available options: sqlite, postgres
  # Please note that using Postgres is highly discouraged as it is only supported for legacy reasons.