app: wire HA health prober into scheduled tasks

Run the prober on a ticker in scheduledTasks. Enabled by default
(10s interval, 5s timeout). No-op when no HA routes exist.

Fixes #2129
Fixes #2902
This commit is contained in:
Kristoffer Dalby
2026-04-15 13:41:30 +00:00
parent 90e65ccd63
commit 8a97dd134b
3 changed files with 69 additions and 0 deletions

View File

@@ -60,8 +60,19 @@ internet is a security-sensitive choice. `autogroup:danger-all` can only be used
- **CLI**: `headscale nodes register` is deprecated in favour of `headscale auth register --auth-id <id> --user <user>` [#1850](https://github.com/juanfont/headscale/pull/1850)
- The old command continues to work but will be removed in a future release
### HA subnet router health probing
Headscale now actively probes HA subnet routers to detect nodes that are connected but not
forwarding traffic. The control plane periodically pings HA subnet routers via the Noise
control channel and fails over to a healthy standby if the primary stops responding. This is
enabled by default (`node.routes.ha.probe_interval: 10s`, `node.routes.ha.probe_timeout: 5s`) and only
active when HA routes exist (2+ nodes advertising the same prefix). Set `probe_interval` to
`0` to disable. This complements the existing disconnect-based failover, catching "zombie
connected" routers that maintain their control session but cannot route packets.
### Changes
- **Debug endpoints**: Add node connectivity ping page for verifying control-plane reachability [#3183](https://github.com/juanfont/headscale/pull/3183)
- **OIDC registration**: Add a confirmation page before completing node registration, showing the device hostname and machine key fingerprint [#3180](https://github.com/juanfont/headscale/pull/3180)
- **Debug endpoints**: Omit secret fields (`Pass`, `ClientSecret`, `APIKey`) from `/debug/config` JSON output [#3180](https://github.com/juanfont/headscale/pull/3180)
- **Debug endpoints**: Route `statsviz` through `tsweb.Protected` [#3180](https://github.com/juanfont/headscale/pull/3180)

View File

@@ -165,6 +165,31 @@ node:
# Time before an inactive ephemeral node is deleted.
inactivity_timeout: 30m
# HA subnet router health probing.
#
# A subnet router can hold its control-plane session open yet be unable to
# forward traffic ("zombie connected"). The normal disconnect-based failover
# never fires because the Noise session is still alive.
#
# When HA routes exist (2+ nodes advertising the same prefix), headscale
# pings each HA node every probe_interval via the Noise control channel. If a node
# fails to respond within probe_timeout it is marked unhealthy and the
# primary role moves to the next healthy node. A node that later responds
# is marked healthy again but does NOT reclaim primary (avoids flapping).
#
# Worst-case detection time is probe_interval + probe_timeout (15s default).
# No-op when no HA routes exist. Set probe_interval to 0 to disable.
# probe_timeout must be less than probe_interval.
routes:
ha:
# How often to ping HA subnet routers. Set to 0 to disable probing.
# Must be >= 2s when enabled.
probe_interval: 10s
# How long to wait for a ping response before marking a node unhealthy.
# Must be >= 1s and less than probe_interval.
probe_timeout: 5s
database:
# Database type. Available options: sqlite, postgres
# Please note that using Postgres is highly discouraged as it is only supported for legacy reasons.

View File

@@ -276,6 +276,31 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
extraRecordsUpdate = make(chan []tailcfg.DNSRecord)
}
var (
haProber *state.HAHealthProber
haHealthChan <-chan time.Time
)
if h.cfg.Node.Routes.HA.ProbeInterval > 0 {
haProber = state.NewHAHealthProber(
h.state,
h.cfg.Node.Routes.HA,
h.cfg.ServerURL,
h.mapBatcher.IsConnected,
)
haTicker := time.NewTicker(h.cfg.Node.Routes.HA.ProbeInterval)
defer haTicker.Stop()
haHealthChan = haTicker.C
log.Info().
Dur("interval", h.cfg.Node.Routes.HA.ProbeInterval).
Dur("timeout", h.cfg.Node.Routes.HA.ProbeTimeout).
Msg("HA subnet router health probing enabled")
} else {
haHealthChan = make(<-chan time.Time)
}
for {
select {
case <-ctx.Done():
@@ -332,6 +357,9 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
h.cfg.TailcfgDNSConfig.ExtraRecords = records
h.Change(change.ExtraRecords())
case <-haHealthChan:
haProber.ProbeOnce(ctx, h.Change)
}
}
}
@@ -1113,6 +1141,11 @@ func (h *Headscale) StartBatcherForTest(tb testing.TB) {
tb.Cleanup(func() { h.mapBatcher.Close() })
}
// MapBatcher returns the map response batcher stored on h (for test use).
// The returned value is the same *mapper.Batcher pointer held in
// h.mapBatcher, not a copy, so callers operate on the server's own batcher.
// NOTE(review): unlike the neighbouring ...ForTest helpers this accessor
// takes no testing.TB and has no test-only guard — confirm that is intended.
func (h *Headscale) MapBatcher() *mapper.Batcher {
return h.mapBatcher
}
// StartEphemeralGCForTest starts the ephemeral node garbage collector.
// It registers a cleanup function on tb to stop the collector.
// It panics when called outside of tests.