mirror of
https://github.com/juanfont/headscale
synced 2026-04-25 17:15:33 +02:00
app: wire HA health prober into scheduled tasks
Run the prober on a ticker in scheduledTasks. Enabled by default (10s interval, 5s timeout). No-op when no HA routes exist. Fixes #2129. Fixes #2902.
This commit is contained in:
11
CHANGELOG.md
11
CHANGELOG.md
@@ -60,8 +60,19 @@ internet is a security-sensitive choice. `autogroup:danger-all` can only be used
|
||||
- **CLI**: `headscale nodes register` is deprecated in favour of `headscale auth register --auth-id <id> --user <user>` [#1850](https://github.com/juanfont/headscale/pull/1850)
|
||||
- The old command continues to work but will be removed in a future release
|
||||
|
||||
### HA subnet router health probing
|
||||
|
||||
Headscale now actively probes HA subnet routers to detect nodes that are connected but not
|
||||
forwarding traffic. The control plane periodically pings HA subnet routers via the Noise
|
||||
control channel and fails over to a healthy standby if the primary stops responding. This is
|
||||
enabled by default (`node.routes.ha.probe_interval: 10s`, `probe_timeout: 5s`) and is only
|
||||
active when HA routes exist (2+ nodes advertising the same prefix). Set `probe_interval` to
|
||||
`0` to disable. This complements the existing disconnect-based failover, catching "zombie
|
||||
connected" routers that maintain their control session but cannot route packets.
|
||||
|
||||
### Changes
|
||||
|
||||
- **Debug endpoints**: Add node connectivity ping page for verifying control-plane reachability [#3183](https://github.com/juanfont/headscale/pull/3183)
|
||||
- **OIDC registration**: Add a confirmation page before completing node registration, showing the device hostname and machine key fingerprint [#3180](https://github.com/juanfont/headscale/pull/3180)
|
||||
- **Debug endpoints**: Omit secret fields (`Pass`, `ClientSecret`, `APIKey`) from `/debug/config` JSON output [#3180](https://github.com/juanfont/headscale/pull/3180)
|
||||
- **Debug endpoints**: Route `statsviz` through `tsweb.Protected` [#3180](https://github.com/juanfont/headscale/pull/3180)
|
||||
|
||||
@@ -165,6 +165,31 @@ node:
|
||||
# Time before an inactive ephemeral node is deleted.
|
||||
inactivity_timeout: 30m
|
||||
|
||||
# HA subnet router health probing.
|
||||
#
|
||||
# A subnet router can hold its control-plane session open yet be unable to
|
||||
# forward traffic ("zombie connected"). The normal disconnect-based failover
|
||||
# never fires because the Noise session is still alive.
|
||||
#
|
||||
# When HA routes exist (2+ nodes advertising the same prefix), headscale
|
||||
# pings each HA node every probe_interval via the Noise channel. If a node
|
||||
# fails to respond within probe_timeout it is marked unhealthy and the
|
||||
# primary role moves to the next healthy node. A node that later responds
|
||||
# is marked healthy again but does NOT reclaim primary (avoids flapping).
|
||||
#
|
||||
# Worst-case detection time is probe_interval + probe_timeout (15s default).
|
||||
# No-op when no HA routes exist. Set probe_interval to 0 to disable.
|
||||
# probe_timeout must be less than probe_interval.
|
||||
routes:
|
||||
ha:
|
||||
# How often to ping HA subnet routers. Set to 0 to disable probing.
|
||||
# Must be >= 2s when enabled.
|
||||
probe_interval: 10s
|
||||
|
||||
# How long to wait for a ping response before marking a node unhealthy.
|
||||
# Must be >= 1s and less than probe_interval.
|
||||
probe_timeout: 5s
|
||||
|
||||
database:
|
||||
# Database type. Available options: sqlite, postgres
|
||||
# Please note that using Postgres is highly discouraged as it is only supported for legacy reasons.
|
||||
|
||||
@@ -276,6 +276,31 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
|
||||
extraRecordsUpdate = make(chan []tailcfg.DNSRecord)
|
||||
}
|
||||
|
||||
var (
|
||||
haProber *state.HAHealthProber
|
||||
haHealthChan <-chan time.Time
|
||||
)
|
||||
if h.cfg.Node.Routes.HA.ProbeInterval > 0 {
|
||||
haProber = state.NewHAHealthProber(
|
||||
h.state,
|
||||
h.cfg.Node.Routes.HA,
|
||||
h.cfg.ServerURL,
|
||||
h.mapBatcher.IsConnected,
|
||||
)
|
||||
|
||||
haTicker := time.NewTicker(h.cfg.Node.Routes.HA.ProbeInterval)
|
||||
defer haTicker.Stop()
|
||||
|
||||
haHealthChan = haTicker.C
|
||||
|
||||
log.Info().
|
||||
Dur("interval", h.cfg.Node.Routes.HA.ProbeInterval).
|
||||
Dur("timeout", h.cfg.Node.Routes.HA.ProbeTimeout).
|
||||
Msg("HA subnet router health probing enabled")
|
||||
} else {
|
||||
haHealthChan = make(<-chan time.Time)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -332,6 +357,9 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
|
||||
h.cfg.TailcfgDNSConfig.ExtraRecords = records
|
||||
|
||||
h.Change(change.ExtraRecords())
|
||||
|
||||
case <-haHealthChan:
|
||||
haProber.ProbeOnce(ctx, h.Change)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1113,6 +1141,11 @@ func (h *Headscale) StartBatcherForTest(tb testing.TB) {
|
||||
tb.Cleanup(func() { h.mapBatcher.Close() })
|
||||
}
|
||||
|
||||
// MapBatcher returns the map response batcher (for test use).
|
||||
func (h *Headscale) MapBatcher() *mapper.Batcher {
|
||||
return h.mapBatcher
|
||||
}
|
||||
|
||||
// StartEphemeralGCForTest starts the ephemeral node garbage collector.
|
||||
// It registers a cleanup function on tb to stop the collector.
|
||||
// It panics when called outside of tests.
|
||||
|
||||
Reference in New Issue
Block a user