diff --git a/CHANGELOG.md b/CHANGELOG.md index 86078132..6612d694 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,8 +60,19 @@ internet is a security-sensitive choice. `autogroup:danger-all` can only be used - **CLI**: `headscale nodes register` is deprecated in favour of `headscale auth register --auth-id <id> --user <user>` [#1850](https://github.com/juanfont/headscale/pull/1850) - The old command continues to work but will be removed in a future release +### HA subnet router health probing + +Headscale now actively probes HA subnet routers to detect nodes that are connected but not +forwarding traffic. The control plane periodically pings HA subnet routers via the Noise +control channel and fails over to a healthy standby if the primary stops responding. This is +enabled by default (`node.routes.ha.probe_interval: 10s`, `probe_timeout: 5s`) and only +active when HA routes exist (2+ nodes advertising the same prefix). Set `probe_interval` to +`0` to disable. This complements the existing disconnect-based failover, catching "zombie +connected" routers that maintain their control session but cannot route packets. + ### Changes +- **Debug endpoints**: Add node connectivity ping page for verifying control-plane reachability [#3183](https://github.com/juanfont/headscale/pull/3183) - **OIDC registration**: Add a confirmation page before completing node registration, showing the device hostname and machine key fingerprint [#3180](https://github.com/juanfont/headscale/pull/3180) - **Debug endpoints**: Omit secret fields (`Pass`, `ClientSecret`, `APIKey`) from `/debug/config` JSON output [#3180](https://github.com/juanfont/headscale/pull/3180) - **Debug endpoints**: Route `statsviz` through `tsweb.Protected` [#3180](https://github.com/juanfont/headscale/pull/3180) diff --git a/config-example.yaml b/config-example.yaml index d3ca2f09..0cf9d1b9 100644 --- a/config-example.yaml +++ b/config-example.yaml @@ -165,6 +165,31 @@ node: # Time before an inactive ephemeral node is deleted. 
inactivity_timeout: 30m + # HA subnet router health probing. + # + # A subnet router can hold its control-plane session open yet be unable to + # forward traffic ("zombie connected"). The normal disconnect-based failover + # never fires because the Noise session is still alive. + # + # When HA routes exist (2+ nodes advertising the same prefix), headscale + # pings each HA node every probe_interval via the Noise channel. If a node + # fails to respond within probe_timeout it is marked unhealthy and the + # primary role moves to the next healthy node. A node that later responds + # is marked healthy again but does NOT reclaim primary (avoids flapping). + # + # Worst-case detection time is probe_interval + probe_timeout (15s default). + # No-op when no HA routes exist. Set probe_interval to 0 to disable. + # probe_timeout must be less than probe_interval. + routes: + ha: + # How often to ping HA subnet routers. Set to 0 to disable probing. + # Must be >= 2s when enabled. + probe_interval: 10s + + # How long to wait for a ping response before marking a node unhealthy. + # Must be >= 1s and less than probe_interval. + probe_timeout: 5s + database: # Database type. Available options: sqlite, postgres # Please note that using Postgres is highly discouraged as it is only supported for legacy reasons. diff --git a/hscontrol/app.go b/hscontrol/app.go index ac63b472..49f3cb4a 100644 --- a/hscontrol/app.go +++ b/hscontrol/app.go @@ -276,6 +276,31 @@ func (h *Headscale) scheduledTasks(ctx context.Context) { extraRecordsUpdate = make(chan []tailcfg.DNSRecord) } + var ( + haProber *state.HAHealthProber + haHealthChan <-chan time.Time + ) + if h.cfg.Node.Routes.HA.ProbeInterval > 0 { + haProber = state.NewHAHealthProber( + h.state, + h.cfg.Node.Routes.HA, + h.cfg.ServerURL, + h.mapBatcher.IsConnected, + ) + + haTicker := time.NewTicker(h.cfg.Node.Routes.HA.ProbeInterval) + defer haTicker.Stop() + + haHealthChan = haTicker.C + + log.Info(). 
+ Dur("interval", h.cfg.Node.Routes.HA.ProbeInterval). + Dur("timeout", h.cfg.Node.Routes.HA.ProbeTimeout). + Msg("HA subnet router health probing enabled") + } else { + haHealthChan = make(<-chan time.Time) + } + for { select { case <-ctx.Done(): @@ -332,6 +357,9 @@ func (h *Headscale) scheduledTasks(ctx context.Context) { h.cfg.TailcfgDNSConfig.ExtraRecords = records h.Change(change.ExtraRecords()) + + case <-haHealthChan: + haProber.ProbeOnce(ctx, h.Change) } } } @@ -1113,6 +1141,11 @@ func (h *Headscale) StartBatcherForTest(tb testing.TB) { tb.Cleanup(func() { h.mapBatcher.Close() }) } +// MapBatcher returns the map response batcher held by this Headscale instance (h.mapBatcher); it is exposed for test use. +func (h *Headscale) MapBatcher() *mapper.Batcher { + return h.mapBatcher +} + // StartEphemeralGCForTest starts the ephemeral node garbage collector. // It registers a cleanup function on tb to stop the collector. // It panics when called outside of tests.