Files
headscale/hscontrol/servertest/ha_health_test.go
Kristoffer Dalby 0378e2d2c6 servertest: add HA health probing tests
Five scenarios: healthy probes, failover on unhealthy primary,
no-flap recovery, connect clears unhealthy, no-op without HA routes.

Updates #2129
Updates #2902
2026-04-16 15:10:56 +01:00

227 lines
6.9 KiB
Go

package servertest_test
import (
"context"
"net/netip"
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/servertest"
"github.com/juanfont/headscale/hscontrol/state"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/hscontrol/types/change"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
)
// advertiseAndApproveRoute sets RoutableIPs on a client and approves
// the route on the server. Returns the node ID.
func advertiseAndApproveRoute(
t *testing.T,
srv *servertest.TestServer,
c *servertest.TestClient,
route netip.Prefix,
) types.NodeID {
t.Helper()
c.Direct().SetHostinfo(&tailcfg.Hostinfo{
BackendLogID: "servertest-" + c.Name,
Hostname: c.Name,
RoutableIPs: []netip.Prefix{route},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = c.Direct().SendUpdate(ctx)
nodeID := findNodeID(t, srv, c.Name)
_, rc, err := srv.State().SetApprovedRoutes(nodeID, []netip.Prefix{route})
require.NoError(t, err)
srv.App.Change(rc)
return nodeID
}
// TestHAHealthProbe_HealthyNodes verifies that the prober correctly
// pings HA nodes and they all respond healthy.
func TestHAHealthProbe_HealthyNodes(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-healthy")
route := netip.MustParsePrefix("10.50.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-router1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-router2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
advertiseAndApproveRoute(t, srv, c2, route)
prober := state.NewHAHealthProber(
srv.State(),
types.HARouteConfig{
ProbeInterval: 30 * time.Second,
ProbeTimeout: 5 * time.Second,
},
srv.URL,
srv.App.MapBatcher().IsConnected,
)
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
prober.ProbeOnce(ctx, srv.App.Change)
// Both nodes should be healthy, primary unchanged (node 1).
assert.True(t, srv.State().PrimaryRoutes().IsNodeHealthy(nodeID1))
primaries := srv.State().GetNodePrimaryRoutes(nodeID1)
assert.Contains(t, primaries, route)
}
// TestHAHealthProbe_UnhealthyFailover verifies that marking a primary
// node unhealthy via the PrimaryRoutes API triggers failover to the
// standby.
func TestHAHealthProbe_UnhealthyFailover(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-failover")
route := netip.MustParsePrefix("10.60.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-fail-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-fail-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
nodeID2 := advertiseAndApproveRoute(t, srv, c2, route)
// Node 1 should be primary (lower ID).
primaries := srv.State().GetNodePrimaryRoutes(nodeID1)
require.Contains(t, primaries, route, "node 1 should be primary initially")
// Mark node 1 unhealthy — should failover to node 2.
changed := srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, false)
assert.True(t, changed, "marking primary unhealthy should change primaries")
primaries2 := srv.State().GetNodePrimaryRoutes(nodeID2)
assert.Contains(t, primaries2, route, "node 2 should be primary after failover")
primaries1 := srv.State().GetNodePrimaryRoutes(nodeID1)
assert.NotContains(t, primaries1, route, "node 1 should not be primary")
}
// TestHAHealthProbe_RecoveryNoFlap verifies that marking an unhealthy
// node healthy again does NOT cause it to reclaim primary (stability).
func TestHAHealthProbe_RecoveryNoFlap(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-noflap")
route := netip.MustParsePrefix("10.70.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-nf-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-nf-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
nodeID2 := advertiseAndApproveRoute(t, srv, c2, route)
// Failover: node 1 → node 2.
srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, false)
primaries := srv.State().GetNodePrimaryRoutes(nodeID2)
require.Contains(t, primaries, route, "node 2 should be primary")
// Recovery: node 1 healthy again. Node 2 should STAY primary.
changed := srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, true)
assert.False(t, changed, "recovery should not change primaries (no flap)")
primaries = srv.State().GetNodePrimaryRoutes(nodeID2)
assert.Contains(t, primaries, route, "node 2 should remain primary after recovery")
}
// TestHAHealthProbe_ConnectClearsUnhealthy verifies that reconnecting
// a node clears its unhealthy state.
func TestHAHealthProbe_ConnectClearsUnhealthy(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-reconnect")
route := netip.MustParsePrefix("10.80.0.0/24")
c1 := servertest.NewClient(t, srv, "ha-rc-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "ha-rc-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
advertiseAndApproveRoute(t, srv, c2, route)
// Mark unhealthy.
srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, false)
assert.False(t, srv.State().PrimaryRoutes().IsNodeHealthy(nodeID1))
// Reconnect clears unhealthy via State.Connect → ClearUnhealthy.
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 10*time.Second)
assert.True(t, srv.State().PrimaryRoutes().IsNodeHealthy(nodeID1),
"reconnect should clear unhealthy state")
}
// TestHAHealthProbe_NoHARoutes verifies that the prober is a no-op
// when no HA configuration exists.
func TestHAHealthProbe_NoHARoutes(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "ha-noha")
c1 := servertest.NewClient(t, srv, "noha-r1", servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "noha-r2", servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
// Different routes — not HA.
advertiseAndApproveRoute(t, srv, c1, netip.MustParsePrefix("10.90.0.0/24"))
advertiseAndApproveRoute(t, srv, c2, netip.MustParsePrefix("10.91.0.0/24"))
prober := state.NewHAHealthProber(
srv.State(),
types.HARouteConfig{
ProbeInterval: 30 * time.Second,
ProbeTimeout: 5 * time.Second,
},
srv.URL,
srv.App.MapBatcher().IsConnected,
)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
var dispatched bool
prober.ProbeOnce(ctx, func(_ ...change.Change) {
dispatched = true
})
assert.False(t, dispatched, "no HA routes should produce no changes")
}