mirror of
https://github.com/juanfont/headscale
synced 2026-04-25 17:15:33 +02:00
servertest: add HA health probing tests
Five scenarios: healthy probes, failover on unhealthy primary, no-flap recovery, connect clears unhealthy, no-op without HA routes. Updates #2129 Updates #2902
This commit is contained in:
226
hscontrol/servertest/ha_health_test.go
Normal file
226
hscontrol/servertest/ha_health_test.go
Normal file
@@ -0,0 +1,226 @@
|
||||
package servertest_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/netip"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/juanfont/headscale/hscontrol/servertest"
|
||||
"github.com/juanfont/headscale/hscontrol/state"
|
||||
"github.com/juanfont/headscale/hscontrol/types"
|
||||
"github.com/juanfont/headscale/hscontrol/types/change"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"tailscale.com/tailcfg"
|
||||
)
|
||||
|
||||
// advertiseAndApproveRoute sets RoutableIPs on a client and approves
|
||||
// the route on the server. Returns the node ID.
|
||||
func advertiseAndApproveRoute(
|
||||
t *testing.T,
|
||||
srv *servertest.TestServer,
|
||||
c *servertest.TestClient,
|
||||
route netip.Prefix,
|
||||
) types.NodeID {
|
||||
t.Helper()
|
||||
|
||||
c.Direct().SetHostinfo(&tailcfg.Hostinfo{
|
||||
BackendLogID: "servertest-" + c.Name,
|
||||
Hostname: c.Name,
|
||||
RoutableIPs: []netip.Prefix{route},
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
_ = c.Direct().SendUpdate(ctx)
|
||||
|
||||
nodeID := findNodeID(t, srv, c.Name)
|
||||
|
||||
_, rc, err := srv.State().SetApprovedRoutes(nodeID, []netip.Prefix{route})
|
||||
require.NoError(t, err)
|
||||
srv.App.Change(rc)
|
||||
|
||||
return nodeID
|
||||
}
|
||||
|
||||
// TestHAHealthProbe_HealthyNodes verifies that the prober correctly
|
||||
// pings HA nodes and they all respond healthy.
|
||||
func TestHAHealthProbe_HealthyNodes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
srv := servertest.NewServer(t)
|
||||
user := srv.CreateUser(t, "ha-healthy")
|
||||
|
||||
route := netip.MustParsePrefix("10.50.0.0/24")
|
||||
|
||||
c1 := servertest.NewClient(t, srv, "ha-router1", servertest.WithUser(user))
|
||||
c2 := servertest.NewClient(t, srv, "ha-router2", servertest.WithUser(user))
|
||||
|
||||
c1.WaitForPeers(t, 1, 10*time.Second)
|
||||
c2.WaitForPeers(t, 1, 10*time.Second)
|
||||
|
||||
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
|
||||
advertiseAndApproveRoute(t, srv, c2, route)
|
||||
|
||||
prober := state.NewHAHealthProber(
|
||||
srv.State(),
|
||||
types.HARouteConfig{
|
||||
ProbeInterval: 30 * time.Second,
|
||||
ProbeTimeout: 5 * time.Second,
|
||||
},
|
||||
srv.URL,
|
||||
srv.App.MapBatcher().IsConnected,
|
||||
)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
|
||||
prober.ProbeOnce(ctx, srv.App.Change)
|
||||
|
||||
// Both nodes should be healthy, primary unchanged (node 1).
|
||||
assert.True(t, srv.State().PrimaryRoutes().IsNodeHealthy(nodeID1))
|
||||
|
||||
primaries := srv.State().GetNodePrimaryRoutes(nodeID1)
|
||||
assert.Contains(t, primaries, route)
|
||||
}
|
||||
|
||||
// TestHAHealthProbe_UnhealthyFailover verifies that marking a primary
|
||||
// node unhealthy via the PrimaryRoutes API triggers failover to the
|
||||
// standby.
|
||||
func TestHAHealthProbe_UnhealthyFailover(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
srv := servertest.NewServer(t)
|
||||
user := srv.CreateUser(t, "ha-failover")
|
||||
|
||||
route := netip.MustParsePrefix("10.60.0.0/24")
|
||||
|
||||
c1 := servertest.NewClient(t, srv, "ha-fail-r1", servertest.WithUser(user))
|
||||
c2 := servertest.NewClient(t, srv, "ha-fail-r2", servertest.WithUser(user))
|
||||
|
||||
c1.WaitForPeers(t, 1, 10*time.Second)
|
||||
c2.WaitForPeers(t, 1, 10*time.Second)
|
||||
|
||||
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
|
||||
nodeID2 := advertiseAndApproveRoute(t, srv, c2, route)
|
||||
|
||||
// Node 1 should be primary (lower ID).
|
||||
primaries := srv.State().GetNodePrimaryRoutes(nodeID1)
|
||||
require.Contains(t, primaries, route, "node 1 should be primary initially")
|
||||
|
||||
// Mark node 1 unhealthy — should failover to node 2.
|
||||
changed := srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, false)
|
||||
assert.True(t, changed, "marking primary unhealthy should change primaries")
|
||||
|
||||
primaries2 := srv.State().GetNodePrimaryRoutes(nodeID2)
|
||||
assert.Contains(t, primaries2, route, "node 2 should be primary after failover")
|
||||
|
||||
primaries1 := srv.State().GetNodePrimaryRoutes(nodeID1)
|
||||
assert.NotContains(t, primaries1, route, "node 1 should not be primary")
|
||||
}
|
||||
|
||||
// TestHAHealthProbe_RecoveryNoFlap verifies that marking an unhealthy
|
||||
// node healthy again does NOT cause it to reclaim primary (stability).
|
||||
func TestHAHealthProbe_RecoveryNoFlap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
srv := servertest.NewServer(t)
|
||||
user := srv.CreateUser(t, "ha-noflap")
|
||||
|
||||
route := netip.MustParsePrefix("10.70.0.0/24")
|
||||
|
||||
c1 := servertest.NewClient(t, srv, "ha-nf-r1", servertest.WithUser(user))
|
||||
c2 := servertest.NewClient(t, srv, "ha-nf-r2", servertest.WithUser(user))
|
||||
|
||||
c1.WaitForPeers(t, 1, 10*time.Second)
|
||||
c2.WaitForPeers(t, 1, 10*time.Second)
|
||||
|
||||
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
|
||||
nodeID2 := advertiseAndApproveRoute(t, srv, c2, route)
|
||||
|
||||
// Failover: node 1 → node 2.
|
||||
srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, false)
|
||||
primaries := srv.State().GetNodePrimaryRoutes(nodeID2)
|
||||
require.Contains(t, primaries, route, "node 2 should be primary")
|
||||
|
||||
// Recovery: node 1 healthy again. Node 2 should STAY primary.
|
||||
changed := srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, true)
|
||||
assert.False(t, changed, "recovery should not change primaries (no flap)")
|
||||
|
||||
primaries = srv.State().GetNodePrimaryRoutes(nodeID2)
|
||||
assert.Contains(t, primaries, route, "node 2 should remain primary after recovery")
|
||||
}
|
||||
|
||||
// TestHAHealthProbe_ConnectClearsUnhealthy verifies that reconnecting
|
||||
// a node clears its unhealthy state.
|
||||
func TestHAHealthProbe_ConnectClearsUnhealthy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
srv := servertest.NewServer(t)
|
||||
user := srv.CreateUser(t, "ha-reconnect")
|
||||
|
||||
route := netip.MustParsePrefix("10.80.0.0/24")
|
||||
|
||||
c1 := servertest.NewClient(t, srv, "ha-rc-r1", servertest.WithUser(user))
|
||||
c2 := servertest.NewClient(t, srv, "ha-rc-r2", servertest.WithUser(user))
|
||||
|
||||
c1.WaitForPeers(t, 1, 10*time.Second)
|
||||
c2.WaitForPeers(t, 1, 10*time.Second)
|
||||
|
||||
nodeID1 := advertiseAndApproveRoute(t, srv, c1, route)
|
||||
advertiseAndApproveRoute(t, srv, c2, route)
|
||||
|
||||
// Mark unhealthy.
|
||||
srv.State().PrimaryRoutes().SetNodeHealthy(nodeID1, false)
|
||||
assert.False(t, srv.State().PrimaryRoutes().IsNodeHealthy(nodeID1))
|
||||
|
||||
// Reconnect clears unhealthy via State.Connect → ClearUnhealthy.
|
||||
c1.Disconnect(t)
|
||||
c1.Reconnect(t)
|
||||
|
||||
c1.WaitForPeers(t, 1, 10*time.Second)
|
||||
|
||||
assert.True(t, srv.State().PrimaryRoutes().IsNodeHealthy(nodeID1),
|
||||
"reconnect should clear unhealthy state")
|
||||
}
|
||||
|
||||
// TestHAHealthProbe_NoHARoutes verifies that the prober is a no-op
|
||||
// when no HA configuration exists.
|
||||
func TestHAHealthProbe_NoHARoutes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
srv := servertest.NewServer(t)
|
||||
user := srv.CreateUser(t, "ha-noha")
|
||||
|
||||
c1 := servertest.NewClient(t, srv, "noha-r1", servertest.WithUser(user))
|
||||
c2 := servertest.NewClient(t, srv, "noha-r2", servertest.WithUser(user))
|
||||
|
||||
c1.WaitForPeers(t, 1, 10*time.Second)
|
||||
c2.WaitForPeers(t, 1, 10*time.Second)
|
||||
|
||||
// Different routes — not HA.
|
||||
advertiseAndApproveRoute(t, srv, c1, netip.MustParsePrefix("10.90.0.0/24"))
|
||||
advertiseAndApproveRoute(t, srv, c2, netip.MustParsePrefix("10.91.0.0/24"))
|
||||
|
||||
prober := state.NewHAHealthProber(
|
||||
srv.State(),
|
||||
types.HARouteConfig{
|
||||
ProbeInterval: 30 * time.Second,
|
||||
ProbeTimeout: 5 * time.Second,
|
||||
},
|
||||
srv.URL,
|
||||
srv.App.MapBatcher().IsConnected,
|
||||
)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var dispatched bool
|
||||
|
||||
prober.ProbeOnce(ctx, func(_ ...change.Change) {
|
||||
dispatched = true
|
||||
})
|
||||
assert.False(t, dispatched, "no HA routes should produce no changes")
|
||||
}
|
||||
Reference in New Issue
Block a user