fix: enable TCP keepalive on RPC connections (#1348)

Garage RPC connections have no TCP keepalive enabled. When a connection dies silently (proxy pod restart, NAT timeout, network partition), it's only detected by application-level pings after ~60s (4 failed pings x 15s interval). During this window, the node appears connected but all RPC calls to it fail.

Enable TCP keepalive on both outgoing and incoming RPC connections via socket2:
- Idle time before first probe: 30s (TCP_KEEPALIVE_TIME)
- Probe interval after first: 10s (TCP_KEEPALIVE_INTERVAL)

A helper set_keepalive() function avoids duplicating the socket2 setup. Failures to enable keepalive, on either incoming or outgoing connections, are logged as warnings but do not reject or abort the connection.

Companion to #1345 (stale address pruning + connect timeout). Together they address both halves of the reconnection problem: faster detection (this PR) and faster recovery.

Co-authored-by: Raj Singh <raj@tailscale.com>
Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/1348
Reviewed-by: maximilien <git@mricher.fr>
Co-authored-by: rajsinghtech <rajsinghtech@noreply.localhost>
Co-committed-by: rajsinghtech <rajsinghtech@noreply.localhost>
This commit is contained in:
rajsinghtech
2026-02-20 21:28:29 +00:00
committed by Alex
parent 55370d9b4d
commit 69cd230568
4 changed files with 27 additions and 0 deletions

1
Cargo.lock generated
View File

@@ -1713,6 +1713,7 @@ dependencies = [
"rand 0.9.2",
"rmp-serde",
"serde",
"socket2 0.6.2",
"thiserror 2.0.18",
"tokio",
"tokio-stream",

View File

@@ -144,6 +144,7 @@ tokio = { version = "1.49", default-features = false, features = [
] }
tokio-util = { version = "0.7", features = ["compat", "io"] }
tokio-stream = { version = "0.1", features = ["net"] }
socket2 = { version = "0.6", features = ["all"] }
opentelemetry = { version = "0.17", features = ["rt-tokio", "metrics", "trace"] }
opentelemetry-prometheus = "0.10"

View File

@@ -27,6 +27,7 @@ rmp-serde.workspace = true
hex.workspace = true
rand.workspace = true
socket2.workspace = true
log.workspace = true
arc-swap.workspace = true

View File

@@ -1,6 +1,7 @@
use std::collections::HashMap;
use std::net::{IpAddr, SocketAddr};
use std::sync::{Arc, RwLock};
use std::time::Duration;
use log::{debug, error, info, trace, warn};
@@ -39,6 +40,19 @@ pub(crate) type VersionTag = [u8; 16];
/// Since Garage v1.0, we have replaced the prefix by `grgnet` (shorthand for `garage_net`).
pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6772676e65740010; // grgnet 0x0010 (1.0)
/// Time a connection must be idle before the first keepalive probe is sent.
/// Passed to `socket2::TcpKeepalive::with_time` by `set_keepalive`.
const TCP_KEEPALIVE_TIME: Duration = Duration::from_secs(30);
/// Interval between successive keepalive probes once the first one has been sent.
/// Passed to `socket2::TcpKeepalive::with_interval` by `set_keepalive`.
const TCP_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(10);
/// Enables TCP keepalive on `stream` with explicit, platform-independent timing.
///
/// The socket is configured with:
/// - an idle time of `TCP_KEEPALIVE_TIME` before the first probe,
/// - `TCP_KEEPALIVE_INTERVAL` between subsequent probes,
/// - a fixed number of unacknowledged probes before the connection is dropped.
///
/// The stream is only borrowed; ownership of the underlying socket is not taken.
///
/// # Errors
///
/// Returns the `std::io::Error` reported by the operating system if any of the
/// keepalive socket options cannot be set. Callers treat this as non-fatal.
fn set_keepalive(stream: &TcpStream) -> Result<(), std::io::Error> {
    // Number of unacknowledged probes after which the connection is considered dead.
    // Without this the OS default applies (9 on Linux, per tcp(7)), which would
    // delay detection of a silent peer to roughly 30s + 9 * 10s = 120s. With 3
    // probes the bound is about 30s + 3 * 10s = 60s on every supported platform.
    const TCP_KEEPALIVE_RETRIES: u32 = 3;

    let keepalive = socket2::TcpKeepalive::new()
        .with_time(TCP_KEEPALIVE_TIME)
        .with_interval(TCP_KEEPALIVE_INTERVAL)
        .with_retries(TCP_KEEPALIVE_RETRIES);

    // `SockRef` borrows the file descriptor, so the stream stays usable afterwards.
    socket2::SockRef::from(stream).set_tcp_keepalive(&keepalive)
}
/// `HelloMessage` is sent by the client on a Netapp connection to indicate
/// that they are also a server and ready to receive incoming connections
/// at the specified address and port. If the client doesn't know their
@@ -252,6 +266,13 @@ impl NetApp {
_ = must_exit.changed() => continue,
};
if let Err(e) = set_keepalive(&socket) {
warn!(
"Failed to set keepalive on connection from {}: {}",
peer_addr, e
);
}
info!(
"Incoming connection from {}, negotiating handshake...",
peer_addr
@@ -314,6 +335,9 @@ impl NetApp {
}
None => TcpStream::connect(ip).await?,
};
if let Err(e) = set_keepalive(&stream) {
warn!("Failed to set keepalive on connection to {}: {}", ip, e);
}
info!("Connected to {}, negotiating handshake...", ip);
ClientConn::init(self, stream, id).await?;
Ok(())