mirror of
https://github.com/kharonsec/garage.git
synced 2026-04-25 20:44:55 +02:00
fix: enable TCP keepalive on RPC connections (#1348)
Garage RPC connections have no TCP keepalive enabled. When a connection dies silently (proxy pod restart, NAT timeout, network partition), it's only detected by application-level pings after ~60s (4 failed pings x 15s interval). During this window, the node appears connected but all RPC calls to it fail.

Enable TCP keepalive on both outgoing and incoming RPC connections via socket2:

- Idle time before first probe: 30s (TCP_KEEPALIVE_TIME)
- Probe interval after first: 10s (TCP_KEEPALIVE_INTERVAL)

A helper set_keepalive() function avoids duplicating the socket2 setup. Incoming connection keepalive failures are logged as warnings but don't reject the connection.

Companion to #1345 (stale address pruning + connect timeout). Together they address both halves of the reconnection problem: faster detection (this PR) and faster recovery.

Co-authored-by: Raj Singh <raj@tailscale.com>
Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/1348
Reviewed-by: maximilien <git@mricher.fr>
Co-authored-by: rajsinghtech <rajsinghtech@noreply.localhost>
Co-committed-by: rajsinghtech <rajsinghtech@noreply.localhost>
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1713,6 +1713,7 @@ dependencies = [
|
||||
"rand 0.9.2",
|
||||
"rmp-serde",
|
||||
"serde",
|
||||
"socket2 0.6.2",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
|
||||
@@ -144,6 +144,7 @@ tokio = { version = "1.49", default-features = false, features = [
|
||||
] }
|
||||
tokio-util = { version = "0.7", features = ["compat", "io"] }
|
||||
tokio-stream = { version = "0.1", features = ["net"] }
|
||||
socket2 = { version = "0.6", features = ["all"] }
|
||||
|
||||
opentelemetry = { version = "0.17", features = ["rt-tokio", "metrics", "trace"] }
|
||||
opentelemetry-prometheus = "0.10"
|
||||
|
||||
@@ -27,6 +27,7 @@ rmp-serde.workspace = true
|
||||
hex.workspace = true
|
||||
|
||||
rand.workspace = true
|
||||
socket2.workspace = true
|
||||
|
||||
log.workspace = true
|
||||
arc-swap.workspace = true
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::time::Duration;
|
||||
|
||||
use log::{debug, error, info, trace, warn};
|
||||
|
||||
@@ -39,6 +40,19 @@ pub(crate) type VersionTag = [u8; 16];
|
||||
/// Since Garage v1.0, we have replaced the prefix by `grgnet` (shorthand for `garage_net`).
|
||||
pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6772676e65740010; // grgnet 0x0010 (1.0)
|
||||
|
||||
/// Time a connection must be idle before the first keepalive probe is sent.
|
||||
const TCP_KEEPALIVE_TIME: Duration = Duration::from_secs(30);
|
||||
/// Interval between keepalive probes after the first.
|
||||
const TCP_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Enables TCP keepalive on `stream`, using `TCP_KEEPALIVE_TIME` as the idle
/// time before the first probe and `TCP_KEEPALIVE_INTERVAL` as the interval
/// between subsequent probes.
///
/// Returns the `std::io::Error` reported by `set_tcp_keepalive` if the option
/// could not be applied; on success the stream is otherwise left untouched.
///
/// NOTE(review): only the idle time and the probe interval are configured
/// here. The number of unanswered probes after which the connection is
/// considered dead is not set, so it is presumably left at the platform
/// default -- confirm that the resulting detection delay is what is intended.
fn set_keepalive(stream: &TcpStream) -> Result<(), std::io::Error> {
|
||||
// View the stream through socket2 so that the keepalive option can be set on it.
let sock_ref = socket2::SockRef::from(stream);
|
||||
let keepalive = socket2::TcpKeepalive::new()
|
||||
.with_time(TCP_KEEPALIVE_TIME)
|
||||
.with_interval(TCP_KEEPALIVE_INTERVAL);
|
||||
// The result of applying the option is returned directly to the caller.
sock_ref.set_tcp_keepalive(&keepalive)
|
||||
}
|
||||
|
||||
/// `HelloMessage` is sent by the client on a Netapp connection to indicate
|
||||
/// that they are also a server and ready to receive incoming connections
|
||||
/// at the specified address and port. If the client doesn't know their
|
||||
@@ -252,6 +266,13 @@ impl NetApp {
|
||||
_ = must_exit.changed() => continue,
|
||||
};
|
||||
|
||||
if let Err(e) = set_keepalive(&socket) {
|
||||
warn!(
|
||||
"Failed to set keepalive on connection from {}: {}",
|
||||
peer_addr, e
|
||||
);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Incoming connection from {}, negotiating handshake...",
|
||||
peer_addr
|
||||
@@ -314,6 +335,9 @@ impl NetApp {
|
||||
}
|
||||
None => TcpStream::connect(ip).await?,
|
||||
};
|
||||
if let Err(e) = set_keepalive(&stream) {
|
||||
warn!("Failed to set keepalive on connection to {}: {}", ip, e);
|
||||
}
|
||||
info!("Connected to {}, negotiating handshake...", ip);
|
||||
ClientConn::init(self, stream, id).await?;
|
||||
Ok(())
|
||||
|
||||
Reference in New Issue
Block a user