nym: race two gateway connects on the cold-start path (bound the lottery tail)
The read-tunnel cold/auto path drew ONE random entry gateway; a dead draw blocked connect_to_mixnet() until run_tunnel's 10s cap and consecutive dead draws stacked into a 15-35s tail (measured 6/15 cold starts). With no warm gateway hint, build_tunnel now races TWO ephemeral gateway connects (identical anonymity cfg, ephemeral keys) via connect_gateway_racing and takes the first up; the loser is reaped (disconnected if connected, dropped if pending) so no gateway session leaks. Only the gateway handshake is doubled - the exit/IPR is still built ONCE on the winner. Warm reconnects (cached gateway hint) stay single-build. Money path (streamexit.rs) untouched. Verify before release: emulator cold-start timing (~15 trials) shows the tail bounded, no session/task leak across many cold starts, warm path unaffected.
This commit is contained in:
+99
-6
@@ -840,14 +840,30 @@ async fn build_tunnel(
|
||||
cfg.traffic.message_sending_average_delay = Duration::from_millis(4);
|
||||
|
||||
// Mirror the mainnet env setup the SDK's own constructors run before connect.
|
||||
// Done ONCE here (not per-raced-client): `setup_env` writes process-wide env
|
||||
// vars and must not be raced across the two connect tasks on the cold path.
|
||||
nym_sdk::setup_env(None::<&std::path::Path>);
|
||||
let mut builder = MixnetClientBuilder::new_ephemeral().debug_config(cfg);
|
||||
// Warm-connect: ask for last run's entry gateway. With ephemeral storage this
|
||||
// is a Specified gateway selection with no persisted keys.
|
||||
if let Some(gw) = entry_gateway {
|
||||
builder = builder.request_gateway(gw);
|
||||
|
||||
// GATEWAY CONNECT. Two shapes, both on the identical anonymity `cfg` (`Copy`):
|
||||
// * WARM hint (`entry_gateway.is_some()`): reconnect to the KNOWN-good first
|
||||
// hop — a Specified gateway, ephemeral storage, no persisted keys. NO race:
|
||||
// we want that specific gateway.
|
||||
// * COLD / auto (`entry_gateway.is_none()`): the first hop is a RANDOM draw and
|
||||
// a dead draw blocks `connect_to_mixnet()` until `run_tunnel`'s 10s cap, with
|
||||
// consecutive dead draws stacking into a multi-second tail. Race TWO ephemeral
|
||||
// gateway connects and take the first up (see `connect_gateway_racing`). Only
|
||||
// the gateway handshake is doubled — the exit/IPR below is still built ONCE.
|
||||
let client = match entry_gateway {
|
||||
Some(gw) => {
|
||||
MixnetClientBuilder::new_ephemeral()
|
||||
.debug_config(cfg)
|
||||
.request_gateway(gw)
|
||||
.build()?
|
||||
.connect_to_mixnet()
|
||||
.await?
|
||||
}
|
||||
let client = builder.build()?.connect_to_mixnet().await?;
|
||||
None => connect_gateway_racing(cfg).await?,
|
||||
};
|
||||
|
||||
// Capture the ENTRY GATEWAY actually used, from the client's own nym-address,
|
||||
// BEFORE `from_client` consumes the client.
|
||||
@@ -865,6 +881,83 @@ async fn build_tunnel(
|
||||
Ok((tunnel, entry_gw, ipr))
|
||||
}
|
||||
|
||||
/// Cold/auto gateway connect with a BOUNDED latency tail — the fix for the Nym
|
||||
/// cold-start "gateway lottery". Used ONLY on the auto path (no warm hint), where
|
||||
/// the entry gateway is a RANDOM draw: a dead draw blocks `connect_to_mixnet()`
|
||||
/// until `run_tunnel`'s 10s cap and consecutive dead draws stack into the tail.
|
||||
///
|
||||
/// Race EXACTLY TWO ephemeral `MixnetClient`s — IDENTICAL anonymity `cfg`,
|
||||
/// ephemeral keys, nothing persisted — through the gateway handshake and return
|
||||
/// the FIRST that connects. Only the gateway handshake is doubled; the caller
|
||||
/// builds the exit/IPR ONCE on the winner. Two (not more) bounds the Nym
|
||||
/// free-tier bandwidth burst.
|
||||
///
|
||||
/// The loser is REAPED so a CONNECTED client is never leaked: it is aborted (a
|
||||
/// still-pending connect just drops its half-built client) and, in a DETACHED task
|
||||
/// so the winner returns immediately, `disconnect()`ed IFF it had already
|
||||
/// connected. If BOTH draws fail, the error is returned so `run_tunnel`'s loop
|
||||
/// re-selects — the same contract as the single build.
|
||||
async fn connect_gateway_racing(
|
||||
cfg: nym_sdk::DebugConfig,
|
||||
) -> Result<nym_sdk::mixnet::MixnetClient, smolmix::SmolmixError> {
|
||||
use nym_sdk::mixnet::{MixnetClient, MixnetClientBuilder};
|
||||
|
||||
async fn connect_one(cfg: nym_sdk::DebugConfig) -> Result<MixnetClient, smolmix::SmolmixError> {
|
||||
Ok(MixnetClientBuilder::new_ephemeral()
|
||||
.debug_config(cfg)
|
||||
.build()?
|
||||
.connect_to_mixnet()
|
||||
.await?)
|
||||
}
|
||||
|
||||
// Spawn both so the loser can be aborted cleanly. `cfg` is `Copy`, so each task
|
||||
// gets the identical anonymity config.
|
||||
let mut a = tokio::spawn(connect_one(cfg));
|
||||
let mut b = tokio::spawn(connect_one(cfg));
|
||||
|
||||
// Whichever finishes first; keep `other` to reap (on a win) or fall back to (if
|
||||
// the first draw errored).
|
||||
let (first, other) = tokio::select! {
|
||||
r = &mut a => (r, b),
|
||||
r = &mut b => (r, a),
|
||||
};
|
||||
// A JoinError (task panic) folds into an error so `other` still gets its turn.
|
||||
let first = first.unwrap_or_else(|e| {
|
||||
Err(smolmix::SmolmixError::Io(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("nym gateway connect task failed: {e}"),
|
||||
)))
|
||||
});
|
||||
|
||||
match first {
|
||||
// First to finish connected — it WINS. Reap the loser off the hot path.
|
||||
Ok(client) => {
|
||||
other.abort();
|
||||
tokio::spawn(async move {
|
||||
// If the loser connected before the abort landed, disconnect it so
|
||||
// no live gateway session leaks; a pending connect was just dropped.
|
||||
if let Ok(Ok(loser)) = other.await {
|
||||
loser.disconnect().await;
|
||||
}
|
||||
});
|
||||
Ok(client)
|
||||
}
|
||||
// First draw failed — a lone client has no dead-draw tail, so just await the
|
||||
// survivor; if it fails too, surface an error and `run_tunnel` re-selects.
|
||||
Err(first_err) => match other.await {
|
||||
Ok(Ok(client)) => Ok(client),
|
||||
Ok(Err(second_err)) => {
|
||||
warn!(
|
||||
"[timing] nym: both raced gateway connects failed \
|
||||
({first_err}; {second_err}); run_tunnel will re-select"
|
||||
);
|
||||
Err(second_err)
|
||||
}
|
||||
Err(_join) => Err(first_err),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
Reference in New Issue
Block a user