nym: race two gateway connects on the cold-start path (bound the lottery tail)
The read-tunnel cold/auto path drew ONE random entry gateway; a dead draw blocked connect_to_mixnet() until run_tunnel's 10s cap and consecutive dead draws stacked into a 15-35s tail (measured 6/15 cold starts). With no warm gateway hint, build_tunnel now races TWO ephemeral gateway connects (identical anonymity cfg, ephemeral keys) via connect_gateway_racing and takes the first up; the loser is reaped (disconnected if connected, dropped if pending) so no gateway session leaks. Only the gateway handshake is doubled - the exit/IPR is still built ONCE on the winner. Warm reconnects (cached gateway hint) stay single-build. Money path (streamexit.rs) untouched. Verify before release: emulator cold-start timing (~15 trials) shows the tail bounded, no session/task leak across many cold starts, warm path unaffected.
This commit is contained in:
+99
-6
@@ -840,14 +840,30 @@ async fn build_tunnel(
|
|||||||
cfg.traffic.message_sending_average_delay = Duration::from_millis(4);
|
cfg.traffic.message_sending_average_delay = Duration::from_millis(4);
|
||||||
|
|
||||||
// Mirror the mainnet env setup the SDK's own constructors run before connect.
|
// Mirror the mainnet env setup the SDK's own constructors run before connect.
|
||||||
|
// Done ONCE here (not per-raced-client): `setup_env` writes process-wide env
|
||||||
|
// vars and must not be raced across the two connect tasks on the cold path.
|
||||||
nym_sdk::setup_env(None::<&std::path::Path>);
|
nym_sdk::setup_env(None::<&std::path::Path>);
|
||||||
let mut builder = MixnetClientBuilder::new_ephemeral().debug_config(cfg);
|
|
||||||
// Warm-connect: ask for last run's entry gateway. With ephemeral storage this
|
// GATEWAY CONNECT. Two shapes, both on the identical anonymity `cfg` (`Copy`):
|
||||||
// is a Specified gateway selection with no persisted keys.
|
// * WARM hint (`entry_gateway.is_some()`): reconnect to the KNOWN-good first
|
||||||
if let Some(gw) = entry_gateway {
|
// hop — a Specified gateway, ephemeral storage, no persisted keys. NO race:
|
||||||
builder = builder.request_gateway(gw);
|
// we want that specific gateway.
|
||||||
|
// * COLD / auto (`entry_gateway.is_none()`): the first hop is a RANDOM draw and
|
||||||
|
// a dead draw blocks `connect_to_mixnet()` until `run_tunnel`'s 10s cap, with
|
||||||
|
// consecutive dead draws stacking into a multi-second tail. Race TWO ephemeral
|
||||||
|
// gateway connects and take the first up (see `connect_gateway_racing`). Only
|
||||||
|
// the gateway handshake is doubled — the exit/IPR below is still built ONCE.
|
||||||
|
let client = match entry_gateway {
|
||||||
|
Some(gw) => {
|
||||||
|
MixnetClientBuilder::new_ephemeral()
|
||||||
|
.debug_config(cfg)
|
||||||
|
.request_gateway(gw)
|
||||||
|
.build()?
|
||||||
|
.connect_to_mixnet()
|
||||||
|
.await?
|
||||||
}
|
}
|
||||||
let client = builder.build()?.connect_to_mixnet().await?;
|
None => connect_gateway_racing(cfg).await?,
|
||||||
|
};
|
||||||
|
|
||||||
// Capture the ENTRY GATEWAY actually used, from the client's own nym-address,
|
// Capture the ENTRY GATEWAY actually used, from the client's own nym-address,
|
||||||
// BEFORE `from_client` consumes the client.
|
// BEFORE `from_client` consumes the client.
|
||||||
@@ -865,6 +881,83 @@ async fn build_tunnel(
|
|||||||
Ok((tunnel, entry_gw, ipr))
|
Ok((tunnel, entry_gw, ipr))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Cold/auto gateway connect with a BOUNDED latency tail — the fix for the Nym
|
||||||
|
/// cold-start "gateway lottery". Used ONLY on the auto path (no warm hint), where
|
||||||
|
/// the entry gateway is a RANDOM draw: a dead draw blocks `connect_to_mixnet()`
|
||||||
|
/// until `run_tunnel`'s 10s cap and consecutive dead draws stack into the tail.
|
||||||
|
///
|
||||||
|
/// Race EXACTLY TWO ephemeral `MixnetClient`s — IDENTICAL anonymity `cfg`,
|
||||||
|
/// ephemeral keys, nothing persisted — through the gateway handshake and return
|
||||||
|
/// the FIRST that connects. Only the gateway handshake is doubled; the caller
|
||||||
|
/// builds the exit/IPR ONCE on the winner. Two (not more) bounds the Nym
|
||||||
|
/// free-tier bandwidth burst.
|
||||||
|
///
|
||||||
|
/// The loser is REAPED so a CONNECTED client is never leaked: it is aborted (a
|
||||||
|
/// still-pending connect just drops its half-built client) and, in a DETACHED task
|
||||||
|
/// so the winner returns immediately, `disconnect()`ed IFF it had already
|
||||||
|
/// connected. If BOTH draws fail, the error is returned so `run_tunnel`'s loop
|
||||||
|
/// re-selects — the same contract as the single build.
|
||||||
|
async fn connect_gateway_racing(
|
||||||
|
cfg: nym_sdk::DebugConfig,
|
||||||
|
) -> Result<nym_sdk::mixnet::MixnetClient, smolmix::SmolmixError> {
|
||||||
|
use nym_sdk::mixnet::{MixnetClient, MixnetClientBuilder};
|
||||||
|
|
||||||
|
async fn connect_one(cfg: nym_sdk::DebugConfig) -> Result<MixnetClient, smolmix::SmolmixError> {
|
||||||
|
Ok(MixnetClientBuilder::new_ephemeral()
|
||||||
|
.debug_config(cfg)
|
||||||
|
.build()?
|
||||||
|
.connect_to_mixnet()
|
||||||
|
.await?)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spawn both so the loser can be aborted cleanly. `cfg` is `Copy`, so each task
|
||||||
|
// gets the identical anonymity config.
|
||||||
|
let mut a = tokio::spawn(connect_one(cfg));
|
||||||
|
let mut b = tokio::spawn(connect_one(cfg));
|
||||||
|
|
||||||
|
// Whichever finishes first; keep `other` to reap (on a win) or fall back to (if
|
||||||
|
// the first draw errored).
|
||||||
|
let (first, other) = tokio::select! {
|
||||||
|
r = &mut a => (r, b),
|
||||||
|
r = &mut b => (r, a),
|
||||||
|
};
|
||||||
|
// A JoinError (task panic) folds into an error so `other` still gets its turn.
|
||||||
|
let first = first.unwrap_or_else(|e| {
|
||||||
|
Err(smolmix::SmolmixError::Io(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::Other,
|
||||||
|
format!("nym gateway connect task failed: {e}"),
|
||||||
|
)))
|
||||||
|
});
|
||||||
|
|
||||||
|
match first {
|
||||||
|
// First to finish connected — it WINS. Reap the loser off the hot path.
|
||||||
|
Ok(client) => {
|
||||||
|
other.abort();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
// If the loser connected before the abort landed, disconnect it so
|
||||||
|
// no live gateway session leaks; a pending connect was just dropped.
|
||||||
|
if let Ok(Ok(loser)) = other.await {
|
||||||
|
loser.disconnect().await;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
Ok(client)
|
||||||
|
}
|
||||||
|
// First draw failed — a lone client has no dead-draw tail, so just await the
|
||||||
|
// survivor; if it fails too, surface an error and `run_tunnel` re-selects.
|
||||||
|
Err(first_err) => match other.await {
|
||||||
|
Ok(Ok(client)) => Ok(client),
|
||||||
|
Ok(Err(second_err)) => {
|
||||||
|
warn!(
|
||||||
|
"[timing] nym: both raced gateway connects failed \
|
||||||
|
({first_err}; {second_err}); run_tunnel will re-select"
|
||||||
|
);
|
||||||
|
Err(second_err)
|
||||||
|
}
|
||||||
|
Err(_join) => Err(first_err),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|||||||
Reference in New Issue
Block a user