1
0
forked from GRIN/grim

nym: fast interactive reads + faster reconnect (no money-path change)

Profile/username/NIP-05 reads were ~10s and the tunnel stalled 20s on a
dead gateway. Fixes:
- Profile/accepts/dm-relay fetches stream scoped to their dial set and
  return on the first matching event instead of waiting for every relay
  (or the full timeout) - the ~10s nprofile search.
- HTTP over the mixnet is tunnel-first, scoped-exit only as fallback when
  the tunnel is not up (NIP-11/price/name lookups are public data).
- Name re-verify interval 78s -> 6h (was a debug leftover churning reads).
- Discovery relay NIP-11 probes run in parallel, not sequentially.
- Tunnel build timeout split from the exit dial cap: build 20s -> 10s
  (env GOBLIN_NYM_BUILD_TIMEOUT) so a dead gateway is abandoned fast; the
  exit money-path dial stays 20s.
- Cold start brings the tunnel up first, then prewarms the exit once after
  publish (grant sequencing preserved).
- NIP-05 search bounded to 15s instead of hanging up to ~90s.

Money path (transport.rs, streamexit.rs) byte-for-byte unchanged.
This commit is contained in:
2ro
2026-07-02 18:57:52 -04:00
parent 5733b9a894
commit aa39737d3b
5 changed files with 122 additions and 74 deletions
+12 -1
View File
@@ -1492,7 +1492,18 @@ fn resolve_nip05_blocking(name: &str, domain: &str) -> Option<nip05::Nip05Resolu
.enable_all()
.build()
.ok()?;
rt.block_on(nip05::resolve(&name, &domain))
// Overall 15s cap: without it a miss could block ~90s (up to a 30s tunnel
// wait + a 60s HTTP timeout), which reads to the user as a silent
// indefinite hang. Capping makes a miss fast and retryable instead.
rt.block_on(async {
tokio::time::timeout(
std::time::Duration::from_secs(15),
nip05::resolve(&name, &domain),
)
.await
.ok()
.flatten()
})
})
.join()
.ok()
+42 -17
View File
@@ -67,8 +67,10 @@ const RATE_UNKNOWN_PER_HOUR: usize = 10;
const RESEND_WINDOW_SECS: i64 = 7 * 86_400;
/// How often a cached @username is re-validated against the identity server, so
/// a released or reassigned name stops being shown. Doubles as the freshness
/// gate in `resolve_contact_identity`.
const NAME_REVERIFY_INTERVAL_SECS: i64 = 78;
/// gate in `resolve_contact_identity`. Tuned for release/name-change detection
/// freshness, not liveness — a name rarely changes, so 6h is ample and keeps the
/// mixnet re-verify traffic off the interactive path.
const NAME_REVERIFY_INTERVAL_SECS: i64 = 6 * 3600;
/// Cap on contacts re-verified per sweep, so a large contact list rolls through
/// instead of bursting dozens of simultaneous mixnet lookups at once.
const NAME_REVERIFY_MAX_PER_TICK: usize = 8;
@@ -227,15 +229,25 @@ impl NostrService {
connect_relays(&client, &dial).await;
}
let filter = Filter::new().kind(Kind::Metadata).author(pk).limit(1);
let events = client
.fetch_events(filter, Duration::from_secs(10))
// First-event-wins, scoped to the relays we just dialed: stream from
// exactly that set and return on the FIRST kind-0 that parses as
// Metadata (capped at 10s by the stream's own auto-close). The old
// `fetch_events` waited for EVERY relay (or the full 10s), so a single
// dead hint relay in the set always cost the whole 10s.
use futures::StreamExt;
let mut stream = client
.stream_events_from(dial, filter, Duration::from_secs(10))
.await
.ok()?;
let md: Metadata = serde_json::from_str(&events.first()?.content).ok()?;
Some(NostrProfile {
name: md.name.filter(|s| !s.is_empty()),
nip05: md.nip05.filter(|s| !s.is_empty()),
})
while let Some(event) = stream.next().await {
if let Ok(md) = serde_json::from_str::<Metadata>(&event.content) {
return Some(NostrProfile {
name: md.name.filter(|s| !s.is_empty()),
nip05: md.nip05.filter(|s| !s.is_empty()),
});
}
}
None
})
}
@@ -247,14 +259,23 @@ impl NostrService {
let client = self.client.read().clone()?;
let pk = PublicKey::from_hex(hex).ok()?;
let filter = Filter::new().kind(Kind::Metadata).author(pk).limit(1);
let events = client
.fetch_events(filter, Duration::from_secs(8))
// First-event-wins, scoped to our own connected relays (cap 8s): return on
// the first kind-0 that parses as Metadata rather than waiting on every
// relay / the full timeout, so one dead relay can't stall the request gate.
use futures::StreamExt;
let mut stream = client
.stream_events_from(self.relays(), filter, Duration::from_secs(8))
.await
.ok()?;
let md: Metadata = serde_json::from_str(&events.first()?.content).ok()?;
md.custom
.get("goblin_accepts_requests")
.and_then(|v| v.as_bool())
while let Some(event) = stream.next().await {
if let Ok(md) = serde_json::from_str::<Metadata>(&event.content) {
return md
.custom
.get("goblin_accepts_requests")
.and_then(|v| v.as_bool());
}
}
None
}
/// Republish our kind-0 profile + kind-10050 DM relays (e.g. after toggling
@@ -637,8 +658,12 @@ impl NostrService {
let filter = Filter::new().kind(Kind::InboxRelays).author(*pk).limit(1);
let mut out = vec![];
let mut v3 = false;
if let Ok(events) = client.fetch_events_from(&from, filter, FETCH_TIMEOUT).await
&& let Some(event) = events.first()
// Cap at 10s (not the 30s catch-up FETCH_TIMEOUT): this is on the
// interactive send path, so a slow/dead discovery relay must fail fast and
// fall back to relay hints + our own set rather than stall the send.
if let Ok(events) = client
.fetch_events_from(&from, filter, Duration::from_secs(10))
.await && let Some(event) = events.first()
{
for tag in event.tags.iter() {
let parts = tag.as_slice();
+10 -7
View File
@@ -323,13 +323,16 @@ pub async fn probe(url: &str) -> bool {
/// The pool's "discovery" relays that pass the lazy NIP-11 gate right now.
pub async fn usable_discovery_relays() -> Vec<String> {
let mut out = vec![];
for url in load().discovery_relays() {
if probe(&url).await {
out.push(url);
}
}
out
// Probe every candidate CONCURRENTLY (each is a NIP-11 HTTP round trip over
// the mixnet — sequentially this cost ~N × a full round trip). The PROBES
// cache is RwLock-safe under concurrent access. Zip the pass/fail results back
// to the urls and keep the passing ones in the original pool order.
let urls = load().discovery_relays();
let results = futures::future::join_all(urls.iter().map(|url| probe(url))).await;
urls.into_iter()
.zip(results)
.filter_map(|(url, ok)| ok.then_some(url))
.collect()
}
/// Weighted-random candidate ORDER for the advertised set: the Goblin relay
+9 -6
View File
@@ -136,12 +136,15 @@ async fn request_once(
let https = url.scheme() == "https";
let port = url.port().unwrap_or(if https { 443 } else { 80 });
// MONEY-PATH ANCHOR fork: HTTPS to a host whose relay advertises a
// co-located scoped Nym exit (its NIP-11 probe, in practice) rides a
// MixnetStream to that exit instead of the tunnel — no public DNS, no
// public IPR. Failure just falls through to the tunnel path below (anchor
// + fallback, never pin-only).
let exit_io = if https {
// TUNNEL-FIRST for HTTP. NIP-11/HTTP is PUBLIC data (relay docs, price, name
// authority) and both egresses are mixnet-private, so in steady state we ride
// the already-warm tunnel — opening a fresh MixnetStream + settle to a scoped
// exit PER request was pure latency here. Only when the tunnel isn't up yet
// (`!is_ready()`) do we fall to a host's co-located scoped exit to avoid a cold
// wait; failure there just falls through to the tunnel path below. transport.rs
// (relay websockets) stays exit-first and is untouched — this is the HTTP path
// only.
let exit_io = if https && !nymproc::is_ready() {
match crate::nostr::pool::load().exit_for_host(&host) {
Some(exit) => exit_connect(&host, &exit).await,
None => None,
+49 -43
View File
@@ -79,6 +79,10 @@ static RELAY_CONSUMER: AtomicBool = AtomicBool::new(false);
/// Guards the background bootstrap thread so `warm_up()` is idempotent.
static STARTED: AtomicBool = AtomicBool::new(false);
/// Guards the one-shot scoped-exit prewarm so it fires exactly once — after the
/// FIRST tunnel is published — and never again on a later reselect.
static PREWARMED: AtomicBool = AtomicBool::new(false);
/// Pre-warm the mixnet tunnel in the background so relays / NIP-05 / price are
/// ready by first use. Idempotent — later calls (including the lazy-init path
/// in [`wait_for_tunnel`]) are no-ops.
@@ -190,31 +194,12 @@ fn run_tunnel() {
// True while a FALLBACK (auto-selected) exit carries the traffic even
// though an anchor is configured — makes the ANCHOR RECOVERED log honest.
let mut fell_back = false;
// COLD-START SEQUENCING (money path first): if the pool advertises a
// co-located scoped exit, let ITS mixnet client grab its Nym free-tier
// bandwidth grant before this tunnel competes for one. Two ephemeral
// clients bootstrapping at once serialize on the grant (~1 min); waiting a
// bounded head-start for the exit client means only ONE bootstraps at a
// time, so the money-path relay connects in seconds and this tunnel
// (fallback / HTTP / discovery, all non-blocking) builds right after. No
// exit in the pool → no wait. Cold start only: on a later reselect the
// exit is long-ready, so `is_ready()` returns instantly.
if crate::nostr::pool::load().has_exit() {
// Kick the exit client's bootstrap NOW — nothing else touches it
// until the first relay dial (after a wallet opens), so waiting
// without this would just burn the head start and the grant race
// would happen anyway.
tokio::spawn(super::streamexit::prewarm());
let head_start = Instant::now();
while !super::streamexit::is_ready() && head_start.elapsed() < EXIT_HEAD_START {
tokio::time::sleep(Duration::from_millis(200)).await;
}
info!(
"[timing] nym: tunnel bootstrap proceeding after {}ms exit head-start (exit ready: {})",
head_start.elapsed().as_millis(),
super::streamexit::is_ready()
);
}
// COLD-START SEQUENCING (reads-first): the TUNNEL bootstraps first and takes
// its Nym free-tier bandwidth grant, so interactive reads get the tunnel
// ~2-3s sooner. The scoped money-path exit is prewarmed AFTER the first
// tunnel is published (see the `PREWARMED` guard below `MIXNET_READY`), which
// preserves grant-sequencing (tunnel first, then exit) without making reads
// wait out an exit head-start on cold start.
loop {
let started = Instant::now();
attempt += 1;
@@ -240,7 +225,8 @@ fn run_tunnel() {
// Cap the build: a dead gateway pick otherwise blocks on the Nym SDK's
// own long "connection response" timeout (~74s measured) before we can
// reselect. Abandoning the future drops the half-built tunnel.
let build = match tokio::time::timeout(BOOTSTRAP_TIMEOUT, build_tunnel(pin)).await {
let build_cap = tunnel_build_timeout();
let build = match tokio::time::timeout(build_cap, build_tunnel(pin)).await {
Ok(result) => result,
Err(_) => {
if choice == ExitChoice::Anchor {
@@ -249,14 +235,14 @@ fn run_tunnel() {
warn!(
"[timing] nym: ANCHOR DEAD — anchor build exceeded {}s (attempt {attempt}); \
FALLBACK to auto-select now",
BOOTSTRAP_TIMEOUT.as_secs()
build_cap.as_secs()
);
continue;
}
warn!(
"[timing] nym: DEAD GATEWAY — build_tunnel exceeded {}s (attempt {attempt}); \
re-selecting immediately",
BOOTSTRAP_TIMEOUT.as_secs()
build_cap.as_secs()
);
delay = Duration::from_secs(5);
continue;
@@ -328,6 +314,16 @@ fn run_tunnel() {
}
*TUNNEL.write() = Some(tunnel.clone());
MIXNET_READY.store(true, Ordering::Relaxed);
// Prewarm the scoped money-path exit ONCE, now that the tunnel is
// up (grant-sequencing: the tunnel already took its grant, the exit
// takes the next one) — but reads already have the tunnel. Guarded
// so a later reselect never re-fires it, and gated on the pool
// actually advertising a co-located exit.
if crate::nostr::pool::load().has_exit()
&& !PREWARMED.swap(true, Ordering::SeqCst)
{
tokio::spawn(super::streamexit::prewarm());
}
delay = Duration::from_secs(5);
// Hold the exit warm and govern its health. The watchdog weighs TWO
// signals: the cheap DNS keepalive (as before) AND — authoritatively,
@@ -427,23 +423,33 @@ const RELAY_HARD_GRACE: Duration = Duration::from_secs(90);
/// mixnet into the 2-3 minute loop this build fixes.
const MIN_EXIT_LIFETIME: Duration = Duration::from_secs(20);
/// Abandon a single `build_tunnel()` that hasn't finished within this and
/// re-select. A healthy gateway+IPR bootstrap completes in ~4-7s; without this
/// cap a DEAD first pick blocked for ~74s (measured) on the Nym SDK's own
/// "listening for connection response" timeout before we even got to reselect.
/// A few seconds of patience, not a minute. Shared with the scoped-exit egress
/// ([`super::streamexit`]) as ITS dial cap, so both mixnet bootstraps fail
/// equally fast.
/// The scoped-exit (money-path) mixnet dial cap: how long
/// [`super::streamexit::open_stream`] (and the HTTP exit fallback in
/// [`super::exit_connect`]) may spend bootstrapping before failing over. Without a
/// cap a DEAD pick blocked for ~74s (measured) on the Nym SDK's own "listening for
/// connection response" timeout. The TUNNEL's own build uses the shorter
/// [`TUNNEL_BUILD_TIMEOUT`]; this stays at 20s so the money path — which has no
/// tunnel to fall back to — gets more patience before it gives up.
pub(crate) const BOOTSTRAP_TIMEOUT: Duration = Duration::from_secs(20);
/// Cold-start head start for the scoped-exit client: the public-IPR tunnel waits
/// up to this long for [`super::streamexit::is_ready`] before it bootstraps, so
/// the money-path exit client claims its Nym free-tier bandwidth grant FIRST and
/// the two ephemeral clients don't serialize on the grant (~1 min otherwise; see
/// the cold-start sequencer in [`run_tunnel`] and the NOTE in
/// [`super::streamexit`]). Bounded so a missing/failed exit never holds the
/// tunnel more than briefly; the exit typically readies well inside it.
const EXIT_HEAD_START: Duration = Duration::from_secs(12);
/// Abandon a single `build_tunnel()` that hasn't finished within this and
/// re-select — the TUNNEL's build cap (the exit keeps [`BOOTSTRAP_TIMEOUT`] as
/// its money-path dial cap). A healthy gateway+IPR bootstrap completes in ~4-7s,
/// so 10s gives one slow-but-working build room while a dead first pick is
/// abandoned in half the old 20s. Runtime-overridable (seconds) via
/// `GOBLIN_NYM_BUILD_TIMEOUT` for the timing harness.
const TUNNEL_BUILD_TIMEOUT: Duration = Duration::from_secs(10);
/// The effective tunnel build cap: [`TUNNEL_BUILD_TIMEOUT`] unless
/// `GOBLIN_NYM_BUILD_TIMEOUT` (whole seconds) overrides it. Re-read each attempt
/// so a timing harness can flip it without a restart.
///
/// The override is ignored (and the default applies) when the variable is
/// unset, is not valid Unicode, does not parse as a whole number of seconds, or
/// is `0` — a zero cap would abandon every build attempt almost immediately, so
/// the tunnel could never come up. Surrounding whitespace in the value is
/// tolerated.
fn tunnel_build_timeout() -> Duration {
    std::env::var("GOBLIN_NYM_BUILD_TIMEOUT")
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        // Reject 0: a zero-length cap is never a usable build timeout.
        .filter(|&secs| secs > 0)
        .map(Duration::from_secs)
        .unwrap_or(TUNNEL_BUILD_TIMEOUT)
}
/// Watchdog poll cadence. The relay-reachability check is a bare atomic load
/// (free), so a short cadence costs nothing and never touches the network; the