fix(dns): reduce positive TTL to 60s and shuffle resolved IPs

The 1800s minimum TTL defeated CDN failover mechanisms (e.g. Fastly
  publishes 30–60s A-record TTLs specifically to signal when edge nodes
  are removed). Dead IPs were cached for up to 30 minutes with no
  way for the client to recover without a restart.

  - Drop DEFAULT_POSITIVE_LOOKUP_CACHE_TTL from 1800s to 60s so that
    CDN-signalled failovers take effect within a minute
  - Shuffle resolved IPs on each lookup so retries cycle through all
    available edge nodes rather than hitting the same dead address
  - Add invalidate_preresolve_entry / invalidate_preresolve_for API
    for callers that want targeted per-host cache eviction on hard
    connection failures
This commit is contained in:
Rachyandco
2026-04-19 22:58:11 +02:00
parent 7140ba4ea9
commit bf85e9eb79
4 changed files with 39 additions and 17 deletions
Generated
+1
View File
@@ -6926,6 +6926,7 @@ dependencies = [
"bytes",
"cfg-if",
"encoding_rs",
"fastrand",
"hickory-resolver",
"http 1.3.1",
"inventory",
+1
View File
@@ -36,6 +36,7 @@ thiserror = { workspace = true }
tracing = { workspace = true }
itertools = { workspace = true }
inventory = { workspace = true }
fastrand = { workspace = true }
tokio = { workspace = true, features = ["rt", "macros", "time"] }
rustls = { workspace=true }
# used for decoding text responses (they were already implicitly included)
+31 -17
View File
@@ -56,7 +56,6 @@ use std::{
use hickory_resolver::{
TokioResolver,
config::{NameServerConfig, NameServerConfigGroup, ResolverConfig, ResolverOpts},
lookup_ip::LookupIpIntoIter,
name_server::TokioConnectionProvider,
};
use once_cell::sync::OnceCell;
@@ -67,7 +66,12 @@ mod constants;
mod static_resolver;
pub(crate) use static_resolver::*;
pub(crate) const DEFAULT_POSITIVE_LOOKUP_CACHE_TTL: Duration = Duration::from_secs(1800);
// Fastly (and similar CDNs) deliberately publish short A-record TTLs (30–60 s)
// so that clients re-resolve when an edge node is removed. Pinning to 1800 s
// defeats that mechanism and leaves the client hitting a dead IP for up to
// 30 minutes after a Fastly failover. 60 s is long enough to amortise DNS
// query overhead while still following CDN-signalled failovers within a minute.
pub(crate) const DEFAULT_POSITIVE_LOOKUP_CACHE_TTL: Duration = Duration::from_secs(60);
pub(crate) const DEFAULT_OVERALL_LOOKUP_TIMEOUT: Duration = Duration::from_secs(10);
pub(crate) const DEFAULT_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
@@ -227,9 +231,13 @@ async fn resolve(
let primary_err = match resolve_fut.await {
Err(_) => ResolveError::Timeout,
Ok(Ok(lookup)) => {
let addrs: Addrs = Box::new(SocketAddrs {
iter: lookup.into_iter(),
});
// Shuffle so that successive connection attempts cycle through all
// returned IPs rather than always hitting the same first address.
// This distributes retries across the full set of CDN edge nodes
// even within the cache TTL window.
let mut ips: Vec<IpAddr> = lookup.into_iter().collect();
fastrand::shuffle(&mut ips);
let addrs: Addrs = Box::new(ips.into_iter().map(|ip| SocketAddr::new(ip, 0)));
return Ok(addrs);
}
Ok(Err(e)) => {
@@ -256,18 +264,6 @@ async fn resolve(
Err(primary_err)
}
/// Iterator adapter that presents the IP addresses of a DNS lookup as
/// socket addresses.
struct SocketAddrs {
    /// Underlying iterator over the looked-up IP addresses.
    iter: LookupIpIntoIter,
}
impl Iterator for SocketAddrs {
    type Item = SocketAddr;

    /// Yields the next looked-up IP address paired with port 0, or `None`
    /// once the underlying iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let ip_addr = self.iter.next()?;
        Some(SocketAddr::new(ip_addr, 0))
    }
}
impl HickoryDnsResolver {
/// Returns an instance of the shared resolver.
pub fn shared() -> Self {
@@ -380,6 +376,24 @@ impl HickoryDnsResolver {
}
}
/// Remove the preresolve cache entry for a single host, forcing the next
/// lookup for that host to go through the network resolver again.
///
/// Call this after a hard connection failure (RST, ECONNREFUSED, timeout)
/// to ensure the next attempt gets a fresh DNS answer rather than a cached
/// IP that may no longer be reachable.
pub fn invalidate_preresolve_for(&self, name: &str) {
debug!("invalidating pre-resolve for {name}");
if let Some(cell) = &self.static_base
&& let Some(static_base) = cell.get()
{
static_base.invalidate_preresolve_entry(name)
}
if self.use_shared {
SHARED_RESOLVER.invalidate_preresolve_for(name);
}
}
/// Get the current map of hostnames to addresses used in the fallback static lookup stage if one
/// exists.
pub fn get_static_fallbacks(&self) -> Option<HashMap<String, Vec<IpAddr>>> {
@@ -107,6 +107,12 @@ impl StaticResolver {
self.fallback_addr_map.lock().unwrap().extend(addrs);
}
/// Drop the pre-resolve entry for `name`, if there is one.
///
/// Other hosts in the pre-resolve table are left untouched. Removing a host
/// that has no entry is a no-op.
///
/// # Panics
///
/// Panics if the mutex guarding the pre-resolve table is poisoned.
pub fn invalidate_preresolve_entry(&self, name: &str) {
    let mut preresolved = self.preresolve_addr_map.lock().unwrap();
    preresolved.remove(name);
}
/// Clear entries from the static table that would return entries during the pre-resolve stage.
/// This means that all lookups will attempt to use the network resolver again before the static
/// table is consulted.