fix(dns): reduce positive TTL to 60s and shuffle resolved IPs
The 1800s minimum TTL defeated CDN failover mechanisms (e.g. Fastly
publishes 30–60s A-record TTLs specifically to signal when edge nodes
are removed). Dead IPs were cached for up to 30 minutes with no
way for the client to recover without a restart.
- Drop DEFAULT_POSITIVE_LOOKUP_CACHE_TTL from 1800s to 60s so that
CDN-signalled failovers take effect within a minute
- Shuffle resolved IPs on each lookup so retries are spread across the
available edge nodes rather than always hitting the same dead address
- Add invalidate_preresolve_entry / invalidate_preresolve_for API
for callers that want targeted per-host cache eviction on hard
connection failures
This commit is contained in:
Generated
+1
@@ -6926,6 +6926,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"cfg-if",
|
||||
"encoding_rs",
|
||||
"fastrand",
|
||||
"hickory-resolver",
|
||||
"http 1.3.1",
|
||||
"inventory",
|
||||
|
||||
@@ -36,6 +36,7 @@ thiserror = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
itertools = { workspace = true }
|
||||
inventory = { workspace = true }
|
||||
fastrand = { workspace = true }
|
||||
tokio = { workspace = true, features = ["rt", "macros", "time"] }
|
||||
rustls = { workspace=true }
|
||||
# used for decoding text responses (they were already implicitly included)
|
||||
|
||||
@@ -56,7 +56,6 @@ use std::{
|
||||
use hickory_resolver::{
|
||||
TokioResolver,
|
||||
config::{NameServerConfig, NameServerConfigGroup, ResolverConfig, ResolverOpts},
|
||||
lookup_ip::LookupIpIntoIter,
|
||||
name_server::TokioConnectionProvider,
|
||||
};
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -67,7 +66,12 @@ mod constants;
|
||||
mod static_resolver;
|
||||
pub(crate) use static_resolver::*;
|
||||
|
||||
pub(crate) const DEFAULT_POSITIVE_LOOKUP_CACHE_TTL: Duration = Duration::from_secs(1800);
|
||||
// Fastly (and similar CDNs) deliberately publish short A-record TTLs (30–60 s)
|
||||
// so that clients re-resolve when an edge node is removed. Pinning to 1800 s
|
||||
// defeats that mechanism and leaves the client hitting a dead IP for up to
|
||||
// 30 minutes after a Fastly failover. 60 s is long enough to amortise DNS
|
||||
// query overhead while still following CDN-signalled failovers within a minute.
|
||||
pub(crate) const DEFAULT_POSITIVE_LOOKUP_CACHE_TTL: Duration = Duration::from_secs(60);
|
||||
pub(crate) const DEFAULT_OVERALL_LOOKUP_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
pub(crate) const DEFAULT_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
|
||||
@@ -227,9 +231,13 @@ async fn resolve(
|
||||
let primary_err = match resolve_fut.await {
|
||||
Err(_) => ResolveError::Timeout,
|
||||
Ok(Ok(lookup)) => {
|
||||
let addrs: Addrs = Box::new(SocketAddrs {
|
||||
iter: lookup.into_iter(),
|
||||
});
|
||||
// Shuffle so that successive connection attempts are spread across the
|
||||
// returned IPs rather than always hitting the same first address.
|
||||
// This distributes retries across the full set of CDN edge nodes
|
||||
// even within the cache TTL window.
|
||||
let mut ips: Vec<IpAddr> = lookup.into_iter().collect();
|
||||
fastrand::shuffle(&mut ips);
|
||||
let addrs: Addrs = Box::new(ips.into_iter().map(|ip| SocketAddr::new(ip, 0)));
|
||||
return Ok(addrs);
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
@@ -256,18 +264,6 @@ async fn resolve(
|
||||
Err(primary_err)
|
||||
}
|
||||
|
||||
struct SocketAddrs {
|
||||
iter: LookupIpIntoIter,
|
||||
}
|
||||
|
||||
impl Iterator for SocketAddrs {
|
||||
type Item = SocketAddr;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.iter.next().map(|ip_addr| SocketAddr::new(ip_addr, 0))
|
||||
}
|
||||
}
|
||||
|
||||
impl HickoryDnsResolver {
|
||||
/// Returns an instance of the shared resolver.
|
||||
pub fn shared() -> Self {
|
||||
@@ -380,6 +376,24 @@ impl HickoryDnsResolver {
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove the preresolve cache entry for a single host, forcing the next
|
||||
/// lookup for that host to go through the network resolver again.
|
||||
///
|
||||
/// Call this after a hard connection failure (RST, ECONNREFUSED, timeout)
|
||||
/// to ensure the next attempt gets a fresh DNS answer rather than a cached
|
||||
/// IP that may no longer be reachable.
|
||||
pub fn invalidate_preresolve_for(&self, name: &str) {
|
||||
debug!("invalidating pre-resolve for {name}");
|
||||
if let Some(cell) = &self.static_base
|
||||
&& let Some(static_base) = cell.get()
|
||||
{
|
||||
static_base.invalidate_preresolve_entry(name)
|
||||
}
|
||||
if self.use_shared {
|
||||
SHARED_RESOLVER.invalidate_preresolve_for(name);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the current map of hostnames to addresses used in the fallback static lookup stage if one
|
||||
/// exists.
|
||||
pub fn get_static_fallbacks(&self) -> Option<HashMap<String, Vec<IpAddr>>> {
|
||||
|
||||
@@ -107,6 +107,12 @@ impl StaticResolver {
|
||||
self.fallback_addr_map.lock().unwrap().extend(addrs);
|
||||
}
|
||||
|
||||
/// Remove a single host from the preresolve table, forcing the next lookup
|
||||
/// for that host to go through the network resolver again.
|
||||
pub fn invalidate_preresolve_entry(&self, name: &str) {
|
||||
self.preresolve_addr_map.lock().unwrap().remove(name);
|
||||
}
|
||||
|
||||
/// Clear entries from the static table that would return entries during the pre-resolve stage.
|
||||
/// This means that all lookups will attempt to use the network resolver again before the static
|
||||
/// table is consulted.
|
||||
|
||||
Reference in New Issue
Block a user