DEV(cache_critical_dns): improve error reporting for failures

There are two failure modes that can be expected: no target SRV DNS RRs
are found, or no healthy service is available at the target addresses.
Prior to this patch, the log messages gave no way to differentiate
between the two cases.

Introduce an EmptyCacheError exception which may be raised by either the
ResolverCache or HealthyCache. The exception message contains enough
information about where the exception occurred to troubleshoot issues.

An existing bug was fixed in this commit. Previously, if a target address
changed during runtime, an old cached (healthy) address would be
returned. The behaviour has been corrected to return the newly cached
address.
This commit is contained in:
Michael Fitz-Payne 2023-03-07 12:20:24 +10:00 committed by Michael Fitz-Payne
parent 5ea89d1fcb
commit f38779adf4

@ -177,6 +177,8 @@ end
CacheMeta = Struct.new(:first_seen, :last_seen) CacheMeta = Struct.new(:first_seen, :last_seen)
class EmptyCacheError < StandardError; end
class ResolverCache class ResolverCache
def initialize(name) def initialize(name)
# instance of Name|SRVName # instance of Name|SRVName
@ -186,22 +188,29 @@ class ResolverCache
@cached = {} @cached = {}
end end
# resolve returns a list of resolved addresses ordered by the time first seen, # Returns a list of resolved addresses ordered by first seen time. Most
# most recently seen at the head of the list. # recently seen address is first.
# Addresses last seen more than 30 minutes ago are evicted from the cache on # If an exception occurs during DNS resolution we return whatever addresses
# a call to resolve(). # are cached.
# If an exception occurs during DNS resolution we return whatever addresses are # Addresses last seen more than 30 minutes ago are evicted from the cache.
# cached. # Raises EmptyCacheError if the cache is empty after DNS resolution and cache
# eviction is performed.
def resolve def resolve
@name.resolve.each do |address| begin
if @cached[address] @name.resolve.each do |address|
@cached[address].last_seen = Time.now.utc if @cached[address]
else @cached[address].last_seen = Time.now.utc
@cached[address] = CacheMeta.new(Time.now.utc, Time.now.utc) else
@cached[address] = CacheMeta.new(Time.now.utc, Time.now.utc)
end
end end
rescue Resolv::ResolvError, Resolv::ResolvTimeout
end end
ensure
@cached = @cached.delete_if { |_, meta| Time.now.utc - 30 * 60 > meta.last_seen } @cached = @cached.delete_if { |_, meta| Time.now.utc - 30 * 60 > meta.last_seen }
if @cached.empty?
raise EmptyCacheError, "DNS resolver found no usable addresses"
end
@cached.sort_by { |_, meta| meta.first_seen }.reverse.map(&:first) @cached.sort_by { |_, meta| meta.first_seen }.reverse.map(&:first)
end end
end end
@ -213,11 +222,25 @@ class HealthyCache
@cached = nil # a single IP address that was most recently found to be healthy @cached = nil # a single IP address that was most recently found to be healthy
end end
# Returns the first healthy server found in the list of resolved addresses.
# Returns the last known healthy server if all servers disappear from the
# DNS.
# Raises EmptyCacheError if no healthy servers have been cached.
def first_healthy def first_healthy
address = @resolver_cache.resolve.lazy.select { |addr| @check.call(addr) }.first begin
if !nilempty(address).nil? addresses = @resolver_cache.resolve
rescue EmptyCacheError
return @cached if @cached
raise
end
if (address = addresses.lazy.select { |addr| @check.call(addr) }.first)
@cached = address @cached = address
end end
if @cached.nil?
raise EmptyCacheError, "no healthy servers found amongst #{addresses}"
end
@cached @cached
end end
end end
@ -413,8 +436,6 @@ def run_and_report(hostname_vars)
end end
def run(hostname_vars) def run(hostname_vars)
# HOSTNAME: [IP_ADDRESS, ...]
# this will usually be a single address
resolved = {} resolved = {}
errors = Hash.new(0) errors = Hash.new(0)
@ -431,13 +452,9 @@ def run(hostname_vars)
HOST_HEALTHY_CACHE[var] ||= HealthyCache.new(HOST_RESOLVER_CACHE[var], HEALTH_CHECKS[var.to_sym]) HOST_HEALTHY_CACHE[var] ||= HealthyCache.new(HOST_RESOLVER_CACHE[var], HEALTH_CHECKS[var.to_sym])
begin begin
if (address = HOST_HEALTHY_CACHE[var].first_healthy) address = HOST_HEALTHY_CACHE[var].first_healthy
resolved[name] = [address] resolved[name] = [address]
else rescue EmptyCacheError => e
error("#{var}: #{name}: no address")
errors[name] += 1
end
rescue => e
error("#{var}: #{name}: #{e}") error("#{var}: #{name}: #{e}")
errors[name] += 1 errors[name] += 1
end end
@ -458,9 +475,8 @@ def run(hostname_vars)
if changed if changed
File.write(HOSTS_PATH, hosts_content) File.write(HOSTS_PATH, hosts_content)
end end
rescue => e rescue => e
error("DNS lookup failed: #{e}") error("unhandled exception during run: #{e}")
errors[nil] = 1 errors[nil] = 1
ensure ensure
return errors return errors