From f38779adf4d038af2bc093a1c0d85dbd7d86f547 Mon Sep 17 00:00:00 2001 From: Michael Fitz-Payne Date: Tue, 7 Mar 2023 12:20:24 +1000 Subject: [PATCH] DEV(cache_critical_dns): improve error reporting for failures There are two failure modes that can be expected - no target SRV DNS RRs found or no healthy service available at target addresses. Prior to this patch, there was no way to differentiate from log messages between the two cases. Introduce an EmptyCache exception which may be raised by either the ResolverCache or HealthyCache. The exception message contains enough information about where the exception occurred to troubleshoot issues. An existing bug was fixed in this commit. Previously if a target address changed during runtime, an old cached (healthy) address would be returned.. The behaviour has been corrected to return the newly cached address. --- script/cache_critical_dns | 66 ++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/script/cache_critical_dns b/script/cache_critical_dns index 76b679cdbbd..d45bbf47ce3 100755 --- a/script/cache_critical_dns +++ b/script/cache_critical_dns @@ -177,6 +177,8 @@ end CacheMeta = Struct.new(:first_seen, :last_seen) +class EmptyCacheError < StandardError; end + class ResolverCache def initialize(name) # instance of Name|SRVName @@ -186,22 +188,29 @@ class ResolverCache @cached = {} end - # resolve returns a list of resolved addresses ordered by the time first seen, - # most recently seen at the head of the list. - # Addresses last seen more than 30 minutes ago are evicted from the cache on - # a call to resolve(). - # If an exception occurs during DNS resolution we return whatever addresses are - # cached. + # Returns a list of resolved addresses ordered by first seen time. Most + # recently seen address is first. + # If an exception occurs during DNS resolution we return whatever addresses + # are cached. + # Addresses last seen more than 30 minutes ago are evicted from the cache. + # Raises EmptyCacheError if the cache is empty after DNS resolution and cache + # eviction is performed. def resolve - @name.resolve.each do |address| - if @cached[address] - @cached[address].last_seen = Time.now.utc - else - @cached[address] = CacheMeta.new(Time.now.utc, Time.now.utc) + begin + @name.resolve.each do |address| + if @cached[address] + @cached[address].last_seen = Time.now.utc + else + @cached[address] = CacheMeta.new(Time.now.utc, Time.now.utc) + end end + rescue Resolv::ResolvError, Resolv::ResolvTimeout end - ensure + @cached = @cached.delete_if { |_, meta| Time.now.utc - 30 * 60 > meta.last_seen } + if @cached.empty? + raise EmptyCacheError, "DNS resolver found no usable addresses" + end @cached.sort_by { |_, meta| meta.first_seen }.reverse.map(&:first) end end @@ -213,11 +222,25 @@ class HealthyCache @cached = nil # a single IP address that was most recently found to be healthy end + # Returns the first healthy server found in the list of resolved addresses. + # Returns the last known healthy server if all servers disappear from the + # DNS. + # Raises EmptyCacheError if no healthy servers have been cached. def first_healthy - address = @resolver_cache.resolve.lazy.select { |addr| @check.call(addr) }.first - if !nilempty(address).nil? + begin + addresses = @resolver_cache.resolve + rescue EmptyCacheError + return @cached if @cached + raise + end + + if (address = addresses.lazy.select { |addr| @check.call(addr) }.first) @cached = address end + if @cached.nil? + raise EmptyCacheError, "no healthy servers found amongst #{addresses}" + end + @cached end end @@ -413,8 +436,6 @@ def run_and_report(hostname_vars) end def run(hostname_vars) - # HOSTNAME: [IP_ADDRESS, ...] - # this will usually be a single address resolved = {} errors = Hash.new(0) @@ -431,13 +452,9 @@ def run(hostname_vars) HOST_HEALTHY_CACHE[var] ||= HealthyCache.new(HOST_RESOLVER_CACHE[var], HEALTH_CHECKS[var.to_sym]) begin - if (address = HOST_HEALTHY_CACHE[var].first_healthy) - resolved[name] = [address] - else - error("#{var}: #{name}: no address") - errors[name] += 1 - end - rescue => e + address = HOST_HEALTHY_CACHE[var].first_healthy + resolved[name] = [address] + rescue EmptyCacheError => e error("#{var}: #{name}: #{e}") errors[name] += 1 end @@ -458,9 +475,8 @@ def run(hostname_vars) if changed File.write(HOSTS_PATH, hosts_content) end - rescue => e - error("DNS lookup failed: #{e}") + error("unhandled exception during run: #{e}") errors[nil] = 1 ensure return errors