FEATURE: cache https redirects per hostname

If a hostname redirects from http to https, we cache that fact so the
next lookup does not incur the extra redirect.

Also, only rate limit each IP once per FinalDestination lookup.

Raise the final destination rate limit to 1000 IP lookups per hour.
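
A quick sketch of the intended behavior (illustrative URLs, using the public API from the diff below):

    # first lookup: http -> 302 -> https, and the domain gets cached as https
    FinalDestination.new("http://wikipedia.com/image.png").resolve

    # later lookups upgrade the scheme up front and skip the http round trip
    FinalDestination.new("http://wikipedia.com/image2.png").resolve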
Sam Saffron 2017-10-17 16:22:38 +11:00
parent 646c6eb7cd
commit 8185b8cb06
2 changed files with 60 additions and 4 deletions


@@ -1,11 +1,30 @@
require "socket"
require "ipaddr"
require 'socket'
require 'ipaddr'
require 'excon'
require 'rate_limiter'
# Determine the final endpoint for a Web URI, following redirects
class FinalDestination
def self.clear_https_cache!(domain)
  key = redis_https_key(domain)
  $redis.without_namespace.del(key)
end

def self.cache_https_domain(domain)
  key = redis_https_key(domain)
  # setex args are (key, ttl, value): remember the domain as https for a day
  $redis.without_namespace.setex(key, 1.day.to_i, "1").present?
end

def self.is_https_domain?(domain)
  key = redis_https_key(domain)
  $redis.without_namespace.get(key).present?
end

def self.redis_https_key(domain)
  "HTTPS_DOMAIN_#{domain}"
end
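# Usage sketch for the helpers above (illustrative domain; assumes $redis is configured):
#   FinalDestination.cache_https_domain("wikipedia.com") # caches for 1 day
#   FinalDestination.is_https_domain?("wikipedia.com")   # => true
#   FinalDestination.clear_https_cache!("wikipedia.com") # forget the domain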
attr_reader :status, :cookie, :status_code
def initialize(url, opts = nil)
@@ -31,6 +50,7 @@ class FinalDestination
@status = :ready
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
@cookie = nil
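# IPs already counted against the crawl rate limit during this lookup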
@limited_ips = []
end
def self.connection_timeout
@@ -66,6 +86,11 @@ class FinalDestination
end
def resolve
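# if we already know this host serves https, upgrade the scheme before
# issuing any request so we skip the http -> https redirect entirely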
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
@uri.scheme = "https"
@uri = URI(@uri.to_s)
end
if @limit < 0
@status = :too_many_redirects
return nil
@@ -132,9 +157,17 @@ class FinalDestination
end
if location
old_port = @uri.port
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
@uri = URI(location) rescue nil
@limit -= 1
# http -> https redirect, so cache that the whole domain is https
if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri)
FinalDestination.cache_https_domain(@uri.hostname)
end
return resolve
end
@@ -191,8 +224,9 @@ class FinalDestination
end
# Rate limit how often this IP can be crawled
if !@opts[:skip_rate_limit] && !@limited_ips.include?(address)
@limited_ips << address
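# performed! raises RateLimiter::LimitExceeded once the hourly budget is spent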
RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed!
end
true


@@ -20,6 +20,7 @@ describe FinalDestination do
when 'internal-ipv6.com' then '2001:abc:de:01:3:3d0:6a65:c2bf'
when 'ignore-me.com' then '53.84.143.152'
when 'force.get.com' then '22.102.29.40'
when 'wikipedia.com' then '1.2.3.4'
else
as_ip = IPAddr.new(host) rescue nil
raise "couldn't lookup #{host}" if as_ip.nil?
@@ -308,6 +309,27 @@ describe FinalDestination do
end
end
describe "https cache" do
it 'will cache https lookups' do
FinalDestination.clear_https_cache!("wikipedia.com")
stub_request(:head, "http://wikipedia.com/image.png")
.to_return(status: 302, body: "", headers: { location: 'https://wikipedia.com/image.png' })
stub_request(:head, "https://wikipedia.com/image.png")
.to_return(status: 200, body: "", headers: {})
stub_request(:get, "https://wikipedia.com/image.png").to_return(status: 200, body: "", headers: {})
fd('http://wikipedia.com/image.png').resolve
stub_request(:head, "https://wikipedia.com/image2.png")
.to_return(status: 200, body: "", headers: {})
stub_request(:get, "https://wikipedia.com/image2.png").to_return(status: 200, body: "", headers: {})
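# note: http://wikipedia.com/image2.png is deliberately not stubbed; if the
# https cache did not upgrade the scheme, WebMock would raise here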
fd('http://wikipedia.com/image2.png').resolve
end
end
describe "#escape_url" do
it "correctly escapes url" do
fragment_url = "https://eviltrout.com/2016/02/25/fixing-android-performance.html#discourse-comments"