From fa5880e04f7e04be8e6780e8ab32e754a1118264 Mon Sep 17 00:00:00 2001
From: Sam
Date: Mon, 29 Jan 2018 15:36:52 +1100
Subject: [PATCH] PERF: ability to crawl for titles without extra HEAD req

Also introduces a much more aggressive timeout for title crawling,
and adds gzip handling for the crawled body.
---
 lib/final_destination.rb                  | 124 +++++++++++++++++++++-
 lib/retrieve_title.rb                     |  41 ++++---
 spec/components/final_destination_spec.rb |  26 +++++
 spec/components/retrieve_title_spec.rb    |  11 ++
 4 files changed, 176 insertions(+), 26 deletions(-)

diff --git a/lib/final_destination.rb b/lib/final_destination.rb
index acc922e61fd..c6c989de9a9 100644
--- a/lib/final_destination.rb
+++ b/lib/final_destination.rb
@@ -51,12 +51,17 @@ class FinalDestination
     @cookie = nil
     @limited_ips = []
     @verbose = @opts[:verbose] || false
+    @timeout = @opts[:timeout] || nil
   end
 
   def self.connection_timeout
     20
   end
 
+  def timeout
+    @timeout || FinalDestination.connection_timeout
+  end
+
   def redirected?
     @limit < @opts[:max_redirects]
   end
@@ -75,12 +80,58 @@ class FinalDestination
 
   def small_get(headers)
     Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS)) do |http|
-      http.open_timeout = FinalDestination.connection_timeout
-      http.read_timeout = FinalDestination.connection_timeout
+      http.open_timeout = timeout
+      http.read_timeout = timeout
       http.request_get(@uri.request_uri, headers)
     end
   end
 
+  # this is a new interface for simply getting
+  # N bytes accounting for all internal logic
+  def get(uri = @uri, redirects = @limit, extra_headers: {}, &blk)
+    raise "Must specify block" unless block_given?
+
+    if uri && uri.port == 80 && FinalDestination.is_https_domain?(uri.hostname)
+      uri.scheme = "https"
+      uri = URI(uri.to_s)
+    end
+
+    unless validate_uri
+      return nil
+    end
+
+    result, (location, cookie) = safe_get(uri, &blk)
+
+    if result == :redirect && (redirects == 0 || !location)
+      return nil
+    end
+
+    if result == :redirect
+      old_port = uri.port
+
+      location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
+      uri = URI(location) rescue nil
+
+      # https redirect, so just cache that whole new domain is https
+      if old_port == 80 && uri&.port == 443 && (URI::HTTPS === uri)
+        FinalDestination.cache_https_domain(uri.hostname)
+      end
+
+      return nil if !uri
+
+      extra = nil
+      if cookie
+        extra = { 'Cookie' => cookie }
+      end
+
+      get(uri, redirects - 1, extra_headers: extra, &blk)
+    elsif result == :ok
+      uri.to_s
+    else
+      nil
+    end
+  end
+
   def resolve
     if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
       @uri.scheme = "https"
@@ -108,7 +159,7 @@ class FinalDestination
     headers = request_headers
     response = Excon.public_send(@http_verb,
       @uri.to_s,
-      read_timeout: FinalDestination.connection_timeout,
+      read_timeout: timeout,
       headers: headers
     )
 
@@ -135,7 +186,7 @@ class FinalDestination
         headers['set-cookie'] = cookie_val.join
       end
 
-      # TODO this is confusing why grap location for anything not
+      # TODO this is confusing why grab location for anything not
       # between 300-400 ?
       if location_val = get_response.get_fields('location')
         headers['location'] = location_val.join
@@ -279,4 +330,69 @@ class FinalDestination
     nil
   end
 
+  protected
+
+  def safe_get(uri)
+
+    result = nil
+    unsafe_close = false
+
+    safe_session(uri) do |http|
+
+      headers = request_headers.merge(
+        'Accept-Encoding' => 'gzip',
+        'Host' => uri.host
+      )
+
+      req = Net::HTTP::Get.new(uri.request_uri, headers)
+
+      http.request(req) do |resp|
+        if Net::HTTPRedirection === resp
+          result = :redirect, [resp['location'], resp['Set-Cookie']]
+        end
+
+        if Net::HTTPSuccess === resp
+
+          resp.decode_content = true
+          resp.read_body { |chunk|
+            read_next = true
+
+            catch(:done) do
+              if read_next
+                read_next = false
+                yield resp, chunk, uri
+                read_next = true
+              end
+            end
+
+            # no clean way of finishing abruptly cause
+            # response likes reading till the end
+            if !read_next
+              unsafe_close = true
+              http.finish
+              raise StandardError
+            end
+          }
+          result = :ok
+        end
+      end
+    end
+
+    result
+  rescue StandardError
+    if unsafe_close
+      :ok
+    else
+      raise
+    end
+  end
+
+  def safe_session(uri)
+    Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
+      http.read_timeout = timeout
+      http.open_timeout = timeout
+      yield http
+    end
+  end
+
 end
diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
index 37b17716b0b..26df654e8ef 100644
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -1,10 +1,10 @@
 require_dependency 'final_destination'
 
 module RetrieveTitle
-  class ReadEnough < StandardError; end
+  CRAWL_TIMEOUT = 1
 
   def self.crawl(url)
-    extract_title(fetch_beginning(url))
+    fetch_title(url)
   rescue Exception
     # If there was a connection error, do nothing
   end
@@ -41,7 +41,7 @@ module RetrieveTitle
 
     # Amazon and YouTube leave the title until very late. Exceptions are bad
     # but these are large sites.
-    return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
+    return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
     return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
 
     # default is 10k
@@ -49,27 +49,24 @@ module RetrieveTitle
   end
 
   # Fetch the beginning of a HTML document at a url
-  def self.fetch_beginning(url)
-    fd = FinalDestination.new(url)
-    uri = fd.resolve
-    return "" unless uri
+  def self.fetch_title(url)
+    fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT)
 
-    result = ""
-    streamer = lambda do |chunk, _, _|
-      result << chunk
+    current = nil
+    title = nil
 
-      # Using exceptions for flow control is really bad, but there really seems to
-      # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-      # that matter!)
-      raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
+    fd.get do |_response, chunk, uri|
+
+      if current
+        current << chunk
+      else
+        current = chunk
+      end
+
+      max_size = max_chunk_size(uri) * 1024
+      title = extract_title(current)
+      throw :done if title || max_size < current.length
     end
-    Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-    result
-
-  rescue Excon::Errors::SocketError => ex
-    return result if ex.socket_error.is_a?(ReadEnough)
-    raise
-  rescue ReadEnough
-    result
+    return title || ""
   end
 end
diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb
index 93b6f061d26..38cd37e2bfa 100644
--- a/spec/components/final_destination_spec.rb
+++ b/spec/components/final_destination_spec.rb
@@ -207,6 +207,32 @@ describe FinalDestination do
     end
   end
 
+  describe '.get' do
+
+    it "can correctly stream with a redirect" do
+
+      FinalDestination.clear_https_cache!("wikipedia.com")
+
+      stub_request(:get, "http://wikipedia.com/").
+        to_return(status: 302, body: "", headers: { "location" => "https://wikipedia.com/" })
+
+      # webmock does not do chunks
+      stub_request(:get, "https://wikipedia.com/").
+        to_return(status: 200, body: "<html><head>", headers: {})
+
+      result = nil
+      chunk = nil
+
+      result = FinalDestination.new("http://wikipedia.com", opts).get do |resp, c|
+        chunk = c
+        throw :done
+      end
+
+      expect(result).to eq("https://wikipedia.com/")
+      expect(chunk).to eq("<html><head>")
+    end
+  end
+
   describe '.validate_uri' do
     context "host lookups" do
       it "works for various hosts" do
diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb
index 09edb28eae3..3a7af745fc1 100644
--- a/spec/components/retrieve_title_spec.rb
+++ b/spec/components/retrieve_title_spec.rb
@@ -54,7 +54,18 @@ describe RetrieveTitle do
       )
       expect(title).to eq("Video Title")
     end
+  end
 
+  context "crawl" do
+    it "can properly extract a title from a url" do
+      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
+        .to_return(status: 200, body: "<html><title>very amazing</title>")
+
+      # we still resolve the IP address for every host
+      IPSocket.stubs(:getaddress).returns('100.2.3.4')
+
+      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
+    end
   end
 end
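
Usage note (not part of the patch): FinalDestination#get is the streaming
primitive RetrieveTitle now builds on. Below is a minimal sketch of how a
caller is expected to consume it, assuming a hypothetical URL; the redirect
following, https caching, gzip decoding and per-request timeout all happen
inside get, and throw :done is the supported way to stop reading early (it
is caught around the yield in safe_get).

    require_dependency 'final_destination'

    # Crawl at most ~10KB of the page body without a prior HEAD request,
    # using the same aggressive 1 second timeout title crawling uses.
    fd = FinalDestination.new("https://example.com/article", timeout: 1)

    buffer = nil
    final_url = fd.get do |_response, chunk, _uri|
      # chunks arrive already decoded, since safe_get requests
      # 'Accept-Encoding' => 'gzip' and sets resp.decode_content = true
      buffer = buffer ? buffer << chunk : chunk
      throw :done if buffer.length > 10 * 1024
    end

    # final_url is the fully resolved URL string on success, nil on an
    # invalid URI or when the redirect limit is exhausted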