From fa5880e04f7e04be8e6780e8ab32e754a1118264 Mon Sep 17 00:00:00 2001
From: Sam
Date: Mon, 29 Jan 2018 15:36:52 +1100
Subject: [PATCH] PERF: ability to crawl for titles without extra HEAD req

Also introduces a much more aggressive timeout for title crawling,
and adds gzip handling for the crawled body.
---
 lib/final_destination.rb                  | 124 +++++++++++++++++++++-
 lib/retrieve_title.rb                     |  41 ++++---
 spec/components/final_destination_spec.rb |  26 +++++
 spec/components/retrieve_title_spec.rb    |  11 ++
 4 files changed, 176 insertions(+), 26 deletions(-)

diff --git a/lib/final_destination.rb b/lib/final_destination.rb
index acc922e61fd..c6c989de9a9 100644
--- a/lib/final_destination.rb
+++ b/lib/final_destination.rb
@@ -51,12 +51,17 @@ class FinalDestination
     @cookie = nil
     @limited_ips = []
     @verbose = @opts[:verbose] || false
+    @timeout = @opts[:timeout] || nil
   end
 
   def self.connection_timeout
     20
   end
 
+  def timeout
+    @timeout || FinalDestination.connection_timeout
+  end
+
   def redirected?
     @limit < @opts[:max_redirects]
   end
@@ -75,12 +80,58 @@ class FinalDestination
 
   def small_get(headers)
     Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS)) do |http|
-      http.open_timeout = FinalDestination.connection_timeout
-      http.read_timeout = FinalDestination.connection_timeout
+      http.open_timeout = timeout
+      http.read_timeout = timeout
       http.request_get(@uri.request_uri, headers)
     end
   end
 
+  # this is a new interface for simply getting
+  # N bytes accounting for all internal logic
+  def get(uri = @uri, redirects = @limit, extra_headers: {}, &blk)
+    raise "Must specify block" unless block_given?
+
+    if uri && uri.port == 80 && FinalDestination.is_https_domain?(uri.hostname)
+      uri.scheme = "https"
+      uri = URI(uri.to_s)
+    end
+
+    unless validate_uri
+      return nil
+    end
+
+    result, (location, cookie) = safe_get(uri, &blk)
+
+    if result == :redirect && (redirects == 0 || !location)
+      return nil
+    end
+
+    if result == :redirect
+      old_port = uri.port
+
+      location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
+      uri = URI(location) rescue nil
+
+      # https redirect, so just cache that whole new domain is https
+      if old_port == 80 && uri&.port == 443 && (URI::HTTPS === uri)
+        FinalDestination.cache_https_domain(uri.hostname)
+      end
+
+      return nil if !uri
+
+      extra = nil
+      if cookie
+        extra = { 'Cookie' => cookie }
+      end
+
+      get(uri, redirects - 1, extra_headers: extra, &blk)
+    elsif result == :ok
+      uri.to_s
+    else
+      nil
+    end
+  end
+
   def resolve
     if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
       @uri.scheme = "https"
@@ -108,7 +159,7 @@ class FinalDestination
     headers = request_headers
     response = Excon.public_send(@http_verb,
       @uri.to_s,
-      read_timeout: FinalDestination.connection_timeout,
+      read_timeout: timeout,
       headers: headers
     )
 
@@ -135,7 +186,7 @@ class FinalDestination
         headers['set-cookie'] = cookie_val.join
       end
 
-      # TODO this is confusing why grap location for anything not
+      # TODO this is confusing why grab location for anything not
       # between 300-400 ?
       if location_val = get_response.get_fields('location')
         headers['location'] = location_val.join
@@ -279,4 +330,69 @@ class FinalDestination
     nil
   end
 
+  protected
+
+  def safe_get(uri)
+
+    result = nil
+    unsafe_close = false
+
+    safe_session(uri) do |http|
+
+      headers = request_headers.merge(
+        'Accept-Encoding' => 'gzip',
+        'Host' => uri.host
+      )
+
+      req = Net::HTTP::Get.new(uri.request_uri, headers)
+
+      http.request(req) do |resp|
+        if Net::HTTPRedirection === resp
+          result = :redirect, [resp['location'], resp['Set-Cookie']]
+        end
+
+        if Net::HTTPSuccess === resp
+
+          resp.decode_content = true
+          resp.read_body { |chunk|
+            read_next = true
+
+            catch(:done) do
+              if read_next
+                read_next = false
+                yield resp, chunk, uri
+                read_next = true
+              end
+            end
+
+            # no clean way of finishing abruptly cause
+            # response likes reading till the end
+            if !read_next
+              unsafe_close = true
+              http.finish
+              raise StandardError
+            end
+          }
+          result = :ok
+        end
+      end
+    end
+
+    result
+  rescue StandardError
+    if unsafe_close
+      :ok
+    else
+      raise
+    end
+  end
+
+  def safe_session(uri)
+    Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
+      http.read_timeout = timeout
+      http.open_timeout = timeout
+      yield http
+    end
+  end
+
 end
diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb
index 37b17716b0b..26df654e8ef 100644
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@@ -1,10 +1,10 @@
 require_dependency 'final_destination'
 
 module RetrieveTitle
-  class ReadEnough < StandardError; end
+  CRAWL_TIMEOUT = 1
 
   def self.crawl(url)
-    extract_title(fetch_beginning(url))
+    fetch_title(url)
   rescue Exception
     # If there was a connection error, do nothing
   end
@@ -41,7 +41,7 @@ module RetrieveTitle
 
     # Amazon and YouTube leave the title until very late. Exceptions are bad
     # but these are large sites.
-    return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
+    return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
     return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
 
     # default is 10k
@@ -49,27 +49,24 @@ module RetrieveTitle
   end
 
   # Fetch the beginning of a HTML document at a url
-  def self.fetch_beginning(url)
-    fd = FinalDestination.new(url)
-    uri = fd.resolve
-    return "" unless uri
+  def self.fetch_title(url)
+    fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT)
 
-    result = ""
-    streamer = lambda do |chunk, _, _|
-      result << chunk
+    current = nil
+    title = nil
 
-      # Using exceptions for flow control is really bad, but there really seems to
-      # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-      # that matter!)
-      raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
+    fd.get do |_response, chunk, uri|
+
+      if current
+        current << chunk
+      else
+        current = chunk
+      end
+
+      max_size = max_chunk_size(uri) * 1024
+      title = extract_title(current)
+      throw :done if title || max_size < current.length
     end
-    Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-    result
-
-  rescue Excon::Errors::SocketError => ex
-    return result if ex.socket_error.is_a?(ReadEnough)
-    raise
-  rescue ReadEnough
-    result
+    return title || ""
   end
 end
diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb
index 93b6f061d26..38cd37e2bfa 100644
--- a/spec/components/final_destination_spec.rb
+++ b/spec/components/final_destination_spec.rb
@@ -207,6 +207,32 @@ describe FinalDestination do
     end
   end
 
+  describe '.get' do
+
+    it "can correctly stream with a redirect" do
+
+      FinalDestination.clear_https_cache!("wikipedia.com")
+
+      stub_request(:get, "http://wikipedia.com/").
+        to_return(status: 302, body: "", headers: { "location" => "https://wikipedia.com/" })
+
+      # webmock does not do chunks
+      stub_request(:get, "https://wikipedia.com/").
+        to_return(status: 200, body: "<html><head>", headers: {})
+
+      result = nil
+      chunk = nil
+
+      result = FinalDestination.new("http://wikipedia.com", opts).get do |resp, c|
+        chunk = c
+        throw :done
+      end
+
+      expect(result).to eq("https://wikipedia.com/")
+      expect(chunk).to eq("<html><head>")
+    end
+  end
+
   describe '.validate_uri' do
     context "host lookups" do
       it "works for various hosts" do
diff --git a/spec/components/retrieve_title_spec.rb b/spec/components/retrieve_title_spec.rb
index 09edb28eae3..3a7af745fc1 100644
--- a/spec/components/retrieve_title_spec.rb
+++ b/spec/components/retrieve_title_spec.rb
@@ -54,7 +54,18 @@ describe RetrieveTitle do
       )
       expect(title).to eq("Video Title")
     end
+  end
 
+  context "crawl" do
+    it "can properly extract a title from a url" do
+      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
+        .to_return(status: 200, body: "<html><title>very amazing</title>")
+
+      # we still resolve the IP address for every host
+      IPSocket.stubs(:getaddress).returns('100.2.3.4')
+
+      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
+    end
   end
 end
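
Usage note (not part of the patch): FinalDestination#get is the streaming
primitive RetrieveTitle now builds on. Below is a minimal sketch of how a
caller is expected to consume it, assuming a hypothetical URL; the redirect
following, https caching, gzip decoding and per-request timeout all happen
inside get, and throw :done is the supported way to stop reading early (it
is caught around the yield in safe_get).

    require_dependency 'final_destination'

    # Crawl at most ~10KB of the page body without a prior HEAD request,
    # using the same aggressive 1 second timeout title crawling uses.
    fd = FinalDestination.new("https://example.com/article", timeout: 1)

    buffer = nil
    final_url = fd.get do |_response, chunk, _uri|
      # chunks arrive already decoded, since safe_get requests
      # 'Accept-Encoding' => 'gzip' and sets resp.decode_content = true
      buffer = buffer ? buffer << chunk : chunk
      throw :done if buffer.length > 10 * 1024
    end

    # final_url is the fully resolved URL string on success, nil on an
    # invalid URI or when the redirect limit is exhausted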