From 8e9cbe9db4d7782ff7936ab6ff15aac74202d5d8 Mon Sep 17 00:00:00 2001 From: Dan Ungureanu Date: Tue, 22 Mar 2022 20:13:27 +0200 Subject: [PATCH] FIX: Do not raise if title cannot be crawled (#16247) If the crawled page returned an error, `FinalDestination#safe_get` yielded `nil` for the `uri` and `chunk` block arguments, which the block passed by `RetrieveTitle` did not handle. In addition, `FinalDestination#get` did not handle the case where `safe_get` failed and therefore returned no headers from which to read `location` and `set_cookie`. --- lib/final_destination.rb | 2 ++ lib/retrieve_title.rb | 2 ++ spec/lib/retrieve_title_spec.rb | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 9bc5d4040ee..9ec7c6876d1 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -145,6 +145,8 @@ class FinalDestination return if @stop_at_blocked_pages && blocked_domain?(@uri) result, headers_subset = safe_get(@uri, &blk) + return nil if !result + cookie = headers_subset.set_cookie location = headers_subset.location diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index c91ea639a65..7a9a37294ff 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -62,6 +62,8 @@ module RetrieveTitle fd.get do |_response, chunk, uri| unless Net::HTTPRedirection === _response + throw :done if uri.blank? + if current current << chunk else diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb index 1b554123083..91852ca6c90 100644 --- a/spec/lib/retrieve_title_spec.rb +++ b/spec/lib/retrieve_title_spec.rb @@ -136,6 +136,12 @@ describe RetrieveTitle do expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank end + + it "doesn't return a title if response is unsuccessful" do + stub_request(:get, "https://example.com").to_return(status: 404, body: "") + + expect(RetrieveTitle.crawl("https://example.com")).to eq(nil) + end end context 'fetch_title' do