From d15867463f33d8c91e5ccd65ebcce4ce61e26627 Mon Sep 17 00:00:00 2001 From: Osama Sayegh Date: Mon, 23 May 2022 13:52:06 +0300 Subject: [PATCH] FEATURE: Site setting for blocking onebox of URLs that redirect (#16881) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meta topic: https://meta.discourse.org/t/prevent-to-linkify-when-there-is-a-redirect/226964/2?u=osama. This commit adds a new site setting `block_onebox_on_redirect` (default off) for blocking oneboxes (full and inline) of URLs that redirect. Note that an initial http → https redirect is still allowed if the redirect location is identical to the source (minus the scheme of course). For example, if a user includes a link to `http://example.com/page` and the link resolves to `https://example.com/page`, then the link will onebox (assuming it can be oneboxed) even if the setting is enabled. The reason for this is a user may type out a URL (i.e. the URL is short and memorizable) with http and since a lot of sites support TLS with http traffic automatically redirected to https, so we should still allow the URL to onebox. --- config/locales/server.en.yml | 1 + config/site_settings.yml | 2 + lib/final_destination.rb | 49 +++++++++++------ lib/inline_oneboxer.rb | 9 +++- lib/oneboxer.rb | 9 +++- lib/retrieve_title.rb | 18 +++++-- spec/lib/inline_oneboxer_spec.rb | 48 +++++++++++++++++ spec/lib/oneboxer_spec.rb | 91 ++++++++++++++++++++++++++++++++ 8 files changed, 206 insertions(+), 21 deletions(-) diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 01f3b93dc7e..067653bb9dc 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1539,6 +1539,7 @@ en: show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view." post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters." blocked_onebox_domains: "A list of domains that will never be oneboxed e.g. wikipedia.org\n(Wildcard symbols * ? not supported)" + block_onebox_on_redirect: "Block onebox for URLs that redirect." allowed_inline_onebox_domains: "A list of domains that will be oneboxed in miniature form if linked without a title" enable_inline_onebox_on_all_domains: "Ignore inline_onebox_domain_allowlist site setting and allow inline onebox on all domains." force_custom_user_agent_hosts: "Hosts for which to use the custom onebox user agent on all requests. (Especially useful for hosts that limit access by user agent)." diff --git a/config/site_settings.yml b/config/site_settings.yml index 4e254263add..483ec6d3b4f 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -1742,6 +1742,8 @@ onebox: facebook_app_access_token: default: "" secret: true + block_onebox_on_redirect: + default: false cache_onebox_response_body: default: false hidden: true diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 9ec7c6876d1..6426924232b 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -45,8 +45,10 @@ class FinalDestination @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT @opts[:max_redirects] ||= 5 @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } + @https_redirect_ignore_limit = @opts[:initial_https_redirect_ignore_limit] - @limit = @opts[:max_redirects] + @max_redirects = @opts[:max_redirects] + @limit = @max_redirects @ignored = [] if @limit > 0 @@ -94,7 +96,7 @@ class FinalDestination end def redirected? - @limit < @opts[:max_redirects] + @limit < @max_redirects end def request_headers @@ -145,26 +147,31 @@ class FinalDestination return if @stop_at_blocked_pages && blocked_domain?(@uri) result, headers_subset = safe_get(@uri, &blk) - return nil if !result + return if !result cookie = headers_subset.set_cookie location = headers_subset.location - if result == :redirect && (redirects == 0 || !location) - return nil - end - if result == :redirect - old_port = @uri.port + return if !location + + old_uri = @uri location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" @uri = uri(location) + if @uri && redirects == @max_redirects && @https_redirect_ignore_limit && same_uri_but_https?(old_uri, @uri) + redirects += 1 + @https_redirect_ignore_limit = false + end + + return if redirects == 0 + # https redirect, so just cache that whole new domain is https - if old_port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri) + if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri) FinalDestination.cache_https_domain(@uri.hostname) end - return nil if !@uri + return if !@uri extra = nil extra = { 'Cookie' => cookie } if cookie @@ -186,13 +193,13 @@ class FinalDestination if @limit < 0 @status = :too_many_redirects log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose - return nil + return end unless validate_uri @status = :invalid_address log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose - return nil + return end @ignored.each do |host| @@ -315,17 +322,21 @@ class FinalDestination return @uri end - old_port = @uri.port + old_uri = @uri location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present? location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" @uri = uri(location) + + if @uri && @limit == @max_redirects && @https_redirect_ignore_limit && same_uri_but_https?(old_uri, @uri) + @limit += 1 + @https_redirect_ignore_limit = false + end @limit -= 1 # https redirect, so just cache that whole new domain is https - if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri) + if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri) FinalDestination.cache_https_domain(@uri.hostname) end - return resolve end @@ -546,4 +557,12 @@ class FinalDestination def blocked_domain?(uri) Onebox::DomainChecker.is_blocked?(uri.hostname) end + + def same_uri_but_https?(before, after) + before = before.to_s + after = after.to_s + before.start_with?("http://") && + after.start_with?("https://") && + before.sub("http://", "") == after.sub("https://", "") + end end diff --git a/lib/inline_oneboxer.rb b/lib/inline_oneboxer.rb index 2ce9b34a329..30e381875d6 100644 --- a/lib/inline_oneboxer.rb +++ b/lib/inline_oneboxer.rb @@ -62,7 +62,14 @@ class InlineOneboxer uri.hostname.present? && (always_allow || allowed_domains.include?(uri.hostname)) && !Onebox::DomainChecker.is_blocked?(uri.hostname) - title = RetrieveTitle.crawl(url) + if SiteSetting.block_onebox_on_redirect + max_redirects = 0 + end + title = RetrieveTitle.crawl( + url, + max_redirects: max_redirects, + initial_https_redirect_ignore_limit: SiteSetting.block_onebox_on_redirect + ) title = nil if title && title.length < MIN_TITLE_LENGTH return onebox_for(url, title, opts) end diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 8efc2cf697d..d21d940c494 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -397,9 +397,16 @@ module Oneboxer available_strategies ||= Oneboxer.ordered_strategies(uri.hostname) strategy = available_strategies.shift + if SiteSetting.block_onebox_on_redirect + max_redirects = 0 + end fd = FinalDestination.new( url, - get_final_destination_options(url, strategy).merge(stop_at_blocked_pages: true) + get_final_destination_options(url, strategy).merge( + stop_at_blocked_pages: true, + max_redirects: max_redirects, + initial_https_redirect_ignore_limit: SiteSetting.block_onebox_on_redirect + ) ) uri = fd.resolve diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 7a9a37294ff..c71d69e2295 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -3,8 +3,12 @@ module RetrieveTitle CRAWL_TIMEOUT = 1 - def self.crawl(url) - fetch_title(url) + def self.crawl(url, max_redirects: nil, initial_https_redirect_ignore_limit: false) + fetch_title( + url, + max_redirects: max_redirects, + initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit + ) rescue Exception => ex raise if Rails.env.test? Rails.logger.error(ex) @@ -53,8 +57,14 @@ module RetrieveTitle end # Fetch the beginning of a HTML document at a url - def self.fetch_title(url) - fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT, stop_at_blocked_pages: true) + def self.fetch_title(url, max_redirects: nil, initial_https_redirect_ignore_limit: false) + fd = FinalDestination.new( + url, + timeout: CRAWL_TIMEOUT, + stop_at_blocked_pages: true, + max_redirects: max_redirects, + initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit + ) current = nil title = nil diff --git a/spec/lib/inline_oneboxer_spec.rb b/spec/lib/inline_oneboxer_spec.rb index b7341e88494..1e84076b72d 100644 --- a/spec/lib/inline_oneboxer_spec.rb +++ b/spec/lib/inline_oneboxer_spec.rb @@ -265,5 +265,53 @@ describe InlineOneboxer do expect(onebox[:title]).to be_blank end end + + context "when block_onebox_on_redirect is enabled" do + before do + SiteSetting.block_onebox_on_redirect = true + end + + after do + FinalDestination.clear_https_cache!("redirects.com") + end + + it "doesn't onebox if the URL redirects" do + stub_request(:get, "https://redirects.com/blah/gg") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects.com/blah/gg/redirect" } + ) + onebox = InlineOneboxer.lookup("https://redirects.com/blah/gg", skip_cache: true) + expect(onebox[:title]).to be_blank + end + + it "allows an initial http -> https redirect if the redirect URL is identical to the original" do + stub_request(:get, "http://redirects.com/blah/gg") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects.com/blah/gg" } + ) + stub_request(:get, "https://redirects.com/blah/gg") + .to_return( + status: 200, + body: "The Redirects Website" + ) + onebox = InlineOneboxer.lookup("http://redirects.com/blah/gg", skip_cache: true) + expect(onebox[:title]).to eq("The Redirects Website") + end + + it "doesn't allow an initial http -> https redirect if the redirect URL is different to the original" do + stub_request(:get, "http://redirects.com/blah/gg") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects.com/blah/gg/2" } + ) + onebox = InlineOneboxer.lookup("http://redirects.com/blah/gg", skip_cache: true) + expect(onebox[:title]).to be_blank + end + end end end diff --git a/spec/lib/oneboxer_spec.rb b/spec/lib/oneboxer_spec.rb index bf17e6edb53..56c90495ac7 100644 --- a/spec/lib/oneboxer_spec.rb +++ b/spec/lib/oneboxer_spec.rb @@ -243,6 +243,97 @@ describe Oneboxer do end end + context "when block_onebox_on_redirect setting is enabled" do + before do + Discourse.cache.clear + SiteSetting.block_onebox_on_redirect = true + end + + after do + FinalDestination.clear_https_cache!("redirects2.com") + FinalDestination.clear_https_cache!("redirects3.com") + FinalDestination.clear_https_cache!("redirects4.com") + end + + it "doesn't return onebox if the URL redirects" do + stub_request(:head, "https://redirects2.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects2.com/real-full-onebox" } + ) + stub_request(:get, "https://redirects2.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects2.com/real-full-onebox" } + ) + result = Oneboxer.external_onebox("https://redirects2.com/full-onebox") + expect(result[:onebox]).to be_blank + end + + it "allows an initial http -> https redirect if the redirect URL is identical to the original" do + stub_request(:get, "http://redirects3.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects3.com/full-onebox" } + ) + stub_request(:head, "http://redirects3.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects3.com/full-onebox" } + ) + + stub_request(:get, "https://redirects3.com/full-onebox") + .to_return( + status: 200, + body: html + ) + stub_request(:head, "https://redirects3.com/full-onebox") + .to_return( + status: 200, + body: "", + ) + result = Oneboxer.external_onebox("http://redirects3.com/full-onebox") + onebox = result[:onebox] + expect(onebox).to include("https://redirects3.com/full-onebox") + expect(onebox).to include("Cats") + expect(onebox).to include("Meow") + end + + it "doesn't allow an initial http -> https redirect if the redirect URL is different to the original" do + stub_request(:get, "http://redirects4.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects4.com/full-onebox/2" } + ) + stub_request(:head, "http://redirects4.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects4.com/full-onebox/2" } + ) + + stub_request(:get, "https://redirects4.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects4.com/full-onebox/2" } + ) + stub_request(:head, "https://redirects4.com/full-onebox") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://redirects4.com/full-onebox/2" } + ) + result = Oneboxer.external_onebox("http://redirects4.com/full-onebox") + expect(result[:onebox]).to be_blank + end + end + it "censors external oneboxes" do Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word")