FIX: Apply onebox blocked domain checks on every redirect (#16150)

The `blocked onebox domains` setting lets site owners change what sites
are allowed to be oneboxed. When a link is entered into a post,
Discourse checks the domain of the link against that setting and blocks
the onebox if the domain is blocked. But if there's a chain of
redirects, then only the final destination website is checked against
the site setting.

This commit amends that behavior so that every website in the redirect
chain is checked against the site setting, and if anything is blocked
the original link doesn't onebox at all in the post. The
`Discourse-No-Onebox` header is also checked in every response and the
onebox is blocked if the header is set to "1".

Additionally, Discourse will now include the `Discourse-No-Onebox`
header with every response if the site requires login to access content.
This is done to signal to a Discourse instance that it shouldn't attempt
to onebox other Discourse instances if they're login-only. Non-Discourse
websites can also use include that header if they don't wish to have
Discourse onebox their content.

Internal ticket: t59305.
This commit is contained in:
Osama Sayegh 2022-03-11 09:18:12 +03:00 committed by GitHub
parent 8e010aecfb
commit b0656f3ed0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 171 additions and 38 deletions

View File

@ -90,6 +90,9 @@ class ApplicationController < ActionController::Base
response.cache_control[:no_cache] = true response.cache_control[:no_cache] = true
response.cache_control[:extras] = ["no-store"] response.cache_control[:extras] = ["no-store"]
end end
if SiteSetting.login_required
response.headers['Discourse-No-Onebox'] = '1'
end
end end
def conditionally_allow_site_embedding def conditionally_allow_site_embedding

View File

@ -74,6 +74,7 @@ class FinalDestination
@preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) } @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
@validate_uri = @opts.fetch(:validate_uri) { true } @validate_uri = @opts.fetch(:validate_uri) { true }
@user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent
@stop_at_blocked_pages = @opts[:stop_at_blocked_pages]
end end
def self.connection_timeout def self.connection_timeout
@ -140,9 +141,12 @@ class FinalDestination
uri = URI(uri.to_s) uri = URI(uri.to_s)
end end
return nil unless validate_uri return if !validate_uri
return if @stop_at_blocked_pages && blocked_domain?(uri)
result, (location, cookie) = safe_get(uri, &blk) result, headers_subset = safe_get(uri, &blk)
cookie = headers_subset.set_cookie
location = headers_subset.location
if result == :redirect && (redirects == 0 || !location) if result == :redirect && (redirects == 0 || !location)
return nil return nil
@ -222,6 +226,13 @@ class FinalDestination
response_block: request_validator response_block: request_validator
) )
if @stop_at_blocked_pages
if blocked_domain?(@uri) || response.headers['Discourse-No-Onebox'] == "1"
@status = :blocked_page
return
end
end
location = nil location = nil
response_headers = nil response_headers = nil
response_status = response.status.to_i response_status = response.status.to_i
@ -253,6 +264,18 @@ class FinalDestination
when 103, 400, 405, 406, 409, 500, 501 when 103, 400, 405, 406, 409, 500, 501
response_status, small_headers = small_get(request_headers) response_status, small_headers = small_get(request_headers)
if @stop_at_blocked_pages
# this may seem weird, but the #to_hash method of the response object
# of ruby's net/http lib returns a hash where each value is an array.
# small_headers here is like that so our no onebox header value is an
# array if it's set. Also the hash keys are always lower-cased.
dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1"
if dont_onebox || blocked_domain?(@uri)
@status = :blocked_page
return
end
end
if response_status == 200 if response_status == 200
@status = :resolved @status = :resolved
return @uri return @uri
@ -425,6 +448,7 @@ class FinalDestination
def safe_get(uri) def safe_get(uri)
result = nil result = nil
unsafe_close = false unsafe_close = false
headers_subset = Struct.new(:location, :set_cookie).new
safe_session(uri) do |http| safe_session(uri) do |http|
headers = request_headers.merge( headers = request_headers.merge(
@ -435,8 +459,19 @@ class FinalDestination
req = Net::HTTP::Get.new(uri.request_uri, headers) req = Net::HTTP::Get.new(uri.request_uri, headers)
http.request(req) do |resp| http.request(req) do |resp|
headers_subset.set_cookie = resp['Set-Cookie']
if @stop_at_blocked_pages
dont_onebox = resp["Discourse-No-Onebox"] == "1"
if dont_onebox
result = :blocked, headers_subset
next
end
end
if Net::HTTPRedirection === resp if Net::HTTPRedirection === resp
result = :redirect, [resp['location'], resp['Set-Cookie']] headers_subset.location = resp['location']
result = :redirect, headers_subset
end end
if Net::HTTPSuccess === resp if Net::HTTPSuccess === resp
@ -460,7 +495,7 @@ class FinalDestination
raise StandardError raise StandardError
end end
end end
result = :ok result = :ok, headers_subset
else else
catch(:done) do catch(:done) do
yield resp, nil, nil yield resp, nil, nil
@ -471,7 +506,7 @@ class FinalDestination
result result
rescue StandardError rescue StandardError
unsafe_close ? :ok : raise unsafe_close ? [:ok, headers_subset] : raise
end end
def safe_session(uri) def safe_session(uri)
@ -505,4 +540,8 @@ class FinalDestination
uri(complete_url) uri(complete_url)
end end
def blocked_domain?(uri)
Onebox::DomainChecker.is_blocked?(uri.hostname)
end
end end

View File

@ -397,9 +397,14 @@ module Oneboxer
available_strategies ||= Oneboxer.ordered_strategies(uri.hostname) available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
strategy = available_strategies.shift strategy = available_strategies.shift
fd = FinalDestination.new(url, get_final_destination_options(url, strategy)) fd = FinalDestination.new(
url,
get_final_destination_options(url, strategy).merge(stop_at_blocked_pages: true)
)
uri = fd.resolve uri = fd.resolve
return blank_onebox if fd.status == :blocked_page
if fd.status != :resolved if fd.status != :resolved
args = { link: url } args = { link: url }
if fd.status == :invalid_address if fd.status == :invalid_address
@ -416,7 +421,7 @@ module Oneboxer
return error_box return error_box
end end
return blank_onebox if uri.blank? || Onebox::DomainChecker.is_blocked?(uri.hostname) return blank_onebox if uri.blank?
onebox_options = { onebox_options = {
max_width: 695, max_width: 695,

View File

@ -5,8 +5,9 @@ module RetrieveTitle
def self.crawl(url) def self.crawl(url)
fetch_title(url) fetch_title(url)
rescue Exception rescue Exception => ex
# If there was a connection error, do nothing raise if Rails.env.test?
Rails.logger.error(ex)
end end
def self.extract_title(html, encoding = nil) def self.extract_title(html, encoding = nil)
@ -52,17 +53,13 @@ module RetrieveTitle
# Fetch the beginning of a HTML document at a url # Fetch the beginning of a HTML document at a url
def self.fetch_title(url) def self.fetch_title(url)
fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT) fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT, stop_at_blocked_pages: true)
current = nil current = nil
title = nil title = nil
encoding = nil encoding = nil
fd.get do |_response, chunk, uri| fd.get do |_response, chunk, uri|
if (uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname))
throw :done
end
unless Net::HTTPRedirection === _response unless Net::HTTPRedirection === _response
if current if current
current << chunk current << chunk

View File

@ -29,34 +29,19 @@ describe InlineOneboxer do
it "puts an entry in the cache" do it "puts an entry in the cache" do
SiteSetting.enable_inline_onebox_on_all_domains = true SiteSetting.enable_inline_onebox_on_all_domains = true
url = "https://example.com/random-url" url = "https://example.com/good-url"
stub_request(:get, url).to_return(status: 200, body: "<html><head><title>a blog</title></head></html>") stub_request(:get, url).to_return(status: 200, body: "<html><head><title>a blog</title></head></html>")
InlineOneboxer.invalidate(url) InlineOneboxer.invalidate(url)
expect(InlineOneboxer.cache_lookup(url)).to be_blank expect(InlineOneboxer.cache_lookup(url)).to be_blank
result = InlineOneboxer.lookup(url) result = InlineOneboxer.lookup(url)
expect(result).to be_present expect(result[:title]).to be_present
cached = InlineOneboxer.cache_lookup(url) cached = InlineOneboxer.cache_lookup(url)
expect(cached[:url]).to eq(url) expect(cached[:url]).to eq(url)
expect(cached[:title]).to eq('a blog') expect(cached[:title]).to eq('a blog')
end end
it "puts an entry in the cache for failed onebox" do
SiteSetting.enable_inline_onebox_on_all_domains = true
url = "https://example.com/random-url"
InlineOneboxer.invalidate(url)
expect(InlineOneboxer.cache_lookup(url)).to be_blank
result = InlineOneboxer.lookup(url)
expect(result).to be_present
cached = InlineOneboxer.cache_lookup(url)
expect(cached[:url]).to eq(url)
expect(cached[:title]).to be_nil
end
end end
context ".lookup" do context ".lookup" do
@ -199,6 +184,11 @@ describe InlineOneboxer do
describe "lookups for blocked domains in the hostname" do describe "lookups for blocked domains in the hostname" do
shared_examples "blocks the domain" do |setting, domain_to_test| shared_examples "blocks the domain" do |setting, domain_to_test|
it "does not retrieve title" do it "does not retrieve title" do
stub_request(:get, domain_to_test)
.to_return(
status: 200,
body: "<html><head><title>hello world</title></head></html>",
)
SiteSetting.blocked_onebox_domains = setting SiteSetting.blocked_onebox_domains = setting
onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true) onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true)
@ -209,11 +199,16 @@ describe InlineOneboxer do
shared_examples "does not fulfil blocked domain" do |setting, domain_to_test| shared_examples "does not fulfil blocked domain" do |setting, domain_to_test|
it "retrieves title" do it "retrieves title" do
stub_request(:get, domain_to_test)
.to_return(
status: 200,
body: "<html><head><title>hello world</title></head></html>",
)
SiteSetting.blocked_onebox_domains = setting SiteSetting.blocked_onebox_domains = setting
onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true) onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true)
expect(onebox).to be_present expect(onebox[:title]).to be_present
end end
end end
@ -228,6 +223,47 @@ describe InlineOneboxer do
include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.2kitten.cloud" include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.2kitten.cloud"
include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.kitten.cloud9" include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.kitten.cloud9"
include_examples "does not fulfil blocked domain", "api.cat.org", "https://api-cat.org" include_examples "does not fulfil blocked domain", "api.cat.org", "https://api-cat.org"
it "doesn't retrieve title if a blocked domain is encountered anywhere in the redirect chain" do
SiteSetting.blocked_onebox_domains = "redirect.com"
stub_request(:get, "https://mainwebsite.com/blah")
.to_return(status: 301, body: "", headers: { "location" => "https://redirect.com/blah" })
stub_request(:get, "https://redirect.com/blah")
.to_return(status: 301, body: "", headers: { "location" => "https://finalwebsite.com/blah" })
stub_request(:get, "https://finalwebsite.com/blah")
.to_return(status: 200, body: "<html><head><title>hello world</title></head></html>")
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah", skip_cache: true)
expect(onebox[:title]).to be_blank
end
it "doesn't retrieve title if the Discourse-No-Onebox header == 1" do
stub_request(:get, "https://mainwebsite.com/blah")
.to_return(
status: 200,
body: "<html><head><title>hello world</title></head></html>",
headers: { "Discourse-No-Onebox" => "1" }
)
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah", skip_cache: true)
expect(onebox[:title]).to be_blank
stub_request(:get, "https://mainwebsite.com/blah/2")
.to_return(
status: 301,
body: "",
headers: { "location" => "https://mainwebsite.com/blah/2/redirect" }
)
stub_request(:get, "https://mainwebsite.com/blah/2/redirect")
.to_return(
status: 301,
body: "",
headers: { "Discourse-No-Onebox" => "1", "location" => "https://somethingdoesnotmatter.com" }
)
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah/2", skip_cache: true)
expect(onebox[:title]).to be_blank
onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah/2/redirect", skip_cache: true)
expect(onebox[:title]).to be_blank
end
end end
end end
end end

View File

@ -182,11 +182,16 @@ describe Oneboxer do
stub_request(:get, "https://kitten.com").to_return(status: 200, body: html, headers: {}) stub_request(:get, "https://kitten.com").to_return(status: 200, body: html, headers: {})
stub_request(:head, "https://kitten.com").to_return(status: 200, body: "", headers: {}) stub_request(:head, "https://kitten.com").to_return(status: 200, body: "", headers: {})
expect(Oneboxer.external_onebox("http://cat.com/meow")[:onebox]).to be_empty result = Oneboxer.external_onebox("http://cat.com/meow")
expect(Oneboxer.external_onebox("https://kitten.com")[:onebox]).to be_empty expect(result[:onebox]).to be_empty
expect(result[:preview]).to be_empty
result = Oneboxer.external_onebox("http://kitten.com")
expect(result[:onebox]).to be_empty
expect(result[:preview]).to be_empty
end end
it "returns onebox if 'midway redirect' is blocked but final redirect uri is not blocked" do it "does not return onebox if anything in the redirect chain is blocked" do
SiteSetting.blocked_onebox_domains = "middle.com" SiteSetting.blocked_onebox_domains = "middle.com"
stub_request(:get, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" }) stub_request(:get, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" })
@ -197,7 +202,44 @@ describe Oneboxer do
stub_request(:get, "https://cat.com/end").to_return(status: 200, body: html) stub_request(:get, "https://cat.com/end").to_return(status: 200, body: html)
stub_request(:head, "https://cat.com/end").to_return(status: 200, body: "", headers: {}) stub_request(:head, "https://cat.com/end").to_return(status: 200, body: "", headers: {})
expect(Oneboxer.external_onebox("https://cat.com/start")[:onebox]).to be_present result = Oneboxer.external_onebox("https://cat.com/start")
expect(result[:onebox]).to be_empty
expect(result[:preview]).to be_empty
end
it "does not return onebox if the Discourse-No-Onebox header == 1" do
stub_request(:get, "https://website.com/discourse-no-onebox")
.to_return(status: 200, body: "abc", headers: { "Discourse-No-Onebox" => "1" })
stub_request(:head, "https://website.com/discourse-no-onebox")
.to_return(status: 200, body: "", headers: { "Discourse-No-Onebox" => "1" })
result = Oneboxer.external_onebox("https://website.com/discourse-no-onebox")
expect(result[:onebox]).to be_empty
expect(result[:preview]).to be_empty
end
it "does not return onebox if the Discourse-No-Onebox header == 1 anywhere in the redirect chain" do
stub_request(:get, "https://website.com/redirect-no-onebox")
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://willneverreach.com" })
stub_request(:head, "https://website.com/redirect-no-onebox")
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://willneverreach.com" })
result = Oneboxer.external_onebox("https://website.com/redirect-no-onebox")
expect(result[:onebox]).to be_empty
expect(result[:preview]).to be_empty
stub_request(:get, "https://website.com/redirect")
.to_return(status: 301, body: "", headers: { "location" => "https://website.com/redirect/dont-onebox" })
stub_request(:head, "https://website.com/redirect")
.to_return(status: 301, body: "", headers: { "location" => "https://website.com/redirect/dont-onebox" })
stub_request(:get, "https://website.com/redirect/dont-onebox")
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://wontreachme.com" })
stub_request(:head, "https://website.com/redirect/dont-onebox")
.to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://wontreachme.com" })
result = Oneboxer.external_onebox("https://website.com/redirect")
expect(result[:onebox]).to be_empty
expect(result[:preview]).to be_empty
end end
end end

View File

@ -111,7 +111,7 @@ describe RetrieveTitle do
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil) expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil)
end end
it "returns title if 'midway redirect' is blocked but final redirect uri is not blocked" do it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do
SiteSetting.blocked_onebox_domains = "wikipedia.com" SiteSetting.blocked_onebox_domains = "wikipedia.com"
stub_request(:get, "http://foobar.com/amazing") stub_request(:get, "http://foobar.com/amazing")
@ -123,7 +123,18 @@ describe RetrieveTitle do
stub_request(:get, "https://cat.com/meow") stub_request(:get, "https://cat.com/meow")
.to_return(status: 200, body: "<html><title>very amazing</title>", headers: {}) .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing") expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank
end
it "doesn't return title if the Discourse-No-Onebox header == 1" do
stub_request(:get, "https://cat.com/meow/no-onebox")
.to_return(
status: 200,
body: "<html><title>discourse stay away</title>",
headers: { "Discourse-No-Onebox" => "1" }
)
expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank
end end
end end