mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 09:42:07 +08:00
FEATURE: Site setting for blocking onebox of URLs that redirect (#16881)
Meta topic: https://meta.discourse.org/t/prevent-to-linkify-when-there-is-a-redirect/226964/2?u=osama. This commit adds a new site setting `block_onebox_on_redirect` (default off) for blocking oneboxes (full and inline) of URLs that redirect. Note that an initial http → https redirect is still allowed if the redirect location is identical to the source (minus the scheme of course). For example, if a user includes a link to `http://example.com/page` and the link resolves to `https://example.com/page`, then the link will onebox (assuming it can be oneboxed) even if the setting is enabled. The reason for this is a user may type out a URL (i.e. the URL is short and memorizable) with http and since a lot of sites support TLS with http traffic automatically redirected to https, so we should still allow the URL to onebox.
This commit is contained in:
parent
a03ae9b323
commit
d15867463f
|
@ -1539,6 +1539,7 @@ en:
|
||||||
show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
|
show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
|
||||||
post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
|
post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
|
||||||
blocked_onebox_domains: "A list of domains that will never be oneboxed e.g. wikipedia.org\n(Wildcard symbols * ? not supported)"
|
blocked_onebox_domains: "A list of domains that will never be oneboxed e.g. wikipedia.org\n(Wildcard symbols * ? not supported)"
|
||||||
|
block_onebox_on_redirect: "Block onebox for URLs that redirect."
|
||||||
allowed_inline_onebox_domains: "A list of domains that will be oneboxed in miniature form if linked without a title"
|
allowed_inline_onebox_domains: "A list of domains that will be oneboxed in miniature form if linked without a title"
|
||||||
enable_inline_onebox_on_all_domains: "Ignore inline_onebox_domain_allowlist site setting and allow inline onebox on all domains."
|
enable_inline_onebox_on_all_domains: "Ignore inline_onebox_domain_allowlist site setting and allow inline onebox on all domains."
|
||||||
force_custom_user_agent_hosts: "Hosts for which to use the custom onebox user agent on all requests. (Especially useful for hosts that limit access by user agent)."
|
force_custom_user_agent_hosts: "Hosts for which to use the custom onebox user agent on all requests. (Especially useful for hosts that limit access by user agent)."
|
||||||
|
|
|
@ -1742,6 +1742,8 @@ onebox:
|
||||||
facebook_app_access_token:
|
facebook_app_access_token:
|
||||||
default: ""
|
default: ""
|
||||||
secret: true
|
secret: true
|
||||||
|
block_onebox_on_redirect:
|
||||||
|
default: false
|
||||||
cache_onebox_response_body:
|
cache_onebox_response_body:
|
||||||
default: false
|
default: false
|
||||||
hidden: true
|
hidden: true
|
||||||
|
|
|
@ -45,8 +45,10 @@ class FinalDestination
|
||||||
@default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
|
@default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
|
||||||
@opts[:max_redirects] ||= 5
|
@opts[:max_redirects] ||= 5
|
||||||
@opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
|
@opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
|
||||||
|
@https_redirect_ignore_limit = @opts[:initial_https_redirect_ignore_limit]
|
||||||
|
|
||||||
@limit = @opts[:max_redirects]
|
@max_redirects = @opts[:max_redirects]
|
||||||
|
@limit = @max_redirects
|
||||||
|
|
||||||
@ignored = []
|
@ignored = []
|
||||||
if @limit > 0
|
if @limit > 0
|
||||||
|
@ -94,7 +96,7 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
|
|
||||||
def redirected?
|
def redirected?
|
||||||
@limit < @opts[:max_redirects]
|
@limit < @max_redirects
|
||||||
end
|
end
|
||||||
|
|
||||||
def request_headers
|
def request_headers
|
||||||
|
@ -145,26 +147,31 @@ class FinalDestination
|
||||||
return if @stop_at_blocked_pages && blocked_domain?(@uri)
|
return if @stop_at_blocked_pages && blocked_domain?(@uri)
|
||||||
|
|
||||||
result, headers_subset = safe_get(@uri, &blk)
|
result, headers_subset = safe_get(@uri, &blk)
|
||||||
return nil if !result
|
return if !result
|
||||||
|
|
||||||
cookie = headers_subset.set_cookie
|
cookie = headers_subset.set_cookie
|
||||||
location = headers_subset.location
|
location = headers_subset.location
|
||||||
|
|
||||||
if result == :redirect && (redirects == 0 || !location)
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
if result == :redirect
|
if result == :redirect
|
||||||
old_port = @uri.port
|
return if !location
|
||||||
|
|
||||||
|
old_uri = @uri
|
||||||
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
||||||
@uri = uri(location)
|
@uri = uri(location)
|
||||||
|
|
||||||
|
if @uri && redirects == @max_redirects && @https_redirect_ignore_limit && same_uri_but_https?(old_uri, @uri)
|
||||||
|
redirects += 1
|
||||||
|
@https_redirect_ignore_limit = false
|
||||||
|
end
|
||||||
|
|
||||||
|
return if redirects == 0
|
||||||
|
|
||||||
# https redirect, so just cache that whole new domain is https
|
# https redirect, so just cache that whole new domain is https
|
||||||
if old_port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
|
if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
|
||||||
FinalDestination.cache_https_domain(@uri.hostname)
|
FinalDestination.cache_https_domain(@uri.hostname)
|
||||||
end
|
end
|
||||||
|
|
||||||
return nil if !@uri
|
return if !@uri
|
||||||
|
|
||||||
extra = nil
|
extra = nil
|
||||||
extra = { 'Cookie' => cookie } if cookie
|
extra = { 'Cookie' => cookie } if cookie
|
||||||
|
@ -186,13 +193,13 @@ class FinalDestination
|
||||||
if @limit < 0
|
if @limit < 0
|
||||||
@status = :too_many_redirects
|
@status = :too_many_redirects
|
||||||
log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose
|
log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose
|
||||||
return nil
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
unless validate_uri
|
unless validate_uri
|
||||||
@status = :invalid_address
|
@status = :invalid_address
|
||||||
log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose
|
log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose
|
||||||
return nil
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
@ignored.each do |host|
|
@ignored.each do |host|
|
||||||
|
@ -315,17 +322,21 @@ class FinalDestination
|
||||||
return @uri
|
return @uri
|
||||||
end
|
end
|
||||||
|
|
||||||
old_port = @uri.port
|
old_uri = @uri
|
||||||
location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
|
location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
|
||||||
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
||||||
@uri = uri(location)
|
@uri = uri(location)
|
||||||
|
|
||||||
|
if @uri && @limit == @max_redirects && @https_redirect_ignore_limit && same_uri_but_https?(old_uri, @uri)
|
||||||
|
@limit += 1
|
||||||
|
@https_redirect_ignore_limit = false
|
||||||
|
end
|
||||||
@limit -= 1
|
@limit -= 1
|
||||||
|
|
||||||
# https redirect, so just cache that whole new domain is https
|
# https redirect, so just cache that whole new domain is https
|
||||||
if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri)
|
if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
|
||||||
FinalDestination.cache_https_domain(@uri.hostname)
|
FinalDestination.cache_https_domain(@uri.hostname)
|
||||||
end
|
end
|
||||||
|
|
||||||
return resolve
|
return resolve
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -546,4 +557,12 @@ class FinalDestination
|
||||||
def blocked_domain?(uri)
|
def blocked_domain?(uri)
|
||||||
Onebox::DomainChecker.is_blocked?(uri.hostname)
|
Onebox::DomainChecker.is_blocked?(uri.hostname)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def same_uri_but_https?(before, after)
|
||||||
|
before = before.to_s
|
||||||
|
after = after.to_s
|
||||||
|
before.start_with?("http://") &&
|
||||||
|
after.start_with?("https://") &&
|
||||||
|
before.sub("http://", "") == after.sub("https://", "")
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -62,7 +62,14 @@ class InlineOneboxer
|
||||||
uri.hostname.present? &&
|
uri.hostname.present? &&
|
||||||
(always_allow || allowed_domains.include?(uri.hostname)) &&
|
(always_allow || allowed_domains.include?(uri.hostname)) &&
|
||||||
!Onebox::DomainChecker.is_blocked?(uri.hostname)
|
!Onebox::DomainChecker.is_blocked?(uri.hostname)
|
||||||
title = RetrieveTitle.crawl(url)
|
if SiteSetting.block_onebox_on_redirect
|
||||||
|
max_redirects = 0
|
||||||
|
end
|
||||||
|
title = RetrieveTitle.crawl(
|
||||||
|
url,
|
||||||
|
max_redirects: max_redirects,
|
||||||
|
initial_https_redirect_ignore_limit: SiteSetting.block_onebox_on_redirect
|
||||||
|
)
|
||||||
title = nil if title && title.length < MIN_TITLE_LENGTH
|
title = nil if title && title.length < MIN_TITLE_LENGTH
|
||||||
return onebox_for(url, title, opts)
|
return onebox_for(url, title, opts)
|
||||||
end
|
end
|
||||||
|
|
|
@ -397,9 +397,16 @@ module Oneboxer
|
||||||
available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
|
available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
|
||||||
strategy = available_strategies.shift
|
strategy = available_strategies.shift
|
||||||
|
|
||||||
|
if SiteSetting.block_onebox_on_redirect
|
||||||
|
max_redirects = 0
|
||||||
|
end
|
||||||
fd = FinalDestination.new(
|
fd = FinalDestination.new(
|
||||||
url,
|
url,
|
||||||
get_final_destination_options(url, strategy).merge(stop_at_blocked_pages: true)
|
get_final_destination_options(url, strategy).merge(
|
||||||
|
stop_at_blocked_pages: true,
|
||||||
|
max_redirects: max_redirects,
|
||||||
|
initial_https_redirect_ignore_limit: SiteSetting.block_onebox_on_redirect
|
||||||
|
)
|
||||||
)
|
)
|
||||||
uri = fd.resolve
|
uri = fd.resolve
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,12 @@
|
||||||
module RetrieveTitle
|
module RetrieveTitle
|
||||||
CRAWL_TIMEOUT = 1
|
CRAWL_TIMEOUT = 1
|
||||||
|
|
||||||
def self.crawl(url)
|
def self.crawl(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
|
||||||
fetch_title(url)
|
fetch_title(
|
||||||
|
url,
|
||||||
|
max_redirects: max_redirects,
|
||||||
|
initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit
|
||||||
|
)
|
||||||
rescue Exception => ex
|
rescue Exception => ex
|
||||||
raise if Rails.env.test?
|
raise if Rails.env.test?
|
||||||
Rails.logger.error(ex)
|
Rails.logger.error(ex)
|
||||||
|
@ -53,8 +57,14 @@ module RetrieveTitle
|
||||||
end
|
end
|
||||||
|
|
||||||
# Fetch the beginning of a HTML document at a url
|
# Fetch the beginning of a HTML document at a url
|
||||||
def self.fetch_title(url)
|
def self.fetch_title(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
|
||||||
fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT, stop_at_blocked_pages: true)
|
fd = FinalDestination.new(
|
||||||
|
url,
|
||||||
|
timeout: CRAWL_TIMEOUT,
|
||||||
|
stop_at_blocked_pages: true,
|
||||||
|
max_redirects: max_redirects,
|
||||||
|
initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit
|
||||||
|
)
|
||||||
|
|
||||||
current = nil
|
current = nil
|
||||||
title = nil
|
title = nil
|
||||||
|
|
|
@ -265,5 +265,53 @@ describe InlineOneboxer do
|
||||||
expect(onebox[:title]).to be_blank
|
expect(onebox[:title]).to be_blank
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "when block_onebox_on_redirect is enabled" do
|
||||||
|
before do
|
||||||
|
SiteSetting.block_onebox_on_redirect = true
|
||||||
|
end
|
||||||
|
|
||||||
|
after do
|
||||||
|
FinalDestination.clear_https_cache!("redirects.com")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "doesn't onebox if the URL redirects" do
|
||||||
|
stub_request(:get, "https://redirects.com/blah/gg")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects.com/blah/gg/redirect" }
|
||||||
|
)
|
||||||
|
onebox = InlineOneboxer.lookup("https://redirects.com/blah/gg", skip_cache: true)
|
||||||
|
expect(onebox[:title]).to be_blank
|
||||||
|
end
|
||||||
|
|
||||||
|
it "allows an initial http -> https redirect if the redirect URL is identical to the original" do
|
||||||
|
stub_request(:get, "http://redirects.com/blah/gg")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects.com/blah/gg" }
|
||||||
|
)
|
||||||
|
stub_request(:get, "https://redirects.com/blah/gg")
|
||||||
|
.to_return(
|
||||||
|
status: 200,
|
||||||
|
body: "<html><head><title>The Redirects Website</title></head></html>"
|
||||||
|
)
|
||||||
|
onebox = InlineOneboxer.lookup("http://redirects.com/blah/gg", skip_cache: true)
|
||||||
|
expect(onebox[:title]).to eq("The Redirects Website")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "doesn't allow an initial http -> https redirect if the redirect URL is different to the original" do
|
||||||
|
stub_request(:get, "http://redirects.com/blah/gg")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects.com/blah/gg/2" }
|
||||||
|
)
|
||||||
|
onebox = InlineOneboxer.lookup("http://redirects.com/blah/gg", skip_cache: true)
|
||||||
|
expect(onebox[:title]).to be_blank
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -243,6 +243,97 @@ describe Oneboxer do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "when block_onebox_on_redirect setting is enabled" do
|
||||||
|
before do
|
||||||
|
Discourse.cache.clear
|
||||||
|
SiteSetting.block_onebox_on_redirect = true
|
||||||
|
end
|
||||||
|
|
||||||
|
after do
|
||||||
|
FinalDestination.clear_https_cache!("redirects2.com")
|
||||||
|
FinalDestination.clear_https_cache!("redirects3.com")
|
||||||
|
FinalDestination.clear_https_cache!("redirects4.com")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "doesn't return onebox if the URL redirects" do
|
||||||
|
stub_request(:head, "https://redirects2.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects2.com/real-full-onebox" }
|
||||||
|
)
|
||||||
|
stub_request(:get, "https://redirects2.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects2.com/real-full-onebox" }
|
||||||
|
)
|
||||||
|
result = Oneboxer.external_onebox("https://redirects2.com/full-onebox")
|
||||||
|
expect(result[:onebox]).to be_blank
|
||||||
|
end
|
||||||
|
|
||||||
|
it "allows an initial http -> https redirect if the redirect URL is identical to the original" do
|
||||||
|
stub_request(:get, "http://redirects3.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects3.com/full-onebox" }
|
||||||
|
)
|
||||||
|
stub_request(:head, "http://redirects3.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects3.com/full-onebox" }
|
||||||
|
)
|
||||||
|
|
||||||
|
stub_request(:get, "https://redirects3.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 200,
|
||||||
|
body: html
|
||||||
|
)
|
||||||
|
stub_request(:head, "https://redirects3.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 200,
|
||||||
|
body: "",
|
||||||
|
)
|
||||||
|
result = Oneboxer.external_onebox("http://redirects3.com/full-onebox")
|
||||||
|
onebox = result[:onebox]
|
||||||
|
expect(onebox).to include("https://redirects3.com/full-onebox")
|
||||||
|
expect(onebox).to include("Cats")
|
||||||
|
expect(onebox).to include("Meow")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "doesn't allow an initial http -> https redirect if the redirect URL is different to the original" do
|
||||||
|
stub_request(:get, "http://redirects4.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects4.com/full-onebox/2" }
|
||||||
|
)
|
||||||
|
stub_request(:head, "http://redirects4.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects4.com/full-onebox/2" }
|
||||||
|
)
|
||||||
|
|
||||||
|
stub_request(:get, "https://redirects4.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects4.com/full-onebox/2" }
|
||||||
|
)
|
||||||
|
stub_request(:head, "https://redirects4.com/full-onebox")
|
||||||
|
.to_return(
|
||||||
|
status: 301,
|
||||||
|
body: "",
|
||||||
|
headers: { "location" => "https://redirects4.com/full-onebox/2" }
|
||||||
|
)
|
||||||
|
result = Oneboxer.external_onebox("http://redirects4.com/full-onebox")
|
||||||
|
expect(result[:onebox]).to be_blank
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
it "censors external oneboxes" do
|
it "censors external oneboxes" do
|
||||||
Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word")
|
Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user