FIX: Attempt to onebox even if response body exceeds max_download_kb (#26929)

In 95a82d608d, we lowered the default for
`Onebox.options.max_download_kb` from 10 MB to 2 MB for security hardening
purposes. However, this resulted in multiple bug reports where seemingly
normal URLs stopped being oneboxed. It turns out that lowering
`Onebox.options.max_download_kb` resulted in `Onebox::Helpers::DownloadTooLarge` being raised
more often for more URLs in `Onebox::Helpers.fetch_response`, which
`Onebox::Helpers.fetch_html_doc` relies on. When
`Onebox::Helpers::DownloadTooLarge` is raised in
`Onebox::Helpers.fetch_response`, we throw away whatever part of the response
body we have already downloaded at that point. This is not ideal
because Nokogiri can parse incomplete HTML documents and there is a
really high chance that the incomplete HTML document still contains the
information which we need for oneboxing.

Therefore, this commit updates `Onebox::Helpers.fetch_html_doc` to not
throw away the response body when the size of the response body exceeds
`Onebox.options.max_download_kb`. Instead, we just take whatever part of the
response we have and get Nokogiri to parse it.
This commit is contained in:
Alan Guo Xiang Tan 2024-05-09 07:00:34 +08:00 committed by GitHub
parent 13334a3da0
commit c8da2a33e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 45 additions and 4 deletions

View File

@ -13,15 +13,20 @@ module Onebox
html.gsub(/<[^>]+>/, " ").gsub(/\n/, "")
end
# Fetches the HTML response body for a URL.
#
# Note that the size of the response body is capped at `Onebox.options.max_download_kb`. When the limit has been reached,
# this method will return the response body that has been downloaded up to the limit.
def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
response =
(
begin
fetch_response(url, headers: headers, body_cacher: body_cacher)
fetch_response(url, headers:, body_cacher:, raise_error_when_response_too_large: false)
rescue StandardError
nil
end
)
doc = Nokogiri.HTML(response)
uri = Addressable::URI.parse(url)
@ -45,7 +50,12 @@ module Onebox
response =
(
begin
fetch_response(uri.to_s, headers: headers, body_cacher: body_cacher)
fetch_response(
uri.to_s,
headers:,
body_cacher:,
raise_error_when_response_too_large: false,
)
rescue StandardError
nil
end
@ -63,7 +73,8 @@ module Onebox
redirect_limit: 5,
domain: nil,
headers: nil,
body_cacher: nil
body_cacher: nil,
raise_error_when_response_too_large: true
)
redirect_limit = Onebox.options.redirect_limit if redirect_limit >
Onebox.options.redirect_limit
@ -125,7 +136,11 @@ module Onebox
response.read_body do |chunk|
result.write(chunk)
raise DownloadTooLarge.new if result.size > size_bytes
if result.size > size_bytes
raise_error_when_response_too_large ? raise(DownloadTooLarge.new) : break
end
raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
end

View File

@ -16,6 +16,7 @@ RSpec.describe Onebox::Helpers do
around do |example|
previous_options = Onebox.options.to_h
Onebox.options = { max_download_kb: 1 }
stub_request(:get, "http://example.com/large-file").to_return(
status: 200,
body: onebox_response("slides"),
@ -32,6 +33,15 @@ RSpec.describe Onebox::Helpers do
)
end
it "returns the body of the response when size of response body exceeds the limit and `raise_error_when_response_too_large` has been set to `false`" do
expect(
described_class.fetch_response(
"http://example.com/large-file",
raise_error_when_response_too_large: false,
),
).to eq(onebox_response("slides"))
end
it "raises an exception when private url requested" do
FinalDestination::TestHelper.stub_to_fail do
expect { described_class.fetch_response("http://example.com/large-file") }.to raise_error(
@ -49,6 +59,22 @@ RSpec.describe Onebox::Helpers do
expect(described_class.fetch_html_doc(uri).to_s).to match("success")
end
it "does not raise an error when response body exceeds Onebox's `max_download_kb` limit" do
previous_options = Onebox.options.to_h
Onebox.options = previous_options.merge(max_download_kb: 1)
stub_request(:get, "http://example.com/large-file").to_return(
status: 200,
body: onebox_response("slides"),
)
expect(described_class.fetch_html_doc("http://example.com/large-file").to_s).to include(
"ECMAScript 2015 by David Leonard",
)
ensure
Onebox.options = previous_options
end
context "with canonical link" do
it "follows canonical link" do
uri = "https://www.example.com"