FIX: Inline Onebox should use encoding from Content-Type header when present (#11625)

* FIX: Inline onebox should use encoding from Content-Type header when present

* Use Regexp.last_match(1)

Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com>
This commit is contained in:
Osama Sayegh 2021-01-04 22:32:08 +03:00 committed by GitHub
parent 6a9ec7c65d
commit 558e9dd310
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 4 deletions

View File

@ -9,9 +9,9 @@ module RetrieveTitle
# If there was a connection error, do nothing
end
def self.extract_title(html)
def self.extract_title(html, encoding = nil)
title = nil
if doc = Nokogiri::HTML5(html)
if doc = Nokogiri::HTML5(html, nil, encoding)
title = doc.at('title')&.inner_text
@ -54,6 +54,7 @@ module RetrieveTitle
current = nil
title = nil
encoding = nil
fd.get do |_response, chunk, uri|
@ -62,9 +63,17 @@ module RetrieveTitle
else
current = chunk
end
if !encoding && content_type = _response['content-type']&.strip&.downcase
if content_type =~ /charset="?([a-z0-9_-]+)"?/
encoding = Regexp.last_match(1)
if !Encoding.list.map(&:name).map(&:downcase).include?(encoding)
encoding = nil
end
end
end
max_size = max_chunk_size(uri) * 1024
title = extract_title(current)
title = extract_title(current, encoding)
throw :done if title || max_size < current.length
end
title

View File

@ -67,6 +67,27 @@ describe RetrieveTitle do
expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
end
end
# Checks that RetrieveTitle.crawl returns the expected title when the response
# carries a charset in its Content-Type header. Two header spellings are
# exercised: a quoted charset after "; " and an unquoted charset directly
# after ";".
it "detects and uses encoding from Content-Type header" do
# Case 1: body is tagged ASCII-8BIT (binary) and the header declares
# charset="utf-8" with surrounding double quotes.
# NOTE(review): the title literal below ends in a trailing space and contains
# no apostrophe characters even though the text says "fancy apostrophes";
# non-ASCII characters may have been lost in this rendering — confirm
# against the repository before relying on this literal.
stub_request(:get, "https://brelksdjflaskfj.com/amazing")
.to_return(
status: 200,
body: "<html><title>fancy apostrophes </title>".dup.force_encoding('ASCII-8BIT'),
headers: { 'Content-Type' => 'text/html; charset="utf-8"' }
)
# Stub address resolution to a fixed IP; presumably this keeps the example
# from performing a real DNS lookup — verify against the crawl implementation.
IPSocket.stubs(:getaddress).returns('100.2.3.4')
expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("fancy apostrophes ")
# Case 2: body is transcoded to EUC-JP and then tagged ASCII-8BIT; the header
# declares charset=euc-jp unquoted and with no space after the semicolon.
# The expected title is the original (pre-transcoding) Japanese text.
stub_request(:get, "https://brelksdjflaskfj.com/amazing")
.to_return(
status: 200,
body: "<html><title>japanese こんにちは website</title>".encode('EUC-JP').force_encoding('ASCII-8BIT'),
headers: { 'Content-Type' => 'text/html;charset=euc-jp' }
)
IPSocket.stubs(:getaddress).returns('100.2.3.4')
expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("japanese こんにちは website")
end
end
end