mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 10:57:04 +08:00
FIX: Inline Onebox should use encoding from Content-Type header when present (#11625)
* FIX: Inline onebox should use encoding from Content-Type header when present
* Use Regexp.last_match(1)
Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com>
This commit is contained in:
parent
6a9ec7c65d
commit
558e9dd310
|
@@ -9,9 +9,9 @@ module RetrieveTitle
|
|||
# If there was a connection error, do nothing
|
||||
end
|
||||
|
||||
def self.extract_title(html)
|
||||
def self.extract_title(html, encoding = nil)
|
||||
title = nil
|
||||
if doc = Nokogiri::HTML5(html)
|
||||
if doc = Nokogiri::HTML5(html, nil, encoding)
|
||||
|
||||
title = doc.at('title')&.inner_text
|
||||
|
||||
|
@@ -54,6 +54,7 @@ module RetrieveTitle
|
|||
|
||||
current = nil
|
||||
title = nil
|
||||
encoding = nil
|
||||
|
||||
fd.get do |_response, chunk, uri|
|
||||
|
||||
|
@@ -62,9 +63,17 @@ module RetrieveTitle
|
|||
else
|
||||
current = chunk
|
||||
end
|
||||
if !encoding && content_type = _response['content-type']&.strip&.downcase
|
||||
if content_type =~ /charset="?([a-z0-9_-]+)"?/
|
||||
encoding = Regexp.last_match(1)
|
||||
if !Encoding.list.map(&:name).map(&:downcase).include?(encoding)
|
||||
encoding = nil
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
max_size = max_chunk_size(uri) * 1024
|
||||
title = extract_title(current)
|
||||
title = extract_title(current, encoding)
|
||||
throw :done if title || max_size < current.length
|
||||
end
|
||||
title
|
||||
|
|
|
@@ -67,6 +67,27 @@ describe RetrieveTitle do
|
|||
|
||||
expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
|
||||
end
|
||||
end
|
||||
|
||||
it "detects and uses encoding from Content-Type header" do
|
||||
stub_request(:get, "https://brelksdjflaskfj.com/amazing")
|
||||
.to_return(
|
||||
status: 200,
|
||||
body: "<html><title>fancy apostrophes ’’’</title>".dup.force_encoding('ASCII-8BIT'),
|
||||
headers: { 'Content-Type' => 'text/html; charset="utf-8"' }
|
||||
)
|
||||
|
||||
IPSocket.stubs(:getaddress).returns('100.2.3.4')
|
||||
expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("fancy apostrophes ’’’")
|
||||
|
||||
stub_request(:get, "https://brelksdjflaskfj.com/amazing")
|
||||
.to_return(
|
||||
status: 200,
|
||||
body: "<html><title>japanese こんにちは website</title>".encode('EUC-JP').force_encoding('ASCII-8BIT'),
|
||||
headers: { 'Content-Type' => 'text/html;charset=euc-jp' }
|
||||
)
|
||||
|
||||
IPSocket.stubs(:getaddress).returns('100.2.3.4')
|
||||
expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("japanese こんにちは website")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue
Block a user