# frozen_string_literal: true

require 'rails_helper'

# Specs for RetrieveTitle. Three entry points are exercised:
#   * extract_title - pulls a title out of an HTML string
#   * crawl         - requests a URL (following redirects) and returns its title
#   * fetch_title   - requests a URL and returns its title
# All HTTP traffic is stubbed with stub_request, so no real requests are made.
describe RetrieveTitle do
  context "extract_title" do
    it "will extract the value from the title tag" do
      title = RetrieveTitle.extract_title(
        "<html><title>My Cool Title</title></html>"
      )

      expect(title).to eq("My Cool Title")
    end

    it "will strip whitespace" do
      title = RetrieveTitle.extract_title(
        "<html><title> Another Title\n\n </title></html>"
      )

      expect(title).to eq("Another Title")
    end

    it "will pick og:title if title is missing" do
      # NOTE(review): the <meta> tag below has no closing ">" in the fixture;
      # kept verbatim - confirm whether the malformed markup is intentional.
      title = RetrieveTitle.extract_title(<<~HTML)
        <html>
        <meta property="og:title" content="Good Title"
        </html>
      HTML

      expect(title).to eq("Good Title")
    end

    it "will prefer the title over the opengraph tag" do
      # NOTE(review): same unterminated <meta> tag as in the og:title example
      # above; kept verbatim.
      title = RetrieveTitle.extract_title(<<~HTML)
        <html>
        <title>Good Title</title>
        <meta property="og:title" content="Bad Title"
        </html>
      HTML

      expect(title).to eq("Good Title")
    end

    it "will parse a YouTube url from javascript" do
      # The <title> tag only says "YouTube"; the expected title is the one
      # assigned to document.title inside the <script> tag.
      title = RetrieveTitle.extract_title(<<~HTML)
        <html>
        <title>YouTube</title>
        <script>document.title = "Video Title";</script>
        </html>
      HTML

      expect(title).to eq("Video Title")
    end
  end

  context "crawl" do
    it "can properly extract a title from a url" do
      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
        .to_return(status: 200, body: "<html><title>very amazing</title>")

      # we still resolve the IP address for every host
      IPSocket.stubs(:getaddress).returns('100.2.3.4')

      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
    end

    it "detects and uses encoding from Content-Type header" do
      # Scenario 1: body is tagged as binary, charset is declared (quoted) as utf-8.
      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
        .to_return(
          status: 200,
          body: "<html><title>fancy apostrophes ’’’</title>".dup.force_encoding('ASCII-8BIT'),
          headers: { 'Content-Type' => 'text/html; charset="utf-8"' }
        )

      IPSocket.stubs(:getaddress).returns('100.2.3.4')
      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("fancy apostrophes ’’’")

      # Scenario 2: the same URL is re-stubbed with an EUC-JP encoded body and
      # a matching (unquoted) charset in the header.
      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
        .to_return(
          status: 200,
          body: "<html><title>japanese こんにちは website</title>".encode('EUC-JP').force_encoding('ASCII-8BIT'),
          headers: { 'Content-Type' => 'text/html;charset=euc-jp' }
        )

      IPSocket.stubs(:getaddress).returns('100.2.3.4')
      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("japanese こんにちは website")
    end

    it "can follow redirect" do
      stub_request(:get, "http://foobar.com/amazing")
        .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })

      stub_request(:get, "https://wikipedia.com/amazing")
        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

      IPSocket.stubs(:getaddress).returns('100.2.3.4')
      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
    end

    it "returns empty title if redirect uri is in blacklist" do
      # The redirect target's domain is listed in blocked_onebox_domains.
      SiteSetting.blocked_onebox_domains = "wikipedia.com"

      stub_request(:get, "http://foobar.com/amazing")
        .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })

      stub_request(:get, "https://wikipedia.com/amazing")
        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_nil
    end

    it "returns title if 'midway redirect' is blocked but final redirect uri is not blocked" do
      # Redirect chain: foobar.com -> wikipedia.com (blocked) -> cat.com (not blocked).
      SiteSetting.blocked_onebox_domains = "wikipedia.com"

      stub_request(:get, "http://foobar.com/amazing")
        .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })

      stub_request(:get, "https://wikipedia.com/amazing")
        .to_return(status: 301, body: "", headers: { "location" => "https://cat.com/meow" })

      stub_request(:get, "https://cat.com/meow")
        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
    end
  end

  context "fetch_title" do
    it "does not parse broken title tag" do
      # webmock does not do chunks
      # The body is cut off in the middle of the closing </title> tag.
      stub_request(:get, "https://en.wikipedia.org/wiki/Internet")
        .to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</ti", headers: {})

      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
      expect(title).to be_nil
    end

    it "can parse correct title tag" do
      # webmock does not do chunks
      stub_request(:get, "https://en.wikipedia.org/wiki/Internet")
        .to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</title>", headers: {})

      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
      expect(title).to eq("Internet - Wikipedia")
    end
  end
end