discourse/spec/lib/retrieve_title_spec.rb

# frozen_string_literal: true

RSpec.describe RetrieveTitle do
  describe ".extract_title" do
    # Baseline case: the text inside <title> comes back unchanged.
    it "will extract the value from the title tag" do
      html = "<html><title>My Cool Title</title></html>"

      expect(RetrieveTitle.extract_title(html)).to eq("My Cool Title")
    end

    # Leading spaces and trailing newlines around the title text are dropped.
    it "will strip whitespace" do
      padded_html = "<html><title>   Another Title\n\n </title></html>"
      extracted = RetrieveTitle.extract_title(padded_html)

      expect(extracted).to eq("Another Title")
    end

    # With no <title> element in the document, the og:title meta content is
    # expected as the result.
    # NOTE(review): the <meta> tag in the fixture has no closing ">" — confirm
    # whether that is intentional before "fixing" the markup.
    it "will pick og:title if title is missing" do
      html = <<~HTML
        <html>
          <meta property="og:title" content="Good Title"
        </html>
      HTML

      expect(RetrieveTitle.extract_title(html)).to eq("Good Title")
    end

    # When both a <title> and an og:title are present, the <title> text wins.
    it "will prefer the title over the opengraph tag" do
      html = <<~HTML
        <html>
          <title>Good Title</title>
          <meta property="og:title" content="Bad Title"
        </html>
      HTML

      expect(RetrieveTitle.extract_title(html)).to eq("Good Title")
    end

    # The <title> here is just "YouTube"; the string assigned to
    # document.title inside the inline script is the expected result.
    it "will parse a YouTube url from javascript" do
      html = <<~HTML
        <html>
          <title>YouTube</title>
          <script>document.title = "Video Title";</script>
        </html>
      HTML

      expect(RetrieveTitle.extract_title(html)).to eq("Video Title")
    end

    # A <body> tag carrying 1000 attributes must make extract_title return nil
    # instead of letting an exception escape.
    it "will not exception out for invalid html" do
      attribute_soup = 1.upto(1000).map { |index| " attr#{index}='1' " }.join
      html = <<~HTML
        <html>
          <title>test</title>
          <body #{attribute_soup}>
        </html>
      HTML

      expect(RetrieveTitle.extract_title(html)).to eq(nil)
    end
  end

  describe ".crawl" do
    # Happy path: a stubbed 200 response with a <title> yields that title.
    it "can properly extract a title from a url" do
      url = "https://brelksdjflaskfj.com/amazing"
      stub_request(:get, url).to_return(status: 200, body: "<html><title>very amazing</title>")

      # the IP address is still resolved for every host, so pin the lookup
      IPSocket.stubs(:getaddress).returns('100.2.3.4')

      expect(RetrieveTitle.crawl(url)).to eq("very amazing")
    end

    # Exercises two Content-Type charset spellings: a quoted `charset="utf-8"`
    # and an unquoted `charset=euc-jp`. Each body is served as ASCII-8BIT bytes
    # and the crawled title is expected to match the readable text.
    it "detects and uses encoding from Content-Type header" do
      url = "https://brelksdjflaskfj.com/amazing"
      scenarios = [
        {
          body: "<html><title>fancy apostrophes ’’’</title>".dup.force_encoding('ASCII-8BIT'),
          content_type: 'text/html; charset="utf-8"',
          expected: "fancy apostrophes ’’’"
        },
        {
          body: "<html><title>japanese こんにちは website</title>".encode('EUC-JP').force_encoding('ASCII-8BIT'),
          content_type: 'text/html;charset=euc-jp',
          expected: "japanese こんにちは website"
        }
      ]

      # Each scenario re-stubs the same URL, then crawls it.
      scenarios.each do |scenario|
        stub_request(:get, url)
          .to_return(
            status: 200,
            body: scenario[:body],
            headers: { 'Content-Type' => scenario[:content_type] }
          )

        IPSocket.stubs(:getaddress).returns('100.2.3.4')
        expect(RetrieveTitle.crawl(url)).to eq(scenario[:expected])
      end
    end

    # A 301 from foobar.com points at wikipedia.com; the title served by the
    # redirect target is what crawl should return.
    it "can follow redirect" do
      origin = "http://foobar.com/amazing"
      destination = "https://wikipedia.com/amazing"

      stub_request(:get, origin)
        .to_return(status: 301, body: "", headers: { "location" => destination })
      stub_request(:get, destination)
        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

      IPSocket.stubs(:getaddress).returns('100.2.3.4')

      expect(RetrieveTitle.crawl(origin)).to eq("very amazing")
    end

    # foobar.com redirects (301) to wikipedia.com, which is listed in the
    # blocked_onebox_domains site setting, so crawl is expected to return nil.
    it "returns empty title if redirect uri is in blacklist" do
      SiteSetting.blocked_onebox_domains = "wikipedia.com"

      origin = "http://foobar.com/amazing"
      blocked_target = "https://wikipedia.com/amazing"

      stub_request(:get, origin)
        .to_return(status: 301, body: "", headers: { "location" => blocked_target })
      stub_request(:get, blocked_target)
        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

      expect(RetrieveTitle.crawl(origin)).to eq(nil)
    end

    # Redirect chain: foobar.com -> wikipedia.com (blocked) -> cat.com.
    # Although the final host is not blocked, the result must be blank.
    it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do
      SiteSetting.blocked_onebox_domains = "wikipedia.com"

      redirects = {
        "http://foobar.com/amazing" => "https://wikipedia.com/amazing",
        "https://wikipedia.com/amazing" => "https://cat.com/meow"
      }
      redirects.each do |from, to|
        stub_request(:get, from)
          .to_return(status: 301, body: "", headers: { "location" => to })
      end

      stub_request(:get, "https://cat.com/meow")
        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})

      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank
    end

    # A 200 response that carries `Discourse-No-Onebox: 1` must produce a
    # blank result even though the body contains a <title>.
    it "doesn't return title if the Discourse-No-Onebox header == 1" do
      url = "https://cat.com/meow/no-onebox"
      opt_out_response = {
        status: 200,
        body: "<html><title>discourse stay away</title>",
        headers: { "Discourse-No-Onebox" => "1" }
      }
      stub_request(:get, url).to_return(opt_out_response)

      expect(RetrieveTitle.crawl(url)).to be_blank
    end

    # A stubbed 404 with an empty body must result in nil.
    it "doesn't return a title if response is unsuccessful" do
      url = "https://example.com"
      stub_request(:get, url).to_return(status: 404, body: "")

      result = RetrieveTitle.crawl(url)

      expect(result).to eq(nil)
    end

    # An unexpected error raised while performing the request (NoMethodError
    # here) must propagate out of crawl instead of being swallowed.
    # The description no longer repeats "it", so the example reads
    # "it raises errors ..." rather than "it it raises errors ...".
    it "raises errors other than Net::ReadTimeout, e.g. NoMethodError" do
      stub_request(:get, "https://example.com").to_raise(NoMethodError)

      expect { RetrieveTitle.crawl("https://example.com") }.to raise_error(NoMethodError)
    end

    # A Net::ReadTimeout raised by the request is tolerated: crawl returns nil
    # rather than raising.
    # The description no longer repeats "it", so the example reads
    # "it ignores ..." rather than "it it ignores ...".
    it "ignores Net::ReadTimeout errors" do
      stub_request(:get, "https://example.com").to_raise(Net::ReadTimeout)

      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end

    # A FinalDestination::SSRFDetector::LookupFailedError raised from
    # fetch_title must not escape crawl; nil is expected instead.
    #
    # The stub is installed on the RetrieveTitle constant itself, the same
    # receiver the expectation calls, rather than on the implicit `subject`.
    # For a class, the implicit subject is a new instance, so a stub placed on
    # it would never intercept a call made on the constant.
    # NOTE(review): presumably crawl delegates to fetch_title — confirm against
    # the implementation.
    it "ignores SSRF lookup errors" do
      RetrieveTitle.stubs(:fetch_title).raises(FinalDestination::SSRFDetector::LookupFailedError)

      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
    end
  end

  describe '.fetch_title' do
    # The body is cut off inside the closing tag ("</ti"), so no title should
    # be reported.
    it "does not parse broken title tag" do
      url = "https://en.wikipedia.org/wiki/Internet"

      # webmock does not do chunks
      stub_request(:get, url)
        .to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</ti", headers: {})

      expect(RetrieveTitle.fetch_title(url)).to eq(nil)
    end

    # Same document as the broken-tag case but with a complete </title>, so
    # the title text is expected back.
    it "can parse correct title tag" do
      url = "https://en.wikipedia.org/wiki/Internet"

      # webmock does not do chunks
      stub_request(:get, url)
        .to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</title>", headers: {})

      expect(RetrieveTitle.fetch_title(url)).to eq("Internet - Wikipedia")
    end
  end
end
-												DEV: use #frozen_string_literal: true on all spec

This change both speeds up specs (fewer strings to allocate) and helps catch
cases where methods in Discourse are mutating inputs.

Overall, we will be migrating everything to use #frozen_string_literal: true.
It will take a while, but this is the first and safest move in this direction.

											
										
										
											2019-04-30 08:27:42 +08:00
+								# frozen_string_literal: true
-												Add RSpec 4 compatibility (#17652)

* Remove outdated option

https://github.com/rspec/rspec-core/commit/04078317ba6577699d06cf4dccf014254dcde7a6

* Use the non-globally exposed RSpec syntax

https://github.com/rspec/rspec-core/pull/2803

* Use the non-globally exposed RSpec syntax, cont

https://github.com/rspec/rspec-core/pull/2803

* Comply to strict predicate matchers

See:
 - https://github.com/rspec/rspec-expectations/pull/1195
 - https://github.com/rspec/rspec-expectations/pull/1196
 - https://github.com/rspec/rspec-expectations/pull/1277

											
										
										
											2022-07-28 10:27:38 +08:00
+								RSpec.describe RetrieveTitle do
-												DEV: Use proper wording for contexts in specs

											
										
										
											2022-07-28 00:14:14 +08:00
+								  describe ".extract_title" do
-												FEATURE: Whitelists for inline oneboxing

											
										
										
											2017-07-22 03:29:04 +08:00
+								    it "will extract the value from the title tag" do
 								      title = RetrieveTitle.extract_title(
 								        "<html><title>My Cool Title</title></html>"
 								      )
 								      expect(title).to eq("My Cool Title")
 								    end
 								    it "will strip whitespace" do
 								      title = RetrieveTitle.extract_title(
 								        "<html><title>   Another Title\n\n </title></html>"
 								      )
 								      expect(title).to eq("Another Title")
 								    end
-												FEATURE: option to enable inline oneboxes for all domains

Also, change to prefer title over open graph which is often way too sparse

											
										
										
											2017-08-03 02:27:21 +08:00
+								    it "will pick og:title if title is missing" do
 								      title = RetrieveTitle.extract_title(<<~HTML
 								        <html>
 								          <meta property="og:title" content="Good Title"
 								        </html>
 								        HTML
 								      )
 								      expect(title).to eq("Good Title")
 								    end
-												FEATURE: add title tag for group detail page (#13702)


											
										
										
											2021-07-12 22:35:57 +08:00
+								    it "will prefer the title over the opengraph tag" do
-												FEATURE: Whitelists for inline oneboxing

											
										
										
											2017-07-22 03:29:04 +08:00
+								      title = RetrieveTitle.extract_title(<<~HTML
 								        <html>
-												FEATURE: option to enable inline oneboxes for all domains

Also, change to prefer title over open graph which is often way too sparse

											
										
										
											2017-08-03 02:27:21 +08:00
+								          <title>Good Title</title>
 								          <meta property="og:title" content="Bad Title"
-												FEATURE: Whitelists for inline oneboxing

											
										
										
											2017-07-22 03:29:04 +08:00
+								        </html>
 								        HTML
 								      )
 								      expect(title).to eq("Good Title")
 								    end
-												FIX: Hack our title retriever so that it parses YouTube URLs

											
										
										
											2017-09-28 21:29:50 +08:00
+								    it "will parse a YouTube url from javascript" do
 								      title = RetrieveTitle.extract_title(<<~HTML
 								        <html>
 								          <title>YouTube</title>
 								          <script>document.title = "Video Title";</script>
 								        </html>
 								        HTML
 								      )
 								      expect(title).to eq("Video Title")
 								    end
-												FIX: ignore malformed HTML for title extraction (#18040)

Certain HTML can be rejected by nokogumbo, specifically cases where there
are enormous amounts of attributes

This ensures that malformed HTML is simply skipped instead of leaking out
an exception and terminating downstream processes.
											
										
										
											2022-08-23 13:03:57 +08:00
 								    it "will not exception out for invalid html" do
 								      attributes = (1..1000).map { |x| " attr#{x}='1' " }.join
 								      title = RetrieveTitle.extract_title <<~HTML
 								        <html>
 								          <title>test</title>
 								          <body #{attributes}>
 								        </html>
 								      HTML
 								      expect(title).to eq(nil)
 								    end
-												PERF: ability to crawl for titles without extra HEAD req

Also, introduces a much more aggressive timeout for title crawling
and introduces gzip to body that is crawled

											
										
										
											2018-01-29 12:36:52 +08:00
+								  end
-												DEV: Use proper wording for contexts in specs

											
										
										
											2022-07-28 00:14:14 +08:00
+								  describe ".crawl" do
-												PERF: ability to crawl for titles without extra HEAD req

Also, introduces a much more aggressive timeout for title crawling
and introduces gzip to body that is crawled

											
										
										
											2018-01-29 12:36:52 +08:00
+								    it "can properly extract a title from a url" do
 								      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
 								        .to_return(status: 200, body: "<html><title>very amazing</title>")
-												FIX: Hack our title retriever so that it parses YouTube URLs

											
										
										
											2017-09-28 21:29:50 +08:00
-												PERF: ability to crawl for titles without extra HEAD req

Also, introduces a much more aggressive timeout for title crawling
and introduces gzip to body that is crawled

											
										
										
											2018-01-29 12:36:52 +08:00
+								      # we still resolve the IP address for every host
 								      IPSocket.stubs(:getaddress).returns('100.2.3.4')
 								      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
 								    end
-												FEATURE: Whitelists for inline oneboxing

											
										
										
											2017-07-22 03:29:04 +08:00
-												FIX: Inline Onebox should use encoding from Content-Type header when present (#11625)

* FIX: Inline onebox should use encoding from Content-Type header when present

* Use Regexp.last_match(1)

Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com>
											
										
										
											2021-01-05 03:32:08 +08:00
+								    it "detects and uses encoding from Content-Type header" do
 								      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
 								        .to_return(
 								          status: 200,
 								          body: "<html><title>fancy apostrophes ’’’</title>".dup.force_encoding('ASCII-8BIT'),
 								          headers: { 'Content-Type' => 'text/html; charset="utf-8"' }
 								        )
 								      IPSocket.stubs(:getaddress).returns('100.2.3.4')
 								      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("fancy apostrophes ’’’")
 								      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
 								        .to_return(
 								          status: 200,
 								          body: "<html><title>japanese こんにちは website</title>".encode('EUC-JP').force_encoding('ASCII-8BIT'),
 								          headers: { 'Content-Type' => 'text/html;charset=euc-jp' }
 								        )
 								      IPSocket.stubs(:getaddress).returns('100.2.3.4')
 								      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("japanese こんにちは website")
 								    end
-												FIX: follow redirects for inline/mini onebox (#13512)


											
										
										
											2021-06-24 22:23:39 +08:00
 								    it "can follow redirect" do
 								      stub_request(:get, "http://foobar.com/amazing").
 								        to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })
 								      stub_request(:get, "https://wikipedia.com/amazing").
 								        to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
-												FIX: Respect blocked domains list when redirecting (#15656)

Our previous implementation used a simple `blocked_domain_array.include?(hostname)`
so some values were not matching. Additionally, in some configurations like ours, we'd used
"cat.*.dog.com" with the assumption we'd support globbing.

This change implicitly allows globbing by blocking "http://a.b.com" if "b.com" is a blocked 
domain but does not actively do anything for "*".

An upcoming change might include frontend validation for values that can be inserted.
											
										
										
											2022-01-20 14:12:34 +08:00
 								      IPSocket.stubs(:getaddress).returns('100.2.3.4')
 								      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
 								    end
 								    it "returns empty title if redirect uri is in blacklist" do
 								      SiteSetting.blocked_onebox_domains = "wikipedia.com"
 								      stub_request(:get, "http://foobar.com/amazing")
 								        .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })
 								      stub_request(:get, "https://wikipedia.com/amazing")
 								        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
 								      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil)
 								    end
-												FIX: Apply onebox blocked domain checks on every redirect (#16150)

The `blocked onebox domains` setting lets site owners change what sites
are allowed to be oneboxed. When a link is entered into a post,
Discourse checks the domain of the link against that setting and blocks
the onebox if the domain is blocked. But if there's a chain of
redirects, then only the final destination website is checked against
the site setting.

This commit amends that behavior so that every website in the redirect
chain is checked against the site setting, and if anything is blocked
the original link doesn't onebox at all in the post. The
`Discourse-No-Onebox` header is also checked in every response and the
onebox is blocked if the header is set to "1".

Additionally, Discourse will now include the `Discourse-No-Onebox`
header with every response if the site requires login to access content.
This is done to signal to a Discourse instance that it shouldn't attempt
to onebox other Discourse instances if they're login-only. Non-Discourse
websites can also include that header if they don't wish to have
Discourse onebox their content.

Internal ticket: t59305.
											
										
										
											2022-03-11 14:18:12 +08:00
+								    it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do
-												FIX: Respect blocked domains list when redirecting (#15656)

Our previous implementation used a simple `blocked_domain_array.include?(hostname)`
so some values were not matching. Additionally, in some configurations like ours, we'd used
"cat.*.dog.com" with the assumption we'd support globbing.

This change implicitly allows globbing by blocking "http://a.b.com" if "b.com" is a blocked 
domain but does not actively do anything for "*".

An upcoming change might include frontend validation for values that can be inserted.
											
										
										
											2022-01-20 14:12:34 +08:00
+								      SiteSetting.blocked_onebox_domains = "wikipedia.com"
 								      stub_request(:get, "http://foobar.com/amazing")
 								        .to_return(status: 301, body: "", headers: { "location" => "https://wikipedia.com/amazing" })
 								      stub_request(:get, "https://wikipedia.com/amazing")
 								        .to_return(status: 301, body: "", headers: { "location" => "https://cat.com/meow" })
 								      stub_request(:get, "https://cat.com/meow")
 								        .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
-												FIX: follow redirects for inline/mini onebox (#13512)


											
										
										
											2021-06-24 22:23:39 +08:00
-												FIX: Apply onebox blocked domain checks on every redirect (#16150)

The `blocked onebox domains` setting lets site owners change what sites
are allowed to be oneboxed. When a link is entered into a post,
Discourse checks the domain of the link against that setting and blocks
the onebox if the domain is blocked. But if there's a chain of
redirects, then only the final destination website is checked against
the site setting.

This commit amends that behavior so that every website in the redirect
chain is checked against the site setting, and if anything is blocked
the original link doesn't onebox at all in the post. The
`Discourse-No-Onebox` header is also checked in every response and the
onebox is blocked if the header is set to "1".

Additionally, Discourse will now include the `Discourse-No-Onebox`
header with every response if the site requires login to access content.
This is done to signal to a Discourse instance that it shouldn't attempt
to onebox other Discourse instances if they're login-only. Non-Discourse
websites can also include that header if they don't wish to have
Discourse onebox their content.

Internal ticket: t59305.
											
										
										
											2022-03-11 14:18:12 +08:00
+								      expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank
 								    end
 								    it "doesn't return title if the Discourse-No-Onebox header == 1" do
 								      stub_request(:get, "https://cat.com/meow/no-onebox")
 								        .to_return(
 								          status: 200,
 								          body: "<html><title>discourse stay away</title>",
 								          headers: { "Discourse-No-Onebox" => "1" }
 								        )
 								      expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank
-												FIX: follow redirects for inline/mini onebox (#13512)


											
										
										
											2021-06-24 22:23:39 +08:00
+								    end
-												FIX: Do not raise if title cannot be crawled (#16247)

If the crawled page returned an error, `FinalDestination#safe_get`
yielded `nil` for `uri` and `chunk` arguments. Another problem is that
`get` did not handle the case when `safe_get` failed and did not return
the `location` and `set_cookie` headers.
											
										
										
											2022-03-23 02:13:27 +08:00
 								    it "doesn't return a title if response is unsuccessful" do
 								      stub_request(:get, "https://example.com").to_return(status: 404, body: "")
 								      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
 								    end
-												DEV: Supress logs when RetrieveTitle.crawl fails with Net::ReadTimeout errors (#16971)

This PR changes the rescue block to rescue only Net::TimeoutError exceptions and removes the log line to avoid cluttering the logs with errors that are ignored. Other errors can bubble up because they're errors we probably want to know about.
											
										
										
											2022-06-10 03:30:22 +08:00
 								    it "it raises errors other than Net::ReadTimeout, e.g. NoMethodError" do
 								      stub_request(:get, "https://example.com").to_raise(NoMethodError)
 								      expect { RetrieveTitle.crawl("https://example.com") }.to raise_error(NoMethodError)
 								    end
 								    it "it ignores Net::ReadTimeout errors" do
 								      stub_request(:get, "https://example.com").to_raise(Net::ReadTimeout)
-												FIX: Gracefully handle DNS issued from SSRF lookup when inline oneboxing (#19631)

There is an issue where chat message processing breaks due to
unhandled `SocketError` exceptions originating in the SSRF check,
specifically in `FinalDestination::Resolver`.

This change gives `FinalDestination::SSRFDetector` a new error class
to wrap the `SocketError` in, and has the `RetrieveTitle` class
handle that error gracefully.
											
										
										
											2022-12-28 10:30:20 +08:00
+								      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
 								    end
 								    it "ignores SSRF lookup errors" do
 								      subject.stubs(:fetch_title).raises(FinalDestination::SSRFDetector::LookupFailedError)
 								      expect(RetrieveTitle.crawl("https://example.com")).to eq(nil)
-												DEV: Supress logs when RetrieveTitle.crawl fails with Net::ReadTimeout errors (#16971)

This PR changes the rescue block to rescue only Net::TimeoutError exceptions and removes the log line to avoid cluttering the logs with errors that are ignored. Other errors can bubble up because they're errors we probably want to know about.
											
										
										
											2022-06-10 03:30:22 +08:00
+								    end
-												FIX: Inline Onebox should use encoding from Content-Type header when present (#11625)

* FIX: Inline onebox should use encoding from Content-Type header when present

* Use Regexp.last_match(1)

Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com>
											
										
										
											2021-01-05 03:32:08 +08:00
+								  end
-												FIX: increase chunk size to fetch title tag correctly (#14144)


											
										
										
											2021-09-03 15:45:58 +08:00
-												DEV: Use proper wording for contexts in specs

											
										
										
											2022-07-28 00:14:14 +08:00
+								  describe '.fetch_title' do
-												FIX: increase chunk size to fetch title tag correctly (#14144)


											
										
										
											2021-09-03 15:45:58 +08:00
+								    it "does not parse broken title tag" do
 								      # webmock does not do chunks
 								      stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
 								        to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</ti" , headers: {})
 								      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
 								      expect(title).to eq(nil)
 								    end
 								    it "can parse correct title tag" do
 								      # webmock does not do chunks
 								      stub_request(:get, "https://en.wikipedia.org/wiki/Internet").
 								        to_return(status: 200, body: "<html><head><title>Internet - Wikipedia</title>" , headers: {})
 								      title = RetrieveTitle.fetch_title("https://en.wikipedia.org/wiki/Internet")
 								      expect(title).to eq("Internet - Wikipedia")
 								    end
 								  end
-												FEATURE: Whitelists for inline oneboxing

											
										
										
											2017-07-22 03:29:04 +08:00
+								end