discourse/lib/retrieve_title.rb

# frozen_string_literal: true

# Retrieves the title of a remote HTML page, downloading only as much of the
# document as is needed to find it.
module RetrieveTitle
  # Timeout passed to FinalDestination for the crawl request
  # (presumably seconds - confirm against FinalDestination).
  CRAWL_TIMEOUT = 1

  # Download budget, in KiB, for hosts without a special case.
  DEFAULT_MAX_CHUNK_KB = 20

  # Amazon and YouTube leave the title until very late. Exceptions are bad
  # but these are large sites.
  #
  # Each entry is [host pattern, budget in KiB]. Patterns are anchored on a
  # host-label boundary so that look-alike hosts (e.g. "notamazon.com" or
  # "youtuxbe.example.com") do not receive the enlarged budget.
  LATE_TITLE_HOSTS = [
    [/(?:\A|\.)amazon\.(?:com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)\z/, 500],
    [/(?:\A|\.)(?:youtube\.com|youtu\.be)\z/, 300],
  ].freeze

  # Fetches the title of the page at +url+.
  #
  # Returns the title String, or nil when no title was found or when the
  # fetch failed with any StandardError (connection errors, timeouts, ...).
  # Non-StandardError exceptions such as Interrupt or SystemExit are
  # deliberately allowed to propagate.
  def self.crawl(url)
    fetch_title(url)
  rescue StandardError
    # If there was a connection error, do nothing
    nil
  end

  # Extracts a page title from a (possibly partial) HTML string.
  #
  # html     - the HTML read so far.
  # encoding - optional encoding name handed to the HTML5 parser.
  #
  # Returns the whitespace-normalised title, or nil when none is available
  # (including when a <title> tag has been opened but not yet closed).
  def self.extract_title(html, encoding = nil)
    # An opening <title> without its closing tag means the data read so far
    # ends mid-title; report "nothing yet" rather than a truncated title.
    return nil if html =~ /<title>/ && html !~ /<\/title>/

    title = nil
    if doc = Nokogiri::HTML5(html, nil, encoding)
      title = doc.at('title')&.inner_text

      # A horrible hack - YouTube uses `document.title` to populate the title
      # for some reason. For any other site than YouTube this wouldn't be worth it.
      if title == "YouTube" && html =~ /document\.title *= *"(.*)";/
        title = Regexp.last_match(1).sub(/ - YouTube$/, '')
      end

      # Fall back to Open Graph when there is no usable <title>. `blank?`
      # (rather than a nil check) also covers an empty `<title></title>`,
      # whose inner text is "" and would otherwise block the fallback.
      if title.blank? && (node = doc.at('meta[property="og:title"]'))
        title = node['content']
      end
    end

    return nil unless title.present?

    # Newlines become spaces, runs of spaces collapse to one, ends are trimmed.
    title.tr("\n", ' ').squeeze(' ').strip
  end

  # NOTE: `private` does not apply to methods defined with `def self.`, so
  # the methods below remain publicly callable. They are left that way to
  # keep the module's existing interface unchanged.
  private

  # Returns how many KiB of a response body may be read while looking for
  # the title of +uri+. A nil +uri+, or one without a host, gets the default.
  def self.max_chunk_size(uri)
    host = uri&.host
    return DEFAULT_MAX_CHUNK_KB unless host

    LATE_TITLE_HOSTS.each do |pattern, kilobytes|
      return kilobytes if pattern.match?(host)
    end

    # default is 20k
    DEFAULT_MAX_CHUNK_KB
  end

  # Pulls the charset out of a Content-Type header value.
  #
  # Returns the lower-cased charset name when it names an encoding present in
  # Encoding.list, otherwise nil (also for a nil header).
  def self.charset_from_content_type(header)
    return nil unless header

    charset = header.strip.downcase[/charset="?([a-z0-9_-]+)"?/, 1]
    return nil unless charset

    charset if Encoding.list.any? { |known| known.name.downcase == charset }
  end

  # Fetch the beginning of a HTML document at a url
  #
  # Reads the body chunk by chunk through FinalDestination#get and stops as
  # soon as a title is found or the per-host size budget is exceeded.
  # Returns the title String or nil.
  def self.fetch_title(url)
    fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT)

    current = nil
    title = nil
    encoding = nil

    fd.get do |response, chunk, uri|
      # `throw :done` ends the download; the matching `catch` is presumably
      # inside FinalDestination#get - it is not visible in this file.
      throw :done if uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname)

      # Bodies of intermediate redirect responses are not part of the page.
      next if Net::HTTPRedirection === response

      if current
        current << chunk
      else
        # Copy the first chunk so later appends do not mutate the object
        # that FinalDestination yielded to us.
        current = chunk.dup
      end

      # Keep looking at the header until a usable charset has been found.
      encoding ||= charset_from_content_type(response['content-type'])

      title = extract_title(current, encoding)
      throw :done if title || max_chunk_size(uri) * 1024 < current.length
    end
    title
  end
end
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`# frozen_string_literal: true`

FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`module RetrieveTitle`
PERF: ability to crawl for titles without extra HEAD req Also, introduces a much more aggressive timeout for title crawling and introduces gzip to body that is crawled 2018-01-29 12:36:52 +08:00			`CRAWL_TIMEOUT = 1`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00
			`def self.crawl(url)`
PERF: ability to crawl for titles without extra HEAD req Also, introduces a much more aggressive timeout for title crawling and introduces gzip to body that is crawled 2018-01-29 12:36:52 +08:00			`fetch_title(url)`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`rescue Exception`
			`# If there was a connection error, do nothing`
			`end`

FIX: Inline Onebox should use encoding from Content-Type header when present (#11625) * FIX: Inline onebox should use encoding from Content-Type header when present * Use Regexp.last_match(1) Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com> 2021-01-05 03:32:08 +08:00			`def self.extract_title(html, encoding = nil)`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`title = nil`
FIX: increase chunk size to fetch title tag correctly (#14144) 2021-09-03 15:45:58 +08:00			`if html =~ /<title>/ && html !~ /<\/title>/`
			`return nil`
			`end`
FIX: Inline Onebox should use encoding from Content-Type header when present (#11625) * FIX: Inline onebox should use encoding from Content-Type header when present * Use Regexp.last_match(1) Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com> 2021-01-05 03:32:08 +08:00			`if doc = Nokogiri::HTML5(html, nil, encoding)`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00
FEATURE: option to enable inline oneboxes for all domains Also, change to prefer title over open graph which is often way too sparse 2017-08-03 02:27:21 +08:00			`title = doc.at('title')&.inner_text`

FIX: Hack our title retriever so that it parses YouTube URLs 2017-09-28 21:29:50 +08:00			# A horrible hack - YouTube uses `document.title` to populate the title
			`# for some reason. For any other site than YouTube this wouldn't be worth it.`
			`if title == "YouTube" && html =~ /document\.title = "(.*)";/`
			`title = Regexp.last_match[1].sub(/ - YouTube$/, '')`
			`end`

FEATURE: option to enable inline oneboxes for all domains Also, change to prefer title over open graph which is often way too sparse 2017-08-03 02:27:21 +08:00			`if !title && node = doc.at('meta[property="og:title"]')`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`title = node['content']`
			`end`
			`end`

			`if title.present?`
			`title.gsub!(/\n/, ' ')`
			`title.gsub!(/ +/, ' ')`
			`title.strip!`
			`return title`
			`end`
			`nil`
			`end`

			`private`

			`def self.max_chunk_size(uri)`
FIX: Hack our title retriever so that it parses YouTube URLs 2017-09-28 21:29:50 +08:00
			`# Amazon and YouTube leave the title until very late. Exceptions are bad`
			`# but these are large sites.`
PERF: ability to crawl for titles without extra HEAD req Also, introduces a much more aggressive timeout for title crawling and introduces gzip to body that is crawled 2018-01-29 12:36:52 +08:00			`return 500 if uri.host =~ /amazon\.(com\|ca\|co\.uk\|es\|fr\|de\|it\|com\.au\|com\.br\|cn\|in\|co\.jp\|com\.mx)$/`
FIX: Hack our title retriever so that it parses YouTube URLs 2017-09-28 21:29:50 +08:00			`return 300 if uri.host =~ /youtube\.com$/ \|\| uri.host =~ /youtu.be/`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00
FIX: increase chunk size to fetch title tag correctly (#14144) 2021-09-03 15:45:58 +08:00			`# default is 20k`
			`20`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`end`
PERF: ability to crawl for titles without extra HEAD req Also, introduces a much more aggressive timeout for title crawling and introduces gzip to body that is crawled 2018-01-29 12:36:52 +08:00
			`# Fetch the beginning of a HTML document at a url`
			`def self.fetch_title(url)`
			`fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT)`

			`current = nil`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`title = nil`
FIX: Inline Onebox should use encoding from Content-Type header when present (#11625) * FIX: Inline onebox should use encoding from Content-Type header when present * Use Regexp.last_match(1) Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com> 2021-01-05 03:32:08 +08:00			`encoding = nil`
PERF: ability to crawl for titles without extra HEAD req Also, introduces a much more aggressive timeout for title crawling and introduces gzip to body that is crawled 2018-01-29 12:36:52 +08:00
			`fd.get do \|_response, chunk, uri\|`
FIX: Only block domains at the final destination (#15689) (#15783) In an earlier PR, we decided that we only want to block a domain if the blocked domain in the SiteSetting is the final destination (/t/59305). That PR used `FinalDestination#get`. `resolve` however is used several places but blocks domains along the redirect chain when certain options are provided. This commit changes the default options for `resolve` to not do that. Existing users of `FinalDestination#resolve` are - `Oneboxer#external_onebox` - our onebox helper `fetch_html_doc`, which is used in amazon, standard embed and youtube - these folks already go through `Oneboxer#external_onebox` which already blocks correctly 2022-02-03 09:42:06 +08:00			`if (uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname))`
FIX: Respect blocked domains list when redirecting (#15656) Our previous implementation used a simple `blocked_domain_array.include?(hostname)` so some values were not matching. Additionally, in some configurations like ours, we'd used "cat.*.dog.com" with the assumption we'd support globbing. This change implicitly allows globbing by blocking "http://a.b.com" if "b.com" is a blocked domain but does not actively do anything for "*". An upcoming change might include frontend validation for values that can be inserted. 2022-01-20 14:12:34 +08:00			`throw :done`
			`end`

FIX: follow redirects for inline/mini onebox (#13512) 2021-06-24 22:23:39 +08:00			`unless Net::HTTPRedirection === _response`
			`if current`
			`current << chunk`
			`else`
			`current = chunk`
			`end`
PERF: ability to crawl for titles without extra HEAD req Also, introduces a much more aggressive timeout for title crawling and introduces gzip to body that is crawled 2018-01-29 12:36:52 +08:00
FIX: follow redirects for inline/mini onebox (#13512) 2021-06-24 22:23:39 +08:00			`if !encoding && content_type = _response['content-type']&.strip&.downcase`
			`if content_type =~ /charset="?([a-z0-9_-]+)"?/`
			`encoding = Regexp.last_match(1)`
			`if !Encoding.list.map(&:name).map(&:downcase).include?(encoding)`
			`encoding = nil`
			`end`
FIX: Inline Onebox should use encoding from Content-Type header when present (#11625) * FIX: Inline onebox should use encoding from Content-Type header when present * Use Regexp.last_match(1) Signed-off-by: OsamaSayegh <asooomaasoooma90@gmail.com> 2021-01-05 03:32:08 +08:00			`end`
			`end`
Make rubocop happy again. 2018-06-07 13:28:18 +08:00
FIX: follow redirects for inline/mini onebox (#13512) 2021-06-24 22:23:39 +08:00			`max_size = max_chunk_size(uri) * 1024`
			`title = extract_title(current, encoding)`
			`throw :done if title \|\| max_size < current.length`
			`end`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`end`
DEV: Apply Rubocop redundant return style 2019-11-15 04:10:51 +08:00			`title`
Make rubocop happy again. 2018-06-07 13:28:18 +08:00			`end`
FEATURE: Whitelists for inline oneboxing 2017-07-22 03:29:04 +08:00			`end`