discourse/lib/retrieve_title.rb
Sam df04462475
FIX: ignore malformed HTML for title extraction (#18040)
Certain HTML can be rejected by nokogumbo, specifically cases where there
are enormous amounts of attributes

This ensures that malformed HTML is simply skipped instead of leaking out
an exception and terminating downstream processes.
2022-08-23 15:03:57 +10:00

105 lines
2.8 KiB
Ruby

# frozen_string_literal: true
module RetrieveTitle
CRAWL_TIMEOUT = 1
def self.crawl(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
fetch_title(
url,
max_redirects: max_redirects,
initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit
)
rescue Net::ReadTimeout
# do nothing for Net::ReadTimeout errors
end
def self.extract_title(html, encoding = nil)
title = nil
if html =~ /<title>/ && html !~ /<\/title>/
return nil
end
doc = nil
begin
doc = Nokogiri::HTML5(html, nil, encoding)
rescue ArgumentError
# invalid HTML (too many attributes) - ignore
end
if doc
title = doc.at('title')&.inner_text
# A horrible hack - YouTube uses `document.title` to populate the title
# for some reason. For any other site than YouTube this wouldn't be worth it.
if title == "YouTube" && html =~ /document\.title *= *"(.*)";/
title = Regexp.last_match[1].sub(/ - YouTube$/, '')
end
if !title && node = doc.at('meta[property="og:title"]')
title = node['content']
end
end
if title.present?
title.gsub!(/\n/, ' ')
title.gsub!(/ +/, ' ')
title.strip!
return title
end
nil
end
private
def self.max_chunk_size(uri)
# Exception for sites that leave the title until very late.
return 500 if uri.host =~ /(^|\.)amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
return 300 if uri.host =~ /(^|\.)youtube\.com$/ || uri.host =~ /(^|\.)youtu\.be$/
return 50 if uri.host =~ /(^|\.)github\.com$/
# default is 20k
20
end
# Fetch the beginning of a HTML document at a url
def self.fetch_title(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
fd = FinalDestination.new(
url,
timeout: CRAWL_TIMEOUT,
stop_at_blocked_pages: true,
max_redirects: max_redirects,
initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit
)
current = nil
title = nil
encoding = nil
fd.get do |_response, chunk, uri|
unless Net::HTTPRedirection === _response
throw :done if uri.blank?
if current
current << chunk
else
current = chunk
end
if !encoding && content_type = _response['content-type']&.strip&.downcase
if content_type =~ /charset="?([a-z0-9_-]+)"?/
encoding = Regexp.last_match(1)
if !Encoding.list.map(&:name).map(&:downcase).include?(encoding)
encoding = nil
end
end
end
max_size = max_chunk_size(uri) * 1024
title = extract_title(current, encoding)
throw :done if title || max_size < current.length
end
end
title
end
end