mirror of
https://github.com/discourse/discourse.git
synced 2024-12-12 21:44:31 +08:00
59867cc091
### Background When SSRF detection fails, the exception bubbles all the way up, causing a log alert. This isn't actionable, and should instead be ignored. The existing `rescue` does already ignore network errors, but fails to account for SSRF exceptions coming from `FinalDestination`. ### What is this change? This PR does two things. --- Firstly, it introduces a common root exception class, `FinalDestination::SSRFError` for SSRF errors. This serves two functions: 1) it makes it easier to rescue both errors at once, which is generally what one wants to do and 2) prevents having to dig deep into the class hierarchy for the constant. This change is fully backwards compatible thanks to how inheritance and exception handling works. --- Secondly, it rescues this new exception in `UserAvatar.import_url_for_user`, which is causing sporadic errors to be logged in production. After this SSRF errors are handled the same as network errors.
107 lines
3.0 KiB
Ruby
107 lines
3.0 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module RetrieveTitle
|
|
CRAWL_TIMEOUT = 1
|
|
|
|
def self.crawl(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
|
|
fetch_title(
|
|
url,
|
|
max_redirects: max_redirects,
|
|
initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit,
|
|
)
|
|
rescue Net::ReadTimeout, FinalDestination::SSRFError
|
|
# do nothing for Net::ReadTimeout errors
|
|
end
|
|
|
|
def self.extract_title(html, encoding = nil)
|
|
title = nil
|
|
return nil if html =~ /<title>/ && html !~ %r{</title>}
|
|
|
|
doc = nil
|
|
begin
|
|
doc = Nokogiri.HTML5(html, nil, encoding)
|
|
rescue ArgumentError
|
|
# invalid HTML (Eg: too many attributes, status tree too deep) - ignore
|
|
# Error in nokogumbo is not specialized, uses generic ArgumentError
|
|
# see: https://www.rubydoc.info/gems/nokogiri/Nokogiri/HTML5#label-Error+reporting
|
|
end
|
|
|
|
if doc
|
|
title = doc.at("title")&.inner_text
|
|
|
|
# A horrible hack - YouTube uses `document.title` to populate the title
|
|
# for some reason. For any other site than YouTube this wouldn't be worth it.
|
|
if title == "YouTube" && html =~ /document\.title *= *"(.*)";/
|
|
title = Regexp.last_match[1].sub(/ - YouTube\z/, "")
|
|
end
|
|
|
|
if !title && node = doc.at('meta[property="og:title"]')
|
|
title = node["content"]
|
|
end
|
|
end
|
|
|
|
if title.present?
|
|
title.gsub!(/\n/, " ")
|
|
title.gsub!(/ +/, " ")
|
|
title.strip!
|
|
return title
|
|
end
|
|
nil
|
|
end
|
|
|
|
private
|
|
|
|
def self.max_chunk_size(uri)
|
|
# Exception for sites that leave the title until very late.
|
|
if uri.host =~
|
|
/(^|\.)amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)\z/
|
|
return 500
|
|
end
|
|
return 300 if uri.host =~ /(^|\.)youtube\.com\z/ || uri.host =~ /(^|\.)youtu\.be\z/
|
|
return 50 if uri.host =~ /(^|\.)github\.com\z/
|
|
|
|
# default is 20k
|
|
20
|
|
end
|
|
|
|
# Fetch the beginning of a HTML document at a url
|
|
def self.fetch_title(url, max_redirects: nil, initial_https_redirect_ignore_limit: false)
|
|
fd =
|
|
FinalDestination.new(
|
|
url,
|
|
timeout: CRAWL_TIMEOUT,
|
|
stop_at_blocked_pages: true,
|
|
max_redirects: max_redirects,
|
|
initial_https_redirect_ignore_limit: initial_https_redirect_ignore_limit,
|
|
)
|
|
|
|
current = nil
|
|
title = nil
|
|
encoding = nil
|
|
|
|
fd.get do |_response, chunk, uri|
|
|
unless Net::HTTPRedirection === _response
|
|
throw :done if uri.blank?
|
|
|
|
if current
|
|
current << chunk
|
|
else
|
|
current = chunk
|
|
end
|
|
|
|
if !encoding && content_type = _response["content-type"]&.strip&.downcase
|
|
if content_type =~ /charset="?([a-z0-9_-]+)"?/
|
|
encoding = Regexp.last_match(1)
|
|
encoding = nil if !Encoding.list.map(&:name).map(&:downcase).include?(encoding)
|
|
end
|
|
end
|
|
|
|
max_size = max_chunk_size(uri) * 1024
|
|
title = extract_title(current, encoding)
|
|
throw :done if title || max_size < current.length
|
|
end
|
|
end
|
|
title
|
|
end
|
|
end
|