mirror of
https://github.com/discourse/discourse.git
synced 2024-12-12 21:58:33 +08:00
59867cc091
### Background When SSRF detection fails, the exception bubbles all the way up, causing a log alert. This isn't actionable, and should instead be ignored. The existing `rescue` does already ignore network errors, but fails to account for SSRF exceptions coming from `FinalDestination`. ### What is this change? This PR does two things. --- Firstly, it introduces a common root exception class, `FinalDestination::SSRFError` for SSRF errors. This serves two functions: 1) it makes it easier to rescue both errors at once, which is generally what one wants to do and 2) prevents having to dig deep into the class hierarchy for the constant. This change is fully backwards compatible thanks to how inheritance and exception handling works. --- Secondly, it rescues this new exception in `UserAvatar.import_url_for_user`, which is causing sporadic errors to be logged in production. After this SSRF errors are handled the same as network errors.
561 lines
15 KiB
Ruby
561 lines
15 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "socket"
|
|
require "ipaddr"
|
|
require "excon"
|
|
require "rate_limiter"
|
|
require "url_helper"
|
|
|
|
# Determine the final endpoint for a Web URI, following redirects
|
|
class FinalDestination
|
|
class SSRFError < SocketError
|
|
end
|
|
|
|
MAX_REQUEST_TIME_SECONDS = 10
|
|
MAX_REQUEST_SIZE_BYTES = 5_242_880 # 1024 * 1024 * 5
|
|
|
|
def self.clear_https_cache!(domain)
|
|
key = redis_https_key(domain)
|
|
Discourse.redis.without_namespace.del(key)
|
|
end
|
|
|
|
def self.cache_https_domain(domain)
|
|
key = redis_https_key(domain)
|
|
Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1")
|
|
end
|
|
|
|
def self.is_https_domain?(domain)
|
|
key = redis_https_key(domain)
|
|
Discourse.redis.without_namespace.get(key).present?
|
|
end
|
|
|
|
def self.redis_https_key(domain)
|
|
"HTTPS_DOMAIN_#{domain}"
|
|
end
|
|
|
|
DEFAULT_USER_AGENT =
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
|
|
|
|
attr_reader :status, :cookie, :status_code, :content_type, :ignored
|
|
|
|
def initialize(url, opts = nil)
|
|
@url = url
|
|
@uri = uri(normalized_url) if @url
|
|
|
|
@opts = opts || {}
|
|
@force_get_hosts = @opts[:force_get_hosts] || []
|
|
@preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
|
|
@force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
|
|
@default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
|
|
@opts[:max_redirects] ||= 5
|
|
@https_redirect_ignore_limit = @opts[:initial_https_redirect_ignore_limit]
|
|
|
|
@max_redirects = @opts[:max_redirects]
|
|
@limit = @max_redirects
|
|
|
|
@ignored = []
|
|
if @limit > 0
|
|
ignore_redirects = [Discourse.base_url_no_prefix]
|
|
|
|
ignore_redirects.concat(@opts[:ignore_redirects]) if @opts[:ignore_redirects]
|
|
|
|
ignore_redirects.each do |ignore_redirect|
|
|
ignore_redirect = uri(ignore_redirect)
|
|
@ignored << ignore_redirect.hostname if ignore_redirect.present? && ignore_redirect.hostname
|
|
end
|
|
end
|
|
|
|
@status = :ready
|
|
@follow_canonical = @opts[:follow_canonical]
|
|
@http_verb = @opts[:http_verb] || http_verb(@force_get_hosts, @follow_canonical)
|
|
@cookie = nil
|
|
@limited_ips = []
|
|
@verbose = @opts[:verbose] || false
|
|
@timeout = @opts[:timeout] || nil
|
|
@preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
|
|
@validate_uri = @opts.fetch(:validate_uri) { true }
|
|
@user_agent =
|
|
(
|
|
if @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) }
|
|
Onebox.options.user_agent
|
|
else
|
|
@default_user_agent
|
|
end
|
|
)
|
|
@stop_at_blocked_pages = @opts[:stop_at_blocked_pages]
|
|
end
|
|
|
|
def self.connection_timeout
|
|
20
|
|
end
|
|
|
|
def self.resolve(url, opts = nil)
|
|
new(url, opts).resolve
|
|
end
|
|
|
|
def http_verb(force_get_hosts, follow_canonical)
|
|
if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) }
|
|
:get
|
|
else
|
|
:head
|
|
end
|
|
end
|
|
|
|
def timeout
|
|
@timeout || FinalDestination.connection_timeout
|
|
end
|
|
|
|
def redirected?
|
|
@limit < @max_redirects
|
|
end
|
|
|
|
def request_headers
|
|
result = {
|
|
"User-Agent" => @user_agent,
|
|
"Accept" => "*/*",
|
|
"Accept-Language" => "*",
|
|
"Host" => @uri.hostname,
|
|
}
|
|
|
|
result["Cookie"] = @cookie if @cookie
|
|
|
|
result
|
|
end
|
|
|
|
def small_get(request_headers)
|
|
status_code, response_headers = nil
|
|
|
|
catch(:done) do
|
|
FinalDestination::HTTP.start(
|
|
@uri.host,
|
|
@uri.port,
|
|
use_ssl: @uri.is_a?(URI::HTTPS),
|
|
open_timeout: timeout,
|
|
) do |http|
|
|
http.read_timeout = timeout
|
|
http.request_get(@uri.request_uri, request_headers) do |resp|
|
|
status_code = resp.code.to_i
|
|
response_headers = resp.to_hash
|
|
|
|
# see: https://bugs.ruby-lang.org/issues/15624
|
|
# if we allow response to return then body will be read
|
|
# got to abort without reading body
|
|
throw :done
|
|
end
|
|
end
|
|
end
|
|
|
|
[status_code, response_headers]
|
|
end
|
|
|
|
# this is a new interface for simply getting
|
|
# N bytes accounting for all internal logic
|
|
def get(redirects = @limit, extra_headers: {}, &blk)
|
|
raise "Must specify block" unless block_given?
|
|
|
|
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
|
|
@uri.scheme = "https"
|
|
@uri = URI(@uri.to_s)
|
|
end
|
|
|
|
return if !validate_uri
|
|
return if @stop_at_blocked_pages && blocked_domain?(@uri)
|
|
|
|
result, headers_subset = safe_get(@uri, &blk)
|
|
return if !result
|
|
|
|
cookie = headers_subset.set_cookie
|
|
location = headers_subset.location
|
|
|
|
if result == :redirect
|
|
return if !location
|
|
|
|
old_uri = @uri
|
|
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
|
@uri = uri(location)
|
|
|
|
if @uri && redirects == @max_redirects && @https_redirect_ignore_limit &&
|
|
same_uri_but_https?(old_uri, @uri)
|
|
redirects += 1
|
|
@https_redirect_ignore_limit = false
|
|
end
|
|
|
|
return if redirects == 0
|
|
|
|
# https redirect, so just cache that whole new domain is https
|
|
if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
|
|
FinalDestination.cache_https_domain(@uri.hostname)
|
|
end
|
|
|
|
return if !@uri
|
|
|
|
extra = nil
|
|
extra = { "Cookie" => cookie } if cookie
|
|
|
|
get(redirects - 1, extra_headers: extra, &blk)
|
|
elsif result == :ok
|
|
@uri.to_s
|
|
else
|
|
nil
|
|
end
|
|
end
|
|
|
|
def resolve
|
|
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
|
|
@uri.scheme = "https"
|
|
@uri = URI(@uri.to_s)
|
|
end
|
|
|
|
if @limit < 0
|
|
@status = :too_many_redirects
|
|
log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}")
|
|
return
|
|
end
|
|
|
|
unless validate_uri
|
|
@status = :invalid_address
|
|
log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}")
|
|
return
|
|
end
|
|
|
|
@ignored.each do |host|
|
|
if @uri&.hostname&.match?(host)
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
end
|
|
|
|
if Oneboxer.cached_response_body_exists?(@uri.to_s)
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
|
|
headers = request_headers
|
|
middlewares = Excon.defaults[:middlewares].dup
|
|
middlewares << Excon::Middleware::Decompress if @http_verb == :get
|
|
|
|
request_start_time = Time.now
|
|
response_body = +""
|
|
request_validator =
|
|
lambda do |chunk, _remaining_bytes, _total_bytes|
|
|
response_body << chunk
|
|
if response_body.bytesize > MAX_REQUEST_SIZE_BYTES
|
|
raise Excon::Errors::ExpectationFailed.new("response size too big: #{@uri.to_s}")
|
|
end
|
|
if Time.now - request_start_time > MAX_REQUEST_TIME_SECONDS
|
|
raise Excon::Errors::ExpectationFailed.new("connect timeout reached: #{@uri.to_s}")
|
|
end
|
|
end
|
|
|
|
# This technique will only use the first resolved IP
|
|
# TODO: Can we standardise this by using FinalDestination::HTTP?
|
|
begin
|
|
resolved_ip = SSRFDetector.lookup_and_filter_ips(@uri.hostname).first
|
|
rescue SSRFDetector::DisallowedIpError, SocketError, Timeout::Error
|
|
@status = :invalid_address
|
|
return
|
|
end
|
|
request_uri = @uri.dup
|
|
request_uri.hostname = resolved_ip unless Rails.env.test? # WebMock doesn't understand the IP-based requests
|
|
|
|
response =
|
|
Excon.public_send(
|
|
@http_verb,
|
|
request_uri.to_s,
|
|
read_timeout: timeout,
|
|
connect_timeout: timeout,
|
|
headers: { "Host" => @uri.hostname }.merge(headers),
|
|
middlewares: middlewares,
|
|
response_block: request_validator,
|
|
ssl_verify_peer_host: @uri.hostname,
|
|
)
|
|
|
|
if @stop_at_blocked_pages
|
|
if blocked_domain?(@uri) || response.headers["Discourse-No-Onebox"] == "1"
|
|
@status = :blocked_page
|
|
return
|
|
end
|
|
end
|
|
|
|
location = nil
|
|
response_headers = nil
|
|
response_status = response.status.to_i
|
|
|
|
case response.status
|
|
when 200
|
|
# Cache body of successful `get` requests
|
|
if @http_verb == :get
|
|
if Oneboxer.cache_response_body?(@uri)
|
|
Oneboxer.cache_response_body(@uri.to_s, response_body)
|
|
end
|
|
end
|
|
|
|
if @follow_canonical
|
|
next_url = fetch_canonical_url(response_body)
|
|
|
|
if next_url.to_s.present? && next_url != @uri
|
|
@follow_canonical = false
|
|
@uri = next_url
|
|
@http_verb = http_verb(@force_get_hosts, @follow_canonical)
|
|
|
|
return resolve
|
|
end
|
|
end
|
|
|
|
@content_type = response.headers["Content-Type"] if response.headers.has_key?("Content-Type")
|
|
@status = :resolved
|
|
return @uri
|
|
when 103, 400, 405, 406, 409, 500, 501
|
|
response_status, small_headers = small_get(request_headers)
|
|
|
|
if @stop_at_blocked_pages
|
|
# this may seem weird, but the #to_hash method of the response object
|
|
# of ruby's net/http lib returns a hash where each value is an array.
|
|
# small_headers here is like that so our no onebox header value is an
|
|
# array if it's set. Also the hash keys are always lower-cased.
|
|
dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1"
|
|
if dont_onebox || blocked_domain?(@uri)
|
|
@status = :blocked_page
|
|
return
|
|
end
|
|
end
|
|
|
|
if response_status == 200
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
|
|
response_headers = {}
|
|
if cookie_val = small_headers["set-cookie"]
|
|
response_headers[:cookies] = cookie_val
|
|
end
|
|
|
|
if location_val = small_headers["location"]
|
|
response_headers[:location] = location_val.join
|
|
end
|
|
end
|
|
|
|
unless response_headers
|
|
response_headers = {
|
|
cookies: response.data[:cookies] || response.headers[:"set-cookie"],
|
|
location: response.headers[:location],
|
|
}
|
|
end
|
|
|
|
location = response_headers[:location] if (300..399).include?(response_status)
|
|
|
|
if cookies = response_headers[:cookies]
|
|
@cookie = Array.wrap(cookies).map { |c| c.split(";").first.strip }.join("; ")
|
|
end
|
|
|
|
if location
|
|
redirect_uri = uri(location)
|
|
if @uri.host == redirect_uri.host &&
|
|
(redirect_uri.path =~ %r{/login} || redirect_uri.path =~ %r{/session})
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
|
|
old_uri = @uri
|
|
location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
|
|
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
|
@uri = uri(location)
|
|
|
|
if @uri && @limit == @max_redirects && @https_redirect_ignore_limit &&
|
|
same_uri_but_https?(old_uri, @uri)
|
|
@limit += 1
|
|
@https_redirect_ignore_limit = false
|
|
end
|
|
@limit -= 1
|
|
|
|
# https redirect, so just cache that whole new domain is https
|
|
if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
|
|
FinalDestination.cache_https_domain(@uri.hostname)
|
|
end
|
|
return resolve
|
|
end
|
|
|
|
# this is weird an exception seems better
|
|
@status = :failure
|
|
@status_code = response.status
|
|
|
|
log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}")
|
|
nil
|
|
rescue Excon::Errors::Timeout
|
|
log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
|
|
nil
|
|
end
|
|
|
|
def skip_validations?
|
|
!@validate_uri
|
|
end
|
|
|
|
def validate_uri
|
|
skip_validations? || validate_uri_format
|
|
end
|
|
|
|
def validate_uri_format
|
|
return false unless @uri && @uri.host
|
|
return false unless %w[https http].include?(@uri.scheme)
|
|
return false if @uri.scheme == "http" && @uri.port != 80
|
|
return false if @uri.scheme == "https" && @uri.port != 443
|
|
|
|
# Disallow IP based crawling
|
|
(
|
|
begin
|
|
IPAddr.new(@uri.hostname)
|
|
rescue StandardError
|
|
nil
|
|
end
|
|
).nil?
|
|
end
|
|
|
|
def hostname
|
|
@uri.hostname
|
|
end
|
|
|
|
def hostname_matches?(url)
|
|
url = uri(url)
|
|
|
|
if @uri&.hostname.present? && url&.hostname.present?
|
|
hostname_parts = url.hostname.split(".")
|
|
has_wildcard = hostname_parts.first == "*"
|
|
|
|
if has_wildcard
|
|
@uri.hostname.end_with?(hostname_parts[1..-1].join("."))
|
|
else
|
|
@uri.hostname == url.hostname
|
|
end
|
|
end
|
|
end
|
|
|
|
def normalized_url
|
|
UrlHelper.normalized_encode(@url)
|
|
end
|
|
|
|
def log(log_level, message)
|
|
return unless @verbose
|
|
return if @status_code == 404
|
|
|
|
Rails.logger.public_send(
|
|
log_level,
|
|
"#{RailsMultisite::ConnectionManagement.current_db}: #{message}",
|
|
)
|
|
end
|
|
|
|
protected
|
|
|
|
def safe_get(uri)
|
|
result = nil
|
|
unsafe_close = false
|
|
headers_subset = Struct.new(:location, :set_cookie).new
|
|
|
|
safe_session(uri) do |http|
|
|
headers = request_headers.merge("Accept-Encoding" => "gzip", "Host" => uri.host)
|
|
|
|
req = FinalDestination::HTTP::Get.new(uri.request_uri, headers)
|
|
|
|
http.request(req) do |resp|
|
|
headers_subset.set_cookie = resp["Set-Cookie"]
|
|
|
|
if @stop_at_blocked_pages
|
|
dont_onebox = resp["Discourse-No-Onebox"] == "1"
|
|
if dont_onebox
|
|
result = :blocked, headers_subset
|
|
next
|
|
end
|
|
end
|
|
|
|
if Net::HTTPRedirection === resp
|
|
headers_subset.location = resp["location"]
|
|
result = :redirect, headers_subset
|
|
end
|
|
|
|
if Net::HTTPSuccess === resp
|
|
resp.decode_content = true
|
|
resp.read_body do |chunk|
|
|
read_next = true
|
|
|
|
catch(:done) do
|
|
if read_next
|
|
read_next = false
|
|
yield resp, chunk, uri
|
|
read_next = true
|
|
end
|
|
end
|
|
|
|
# no clean way of finishing abruptly cause
|
|
# response likes reading till the end
|
|
if !read_next
|
|
unsafe_close = true
|
|
http.finish
|
|
raise StandardError
|
|
end
|
|
end
|
|
result = :ok, headers_subset
|
|
else
|
|
catch(:done) { yield resp, nil, nil }
|
|
end
|
|
end
|
|
end
|
|
|
|
result
|
|
rescue Timeout::Error
|
|
log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
|
|
nil
|
|
rescue OpenSSL::SSL::SSLError => exception
|
|
log(:warn, "An error with SSL occurred: #{@uri} #{exception.message}")
|
|
nil
|
|
rescue StandardError
|
|
unsafe_close ? [:ok, headers_subset] : raise
|
|
end
|
|
|
|
def safe_session(uri)
|
|
FinalDestination::HTTP.start(
|
|
uri.host,
|
|
uri.port,
|
|
use_ssl: (uri.scheme == "https"),
|
|
open_timeout: timeout,
|
|
) do |http|
|
|
http.read_timeout = timeout
|
|
yield http
|
|
end
|
|
end
|
|
|
|
private
|
|
|
|
def uri(location)
|
|
begin
|
|
URI.parse(location)
|
|
rescue URI::Error
|
|
end
|
|
end
|
|
|
|
def fetch_canonical_url(body)
|
|
return if body.blank?
|
|
|
|
canonical_element = Nokogiri.HTML5(body).at("link[rel='canonical']")
|
|
return if canonical_element.nil?
|
|
canonical_uri = uri(canonical_element["href"])
|
|
return if canonical_uri.blank?
|
|
|
|
return canonical_uri if canonical_uri.host.present?
|
|
parts = [@uri.host, canonical_uri.to_s]
|
|
complete_url = canonical_uri.to_s.starts_with?("/") ? parts.join("") : parts.join("/")
|
|
complete_url = "#{@uri.scheme}://#{complete_url}" if @uri.scheme
|
|
|
|
uri(complete_url)
|
|
end
|
|
|
|
def blocked_domain?(uri)
|
|
Onebox::DomainChecker.is_blocked?(uri.hostname)
|
|
end
|
|
|
|
def same_uri_but_https?(before, after)
|
|
before = before.to_s
|
|
after = after.to_s
|
|
before.start_with?("http://") && after.start_with?("https://") &&
|
|
before.sub("http://", "") == after.sub("https://", "")
|
|
end
|
|
end
|