discourse/lib/final_destination.rb
Vinoth Kannan d681decf01
FEATURE: use new site setting for onebox custom user agent. (#28045)
Previously, we couldn't change the user agent name dynamically for onebox requests. In this commit, a new hidden site setting `onebox_user_agent` is created to override the default user agent value specified in the [initializer](c333e9d6e6/config/initializers/100-onebox_options.rb (L15)).

Co-authored-by: Régis Hanol <regis@hanol.fr>
2024-07-24 04:45:30 +05:30

603 lines
17 KiB
Ruby

# frozen_string_literal: true
require "socket"
require "ipaddr"
require "excon"
require "rate_limiter"
require "url_helper"
# Determine the final endpoint for a Web URI, following redirects
class FinalDestination
# Error type for server-side request forgery failures; a SocketError subclass,
# so code rescuing SocketError also catches it.
# NOTE(review): nothing in the code visible here raises it - presumably it is
# raised by SSRF-protection code elsewhere; confirm.
class SSRFError < SocketError
end
# Raised by #normalized_url when UrlHelper.normalized_encode rejects the URL
# with an ArgumentError; the original message is carried over.
class UrlEncodingError < ArgumentError
end
# Elapsed-time budget (seconds since the request started) enforced by the
# streaming callback in #resolve; exceeding it aborts the request.
MAX_REQUEST_TIME_SECONDS = 10
# Upper bound on the response body accumulated by #resolve (5 MiB); exceeding
# it aborts the request.
MAX_REQUEST_SIZE_BYTES = 5_242_880 # 1024 * 1024 * 5
# Deletes the cached "this domain is served over https" marker for +domain+.
def self.clear_https_cache!(domain)
  Discourse.redis.without_namespace.del(redis_https_key(domain))
end
# Remembers for one day that +domain+ is served over https.
def self.cache_https_domain(domain)
  ttl_seconds = 1.day.to_i
  Discourse.redis.without_namespace.setex(redis_https_key(domain), ttl_seconds, "1")
end
# True when a non-blank https marker is cached for +domain+.
def self.is_https_domain?(domain)
  cached_marker = Discourse.redis.without_namespace.get(redis_https_key(domain))
  cached_marker.present?
end
# Builds the redis key under which the https marker for +domain+ is stored.
def self.redis_https_key(domain)
  format("HTTPS_DOMAIN_%s", domain)
end
# User-Agent used when the caller passes no :default_user_agent option and the
# target host is not listed in :force_custom_user_agent_hosts (see #initialize).
DEFAULT_USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
# Read-only state:
#   status       - :ready after construction; updated by #resolve.
#   cookie       - "name=value; ..." string built by #resolve from response cookies.
#   status_code  - HTTP status recorded by #resolve when it ends in :failure.
#   content_type - Content-Type header of a 200 response seen by #resolve.
#   ignored      - hostnames that #resolve treats as already resolved.
attr_reader :status, :cookie, :status_code, :content_type, :ignored
# Prepares a resolver for +url+.
#
# url  - the address to resolve; it is normalized via #normalized_url and
#        parsed into @uri (nil when +url+ is nil or does not parse).
# opts - optional Hash of settings. Keys read here: :force_get_hosts,
#        :preserve_fragment_url_hosts, :force_custom_user_agent_hosts,
#        :default_user_agent, :max_redirects (default 5),
#        :initial_https_redirect_ignore_limit, :include_port_in_host_header,
#        :ignore_redirects, :follow_canonical, :http_verb, :verbose, :timeout,
#        :validate_uri (default true), :stop_at_blocked_pages, :headers.
#        The Hash is only read, never modified.
#
# Raises UrlEncodingError when +url+ cannot be normalized.
def initialize(url, opts = nil)
  @url = url
  @uri = uri(normalized_url) if @url

  @opts = opts || {}
  @force_get_hosts = @opts[:force_get_hosts] || []
  @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
  @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
  @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
  @https_redirect_ignore_limit = @opts[:initial_https_redirect_ignore_limit]
  @include_port_in_host_header = @opts[:include_port_in_host_header] || false

  # Apply the default locally instead of `@opts[:max_redirects] ||= 5`: writing
  # into the caller's Hash leaked the default into option hashes shared between
  # calls and raised FrozenError when the Hash was frozen.
  @max_redirects = @opts[:max_redirects] || 5
  @limit = @max_redirects

  # Hosts whose redirects are not followed: always this site's own base URL,
  # plus anything the caller listed. Skipped when no redirects are allowed.
  @ignored = []
  if @limit > 0
    ignore_redirects = [Discourse.base_url_no_prefix]
    ignore_redirects.concat(@opts[:ignore_redirects]) if @opts[:ignore_redirects]
    ignore_redirects.each do |ignore_redirect|
      ignore_redirect = uri(ignore_redirect)
      @ignored << ignore_redirect.hostname if ignore_redirect.present? && ignore_redirect.hostname
    end
  end

  @status = :ready
  @follow_canonical = @opts[:follow_canonical]
  # An explicit :http_verb wins; otherwise it is derived from the host rules.
  @http_verb = @opts[:http_verb] || http_verb(@force_get_hosts, @follow_canonical)
  @cookie = nil
  @limited_ips = []
  @verbose = @opts[:verbose] || false
  @timeout = @opts[:timeout] || nil
  @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
  @validate_uri = @opts.fetch(:validate_uri) { true }

  @user_agent =
    if @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) }
      Onebox::Helpers.user_agent
    else
      @default_user_agent
    end

  @stop_at_blocked_pages = @opts[:stop_at_blocked_pages]
  @extra_headers = @opts[:headers]
end
# Default timeout, in seconds, used when no :timeout option was given.
def self.connection_timeout = 20
# Convenience wrapper: builds a resolver for +url+/+opts+ and runs #resolve.
def self.resolve(url, opts = nil)
  resolver = new(url, opts)
  resolver.resolve
end
# Chooses the request verb: :get when the canonical link must be followed or
# the current host matches one of +force_get_hosts+, otherwise :head.
def http_verb(force_get_hosts, follow_canonical)
  return :get if follow_canonical

  force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
end
# Timeout for this resolver: the :timeout option when given, otherwise the
# class-wide default.
def timeout
  return @timeout if @timeout

  FinalDestination.connection_timeout
end
# True once part of the redirect budget has been used.
def redirected? = @max_redirects > @limit
# Headers sent with each request: the defaults, then any caller-supplied
# :headers (which may override them), then the collected cookie if present.
def request_headers
  port_suffix = @include_port_in_host_header ? ":#{@uri.port}" : ""

  headers = {
    "User-Agent" => @user_agent,
    "Accept" => "*/*",
    "Accept-Language" => "*",
    "Host" => @uri.hostname + port_suffix,
  }

  headers.merge!(@extra_headers) if @extra_headers
  headers["Cookie"] = @cookie if @cookie
  headers
end
# Issues a GET to @uri and returns [status_code, headers] without reading the
# body. +headers+ is the Hash from the response's #to_hash.
def small_get(request_headers)
  code = nil
  header_hash = nil

  catch(:done) do
    connection_options = { use_ssl: @uri.is_a?(URI::HTTPS), open_timeout: timeout }
    FinalDestination::HTTP.start(@uri.host, @uri.port, **connection_options) do |http|
      http.read_timeout = timeout
      http.request_get(@uri.request_uri, request_headers) do |response|
        code = response.code.to_i
        header_hash = response.to_hash

        # see: https://bugs.ruby-lang.org/issues/15624
        # Letting the block return would make net/http read the body, so
        # unwind out of the request immediately instead.
        throw :done
      end
    end
  end

  [code, header_hash]
end
# Streams the body of the final destination to the caller's block, following
# redirects along the way.
#
# A block is required (RuntimeError "Must specify block" otherwise). It is
# handed to #safe_get, which yields (response, chunk, uri) for every body chunk
# of a successful response and (response, nil, nil) for other responses.
#
# redirects       - how many more redirects may be followed (defaults to @limit).
# extra_headers:  - NOTE(review): accepted, and on recursion it carries the
#                   Set-Cookie value as "Cookie", but it is never passed to
#                   #safe_get, so it currently has no effect on the request.
# except_headers: - header names removed from the outgoing request; replaced
#                   by ["Authorization"] once a redirect is followed.
#
# Returns the final URL as a String when #safe_get reports :ok; nil otherwise
# (invalid URI, blocked domain, missing or unparsable Location, redirect budget
# exhausted, or any other #safe_get result).
def get(redirects = @limit, extra_headers: {}, except_headers: [], &blk)
raise "Must specify block" unless block_given?
# Upgrade to https when a previous redirect showed this domain is https.
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
@uri.scheme = "https"
@uri = URI(@uri.to_s)
end
return if !validate_uri
return if @stop_at_blocked_pages && blocked_domain?(@uri)
# safe_get returns nil or a [result, headers_subset] pair; nil destructures
# into two nils and is caught by the guard below.
result, headers_subset = safe_get(@uri, except_headers:, &blk)
return if !result
cookie = headers_subset.set_cookie
location = headers_subset.location
if result == :redirect
return if !location
old_uri = @uri
# A path-only Location is resolved against the current scheme and host.
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
@uri = uri(location)
# The first redirect may be a plain http -> https upgrade of the same URL;
# when the caller opted in, that hop does not consume the redirect budget.
if @uri && redirects == @max_redirects && @https_redirect_ignore_limit &&
same_uri_but_https?(old_uri, @uri)
redirects += 1
@https_redirect_ignore_limit = false
end
return if redirects == 0
# https redirect, so just cache that whole new domain is https
if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
FinalDestination.cache_https_domain(@uri.hostname)
end
# uri(location) yields nil for an unparsable Location header.
return if !@uri
extra = nil
extra = { "Cookie" => cookie } if cookie
# Most HTTP clients strip the Authorization header on redirects, because the client could be redirected to an
# untrusted party. Keeping the header can also cause problems when the redirect target does not expect an
# Authorization header and therefore rejects the request.
except_headers = ["Authorization"]
get(redirects - 1, extra_headers: extra, except_headers:, &blk)
elsif result == :ok
@uri.to_s
else
nil
end
end
# Resolves @uri to its final destination, following redirects and, when
# :follow_canonical was requested, one canonical-link hop.
#
# Updates @status along the way: :resolved, :too_many_redirects,
# :invalid_address, :blocked_page or :failure (with @status_code set). On an
# Excon timeout the status is left as it was and nil is returned.
#
# Returns the final URI when resolved, otherwise nil.
def resolve
  # Upgrade to https when a previous redirect showed this domain is https.
  if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
    @uri.scheme = "https"
    @uri = URI(@uri.to_s)
  end

  if @limit < 0
    @status = :too_many_redirects
    log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}")
    return
  end

  unless validate_uri
    @status = :invalid_address
    log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}")
    return
  end

  # Hosts on the ignore list count as resolved without issuing a request.
  # NOTE(review): String#match? treats +host+ as a regular expression, so this
  # is an unanchored pattern match rather than an exact comparison - confirm
  # that is intended.
  @ignored.each do |host|
    if @uri&.hostname&.match?(host)
      @status = :resolved
      return @uri
    end
  end

  if Oneboxer.cached_response_body_exists?(@uri.to_s)
    @status = :resolved
    return @uri
  end

  headers = request_headers
  middlewares = Excon.defaults[:middlewares].dup
  middlewares << Excon::Middleware::Decompress if @http_verb == :get

  request_start_time = Time.now
  response_body = +""

  # Streaming callback: accumulates the body and aborts the request once the
  # size or elapsed-time budget is exceeded.
  request_validator =
    lambda do |chunk, _remaining_bytes, _total_bytes|
      response_body << chunk
      if response_body.bytesize > MAX_REQUEST_SIZE_BYTES
        raise Excon::Errors::ExpectationFailed.new("response size too big: #{@uri}")
      end
      if Time.now - request_start_time > MAX_REQUEST_TIME_SECONDS
        raise Excon::Errors::ExpectationFailed.new("connect timeout reached: #{@uri}")
      end
    end

  # This technique will only use the first resolved IP
  # TODO: Can we standardise this by using FinalDestination::HTTP?
  begin
    resolved_ip = SSRFDetector.lookup_and_filter_ips(@uri.hostname).first
  rescue SSRFDetector::DisallowedIpError, SocketError, Timeout::Error
    @status = :invalid_address
    return
  end

  # Connect to the vetted IP while keeping the original hostname for the Host
  # header and for TLS peer verification.
  request_uri = @uri.dup
  request_uri.hostname = resolved_ip unless Rails.env.test? # WebMock doesn't understand the IP-based requests

  response =
    Excon.public_send(
      @http_verb,
      request_uri.to_s,
      read_timeout: timeout,
      connect_timeout: timeout,
      headers: { "Host" => @uri.hostname }.merge(headers),
      middlewares: middlewares,
      response_block: request_validator,
      ssl_verify_peer_host: @uri.hostname,
    )

  if @stop_at_blocked_pages
    if blocked_domain?(@uri) || response.headers["Discourse-No-Onebox"] == "1"
      @status = :blocked_page
      return
    end
  end

  location = nil
  response_headers = nil
  response_status = response.status.to_i

  case response.status
  when 200
    # Cache body of successful `get` requests
    if @http_verb == :get
      if Oneboxer.cache_response_body?(@uri)
        Oneboxer.cache_response_body(@uri.to_s, response_body)
      end
    end

    if @follow_canonical
      next_url = fetch_canonical_url(response_body)

      if next_url.to_s.present? && next_url != @uri
        # Only one canonical hop is followed.
        @follow_canonical = false
        @uri = next_url
        @http_verb = http_verb(@force_get_hosts, @follow_canonical)

        return resolve
      end
    end

    @content_type = response.headers["Content-Type"] if response.headers.has_key?("Content-Type")
    @status = :resolved
    return @uri
  when 103, 400, 405, 406, 409, 500, 501
    # Retry these statuses with a body-less GET (see #small_get).
    response_status, small_headers = small_get(request_headers)

    if @stop_at_blocked_pages
      # this may seem weird, but the #to_hash method of the response object
      # of ruby's net/http lib returns a hash where each value is an array.
      # small_headers here is like that so our no onebox header value is an
      # array if it's set. Also the hash keys are always lower-cased.
      dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1"
      if dont_onebox || blocked_domain?(@uri)
        @status = :blocked_page
        return
      end
    end

    if response_status == 200
      @status = :resolved
      return @uri
    end

    response_headers = {}
    if cookie_val = small_headers["set-cookie"]
      response_headers[:cookies] = cookie_val
    end

    if location_val = small_headers["location"]
      response_headers[:location] = location_val.join
    end
  end

  unless response_headers
    response_headers = {
      cookies: response.data[:cookies] || response.headers[:"set-cookie"],
      location: response.headers[:location],
    }
  end

  location = response_headers[:location] if (300..399).include?(response_status)

  # Keep only the name=value part of each cookie for the next request.
  if cookies = response_headers[:cookies]
    @cookie = Array.wrap(cookies).map { |c| c.split(";").first.strip }.join("; ")
  end

  if location
    redirect_uri = uri(location)

    # `uri` returns nil for a Location header that does not parse. Report the
    # redirect as an invalid address instead of raising NoMethodError below.
    if redirect_uri.nil?
      @status = :invalid_address
      log(:warn, "FinalDestination could not resolve URL (invalid redirect location): #{@uri}")
      return
    end

    # A same-host redirect to a login/session page is not followed; the
    # current URI is reported as the destination.
    if @uri.host == redirect_uri.host &&
         (redirect_uri.path =~ %r{/login} || redirect_uri.path =~ %r{/session})
      @status = :resolved
      return @uri
    end

    old_uri = @uri
    location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
    # A path-only Location is resolved against the current scheme and host.
    location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
    @uri = uri(location)

    # The first redirect may be a plain http -> https upgrade of the same URL;
    # when the caller opted in, that hop does not consume the redirect budget.
    if @uri && @limit == @max_redirects && @https_redirect_ignore_limit &&
         same_uri_but_https?(old_uri, @uri)
      @limit += 1
      @https_redirect_ignore_limit = false
    end
    @limit -= 1

    # https redirect, so just cache that whole new domain is https
    if old_uri.port == 80 && @uri&.port == 443 && (URI::HTTPS === @uri)
      FinalDestination.cache_https_domain(@uri.hostname)
    end

    return resolve
  end

  # Returning nil here is odd; raising an exception would arguably be clearer.
  @status = :failure
  @status_code = response.status
  log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}")
  nil
rescue Excon::Errors::Timeout
  log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
  nil
end
# True when the caller disabled URI validation via validate_uri: false.
def skip_validations?
  @validate_uri ? false : true
end
# Passes when validation is switched off or the URI format is acceptable.
def validate_uri
  bypass = skip_validations?
  bypass || validate_uri_format
end
# Checks that @uri is an http(s) URL on an allowed port whose host is a name
# rather than an IP address.
def validate_uri_format
  return false if !@uri || !@uri.host
  return false unless %w[https http].include?(@uri.scheme)

  case @uri.scheme
  when "http"
    # In some cases (like local/test environments) http URLs on other ports
    # are allowed for internal hosts, but only when the host is explicitly
    # used for SiteSetting.s3_endpoint (local S3 providers like minio).
    # Every other http call must use port 80; see #http_port_ok?.
    return false unless http_port_ok?
  when "https"
    return false if @uri.port != 443
  end

  # Disallow IP based crawling: a hostname that parses as an IP is rejected.
  begin
    IPAddr.new(@uri.hostname)
    false
  rescue StandardError
    true
  end
end
# Decides whether a plain-http request to @uri's port is acceptable.
#
# Port 80 is always fine. Any other port is accepted only when
# SiteSetting.s3_endpoint is set and at least one entry of
# SiteSetting.allowed_internal_hosts (separated by "|" or newlines) satisfies
# #hostname_matches_s3_endpoint?.
#
# Returns true or false.
def http_port_ok?
  return true if @uri.port == 80

  # `to_s` turns a nil setting into "" so that it simply yields no hosts; the
  # earlier `&.` chain left nil in place and then crashed on `nil.empty?`.
  allowed_internal_hosts =
    SiteSetting.allowed_internal_hosts.to_s.split(/[|\n]/).filter_map { |aih| aih.strip.presence }
  return false if allowed_internal_hosts.empty? || SiteSetting.s3_endpoint.blank?

  allowed_internal_hosts.any? { |aih| hostname_matches_s3_endpoint?(aih) }
end
# Truthy when @uri's host matches +allowed_internal_host+, its port equals the
# S3 endpoint's port, and its hostname ends with the S3 endpoint's hostname.
def hostname_matches_s3_endpoint?(allowed_internal_host)
  endpoint = URI(SiteSetting.s3_endpoint)
  host_allowed = hostname_matches?("http://#{allowed_internal_host}")

  host_allowed && @uri.port == endpoint.port && @uri.hostname.end_with?(endpoint.hostname)
end
# Hostname of the URI currently being resolved.
def hostname = @uri.hostname
# Tests whether the host of @uri matches the host of +url+.
#
# url - a URL string. Its host may start with a "*" label as a wildcard:
#       "https://*.example.com" matches "example.com" itself and any
#       subdomain of it, and a lone "*" host matches every hostname.
#
# Returns true or false, or nil when either hostname is missing (including
# when +url+ does not parse).
def hostname_matches?(url)
  url = uri(url)
  return unless @uri&.hostname.present? && url&.hostname.present?

  hostname_parts = url.hostname.split(".")
  return @uri.hostname == url.hostname unless hostname_parts.first == "*"

  suffix = hostname_parts[1..].join(".")
  return true if suffix.empty?

  # Require a label boundary in front of the suffix. A bare `end_with?(suffix)`
  # let "*.example.com" match unrelated hosts such as "evilexample.com".
  @uri.hostname == suffix || @uri.hostname.end_with?(".#{suffix}")
end
# Returns @url after UrlHelper normalization. An ArgumentError from the helper
# is re-raised as UrlEncodingError with the same message.
def normalized_url
  begin
    UrlHelper.normalized_encode(@url)
  rescue ArgumentError => encoding_error
    raise UrlEncodingError, encoding_error.message
  end
end
# Writes +message+ to the Rails logger at +log_level+, prefixed with the
# current multisite database name. Does nothing unless :verbose was set, and
# stays silent when the recorded status code is 404.
def log(log_level, message)
  return if !@verbose || @status_code == 404

  tagged_message = "#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
  Rails.logger.public_send(log_level, tagged_message)
end
protected
# Performs one GET against +uri+ through #safe_session and streams the
# response to the caller's block.
#
# uri             - the URI to request.
# except_headers: - header names dropped from the outgoing request.
#
# Yields (response, chunk, uri) for each body chunk of a successful response,
# and (response, nil, nil) once for any other response. The block may
# `throw :done` to stop early.
#
# Returns [result, headers_subset], where result is :blocked, :redirect or :ok
# and headers_subset exposes #location and #set_cookie; returns nil when the
# response was neither blocked, a redirect nor a success, or after a timeout
# or SSL error.
def safe_get(uri, except_headers: [])
result = nil
unsafe_close = false
headers_subset = Struct.new(:location, :set_cookie).new
safe_session(uri) do |http|
headers =
request_headers.merge(
"Accept-Encoding" => "gzip",
"Host" => uri.hostname + (@include_port_in_host_header ? ":#{uri.port}" : ""),
).except(*except_headers)
req = FinalDestination::HTTP::Get.new(uri.request_uri, headers)
http.request(req) do |resp|
headers_subset.set_cookie = resp["Set-Cookie"]
if @stop_at_blocked_pages
dont_onebox = resp["Discourse-No-Onebox"] == "1"
if dont_onebox
# `a = x, y` assigns the two-element array [x, y].
result = :blocked, headers_subset
next
end
end
if Net::HTTPRedirection === resp
headers_subset.location = resp["location"]
result = :redirect, headers_subset
end
if Net::HTTPSuccess === resp
resp.decode_content = true
resp.read_body do |chunk|
# read_next stays false only when the caller's block threw :done
# part-way through, i.e. the caller wants no more chunks.
read_next = true
catch(:done) do
if read_next
read_next = false
yield resp, chunk, uri
read_next = true
end
end
# There is no clean way to stop reading abruptly, because the
# response wants to read to the end: close the connection and raise,
# then let the rescue below turn that into an :ok result.
if !read_next
unsafe_close = true
http.finish
raise StandardError
end
end
result = :ok, headers_subset
else
# Non-success responses (redirects included) are reported once, without a body.
catch(:done) { yield resp, nil, nil }
end
end
end
result
rescue Timeout::Error
log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}")
nil
rescue OpenSSL::SSL::SSLError => exception
log(:warn, "An error with SSL occurred: #{@uri} #{exception.message}")
nil
rescue StandardError
# Only the deliberate early close above is swallowed; anything else propagates.
unsafe_close ? [:ok, headers_subset] : raise
end
# Opens a FinalDestination::HTTP session to +uri+'s host and port (TLS when
# the scheme is https), applies the configured timeouts and yields the
# connection.
def safe_session(uri)
  session_options = { use_ssl: (uri.scheme == "https"), open_timeout: timeout }

  FinalDestination::HTTP.start(uri.host, uri.port, **session_options) do |connection|
    connection.read_timeout = timeout
    yield connection
  end
end
private
# Parses +location+ into a URI, returning nil when it is not a valid URI.
def uri(location)
  URI.parse(location)
rescue URI::Error
  nil
end
# Extracts the canonical link from an HTML +body+.
#
# Returns the parsed canonical URI, made absolute against @uri's scheme and
# host when the href has no host of its own; nil when the body is blank, has
# no canonical link, or the href does not parse.
def fetch_canonical_url(body)
  return if body.blank?

  link = Nokogiri.HTML5(body).at("link[rel='canonical']")
  return if link.nil?

  canonical = uri(link["href"])
  return if canonical.blank?
  return canonical if canonical.host.present?

  # Host-less href: glue it onto the current host, adding a "/" when needed.
  relative = canonical.to_s
  separator = relative.starts_with?("/") ? "" : "/"
  absolute = "#{@uri.host}#{separator}#{relative}"
  absolute = "#{@uri.scheme}://#{absolute}" if @uri.scheme
  uri(absolute)
end
# Asks Onebox::DomainChecker whether +uri+'s hostname is blocked.
def blocked_domain?(uri)
  host = uri.hostname
  Onebox::DomainChecker.is_blocked?(host)
end
# True when +after+ is exactly +before+ with its "http://" scheme swapped for
# "https://". Both arguments are compared by their string form.
def same_uri_but_https?(before, after)
  insecure = before.to_s
  secure = after.to_s
  return false unless insecure.start_with?("http://") && secure.start_with?("https://")

  insecure.delete_prefix("http://") == secure.delete_prefix("https://")
end
end