mirror of
https://github.com/discourse/discourse.git
synced 2024-11-23 15:05:24 +08:00
68d0916eb5
* FEATURE: Cache successful HTTP GET requests during Oneboxing Some oneboxes may fail if when making excessive and/or odd requests against the target domains. This change provides a simple mechanism to cache the results of succesful GET requests as part of the oneboxing process, with the goal of reducing repeated requests and ultimately improving the rate of successful oneboxing. To enable: Set `SiteSetting.cache_onebox_response_body` to `true` Add the domains you’re interesting in caching to `SiteSetting. cache_onebox_response_body_domains` e.g. `example.com|example.org|example.net` Optionally set `SiteSetting.cache_onebox_user_agent` to a user agent string of your choice to use when making requests against domains in the above list. * FIX: Swap order of duration and value in redis call The correct order for `setex` arguments is `key`, `duration`, and `value`. Duration and value had been flipped, however the code would not have thrown an error because we were caching the value of `1.day.to_i` for a period of 1 seconds… The intention appears to be to set a value of 1 (purely as a flag) for a period of 1 day.
451 lines
12 KiB
Ruby
451 lines
12 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require 'socket'
|
|
require 'ipaddr'
|
|
require 'excon'
|
|
require 'rate_limiter'
|
|
require 'url_helper'
|
|
|
|
# Determine the final endpoint for a Web URI, following redirects
|
|
class FinalDestination
|
|
|
|
def self.clear_https_cache!(domain)
|
|
key = redis_https_key(domain)
|
|
Discourse.redis.without_namespace.del(key)
|
|
end
|
|
|
|
def self.cache_https_domain(domain)
|
|
key = redis_https_key(domain)
|
|
Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1")
|
|
end
|
|
|
|
def self.is_https_domain?(domain)
|
|
key = redis_https_key(domain)
|
|
Discourse.redis.without_namespace.get(key).present?
|
|
end
|
|
|
|
def self.redis_https_key(domain)
|
|
"HTTPS_DOMAIN_#{domain}"
|
|
end
|
|
|
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
|
|
|
|
attr_reader :status, :cookie, :status_code, :ignored
|
|
|
|
def initialize(url, opts = nil)
|
|
@url = url
|
|
@uri = uri(escape_url) if @url
|
|
|
|
@opts = opts || {}
|
|
@force_get_hosts = @opts[:force_get_hosts] || []
|
|
@preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
|
|
@force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
|
|
@default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
|
|
@opts[:max_redirects] ||= 5
|
|
@opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
|
|
|
|
@ignored = @opts[:ignore_hostnames] || []
|
|
@limit = @opts[:max_redirects]
|
|
|
|
if @limit > 0
|
|
ignore_redirects = [Discourse.base_url_no_prefix]
|
|
|
|
if @opts[:ignore_redirects]
|
|
ignore_redirects.concat(@opts[:ignore_redirects])
|
|
end
|
|
|
|
ignore_redirects.each do |ignore_redirect|
|
|
ignore_redirect = uri(ignore_redirect)
|
|
if ignore_redirect.present? && ignore_redirect.hostname
|
|
@ignored << ignore_redirect.hostname
|
|
end
|
|
end
|
|
end
|
|
|
|
@status = :ready
|
|
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
|
|
@cookie = nil
|
|
@limited_ips = []
|
|
@verbose = @opts[:verbose] || false
|
|
@timeout = @opts[:timeout] || nil
|
|
@preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
|
|
@validate_uri = @opts.fetch(:validate_uri) { true }
|
|
@user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent
|
|
end
|
|
|
|
def self.connection_timeout
|
|
20
|
|
end
|
|
|
|
def timeout
|
|
@timeout || FinalDestination.connection_timeout
|
|
end
|
|
|
|
def redirected?
|
|
@limit < @opts[:max_redirects]
|
|
end
|
|
|
|
def request_headers
|
|
result = {
|
|
"User-Agent" => @user_agent,
|
|
"Accept" => "*/*",
|
|
"Host" => @uri.hostname
|
|
}
|
|
|
|
result['Cookie'] = @cookie if @cookie
|
|
|
|
result
|
|
end
|
|
|
|
def small_get(request_headers)
|
|
status_code, response_headers = nil
|
|
|
|
catch(:done) do
|
|
Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS)) do |http|
|
|
http.open_timeout = timeout
|
|
http.read_timeout = timeout
|
|
http.request_get(@uri.request_uri, request_headers) do |resp|
|
|
status_code = resp.code.to_i
|
|
response_headers = resp.to_hash
|
|
|
|
# see: https://bugs.ruby-lang.org/issues/15624
|
|
# if we allow response to return then body will be read
|
|
# got to abort without reading body
|
|
throw :done
|
|
end
|
|
end
|
|
end
|
|
|
|
[status_code, response_headers]
|
|
end
|
|
|
|
# this is a new interface for simply getting
|
|
# N bytes accounting for all internal logic
|
|
def get(uri = @uri, redirects = @limit, extra_headers: {}, &blk)
|
|
raise "Must specify block" unless block_given?
|
|
|
|
if uri && uri.port == 80 && FinalDestination.is_https_domain?(uri.hostname)
|
|
uri.scheme = "https"
|
|
uri = URI(uri.to_s)
|
|
end
|
|
|
|
return nil unless validate_uri
|
|
|
|
result, (location, cookie) = safe_get(uri, &blk)
|
|
|
|
if result == :redirect && (redirects == 0 || !location)
|
|
return nil
|
|
end
|
|
|
|
if result == :redirect
|
|
old_port = uri.port
|
|
location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
|
|
uri = uri(location)
|
|
|
|
# https redirect, so just cache that whole new domain is https
|
|
if old_port == 80 && uri&.port == 443 && (URI::HTTPS === uri)
|
|
FinalDestination.cache_https_domain(uri.hostname)
|
|
end
|
|
|
|
return nil if !uri
|
|
|
|
extra = nil
|
|
extra = { 'Cookie' => cookie } if cookie
|
|
|
|
get(uri, redirects - 1, extra_headers: extra, &blk)
|
|
elsif result == :ok
|
|
uri.to_s
|
|
else
|
|
nil
|
|
end
|
|
end
|
|
|
|
def resolve
|
|
if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
|
|
@uri.scheme = "https"
|
|
@uri = URI(@uri.to_s)
|
|
end
|
|
|
|
if @limit < 0
|
|
@status = :too_many_redirects
|
|
log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose
|
|
return nil
|
|
end
|
|
|
|
unless validate_uri
|
|
@status = :invalid_address
|
|
log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose
|
|
return nil
|
|
end
|
|
|
|
@ignored.each do |host|
|
|
if @uri&.hostname&.match?(host)
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
end
|
|
|
|
if Oneboxer.cached_response_body_exists?(@uri.to_s)
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
|
|
headers = request_headers
|
|
middlewares = Excon.defaults[:middlewares]
|
|
middlewares << Excon::Middleware::Decompress if @http_verb == :get
|
|
|
|
response = Excon.public_send(@http_verb,
|
|
@uri.to_s,
|
|
read_timeout: timeout,
|
|
headers: headers,
|
|
middlewares: middlewares
|
|
)
|
|
|
|
location = nil
|
|
response_headers = nil
|
|
response_status = response.status.to_i
|
|
|
|
case response.status
|
|
when 200
|
|
# Cache body of successful `get` requests
|
|
if @http_verb == :get
|
|
if Oneboxer.cache_response_body?(@uri)
|
|
Oneboxer.cache_response_body(@uri.to_s, response.body)
|
|
end
|
|
end
|
|
|
|
@status = :resolved
|
|
return @uri
|
|
when 400, 405, 406, 409, 501
|
|
response_status, small_headers = small_get(request_headers)
|
|
|
|
if response_status == 200
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
|
|
response_headers = {}
|
|
if cookie_val = small_headers['set-cookie']
|
|
response_headers[:cookies] = cookie_val
|
|
end
|
|
|
|
if location_val = small_headers['location']
|
|
response_headers[:location] = location_val.join
|
|
end
|
|
end
|
|
|
|
unless response_headers
|
|
response_headers = {
|
|
cookies: response.data[:cookies] || response.headers[:"set-cookie"],
|
|
location: response.headers[:location]
|
|
}
|
|
end
|
|
|
|
if (300..399).include?(response_status)
|
|
location = response_headers[:location]
|
|
end
|
|
|
|
if cookies = response_headers[:cookies]
|
|
@cookie = Array.wrap(cookies).map { |c| c.split(';').first.strip }.join('; ')
|
|
end
|
|
|
|
if location
|
|
redirect_uri = uri(location)
|
|
if @uri.host == redirect_uri.host && (redirect_uri.path =~ /\/login/ || redirect_uri.path =~ /\/session/)
|
|
@status = :resolved
|
|
return @uri
|
|
end
|
|
|
|
old_port = @uri.port
|
|
location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
|
|
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
|
@uri = uri(location)
|
|
@limit -= 1
|
|
|
|
# https redirect, so just cache that whole new domain is https
|
|
if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri)
|
|
FinalDestination.cache_https_domain(@uri.hostname)
|
|
end
|
|
|
|
return resolve
|
|
end
|
|
|
|
# this is weird an exception seems better
|
|
@status = :failure
|
|
@status_code = response.status
|
|
|
|
log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}") if @verbose
|
|
nil
|
|
rescue Excon::Errors::Timeout
|
|
log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}") if @verbose
|
|
nil
|
|
end
|
|
|
|
def validate_uri
|
|
!@validate_uri || (validate_uri_format && is_dest_valid?)
|
|
end
|
|
|
|
def validate_uri_format
|
|
return false unless @uri
|
|
return false unless ['https', 'http'].include?(@uri.scheme)
|
|
return false if @uri.scheme == 'http' && @uri.port != 80
|
|
return false if @uri.scheme == 'https' && @uri.port != 443
|
|
|
|
# Disallow IP based crawling
|
|
(IPAddr.new(@uri.hostname) rescue nil).nil?
|
|
end
|
|
|
|
def hostname
|
|
@uri.hostname
|
|
end
|
|
|
|
def hostname_matches?(url)
|
|
url = uri(url)
|
|
@uri && url.present? && @uri.hostname == url&.hostname
|
|
end
|
|
|
|
def is_dest_valid?
|
|
return false unless @uri && @uri.host
|
|
|
|
# Allowlisted hosts
|
|
return true if hostname_matches?(SiteSetting.Upload.s3_cdn_url) ||
|
|
hostname_matches?(GlobalSetting.try(:cdn_url)) ||
|
|
hostname_matches?(Discourse.base_url_no_prefix)
|
|
|
|
if SiteSetting.allowed_internal_hosts.present?
|
|
return true if SiteSetting.allowed_internal_hosts.split("|").any? { |h| h.downcase == @uri.hostname.downcase }
|
|
end
|
|
|
|
address_s = @opts[:lookup_ip].call(@uri.hostname)
|
|
return false unless address_s
|
|
|
|
address = IPAddr.new(address_s)
|
|
|
|
if private_ranges.any? { |r| r === address }
|
|
@status = :invalid_address
|
|
return false
|
|
end
|
|
|
|
# Rate limit how often this IP can be crawled
|
|
if !@opts[:skip_rate_limit] && !@limited_ips.include?(address)
|
|
@limited_ips << address
|
|
RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 1000, 1.hour).performed!
|
|
end
|
|
|
|
true
|
|
rescue RateLimiter::LimitExceeded
|
|
false
|
|
end
|
|
|
|
def escape_url
|
|
UrlHelper.escape_uri(@url)
|
|
end
|
|
|
|
def private_ranges
|
|
FinalDestination.standard_private_ranges +
|
|
SiteSetting.blocked_ip_blocks.split('|').map { |r| IPAddr.new(r) rescue nil }.compact
|
|
end
|
|
|
|
def log(log_level, message)
|
|
return if @status_code == 404
|
|
|
|
Rails.logger.public_send(
|
|
log_level,
|
|
"#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
|
|
)
|
|
end
|
|
|
|
def self.standard_private_ranges
|
|
@private_ranges ||= [
|
|
IPAddr.new('0.0.0.0/8'),
|
|
IPAddr.new('127.0.0.1'),
|
|
IPAddr.new('172.16.0.0/12'),
|
|
IPAddr.new('192.168.0.0/16'),
|
|
IPAddr.new('10.0.0.0/8'),
|
|
IPAddr.new('fc00::/7')
|
|
]
|
|
end
|
|
|
|
def self.lookup_ip(host)
|
|
if Rails.env.test?
|
|
"1.1.1.1"
|
|
else
|
|
IPSocket::getaddress(host)
|
|
end
|
|
rescue SocketError
|
|
nil
|
|
end
|
|
|
|
protected
|
|
|
|
def safe_get(uri)
|
|
result = nil
|
|
unsafe_close = false
|
|
|
|
safe_session(uri) do |http|
|
|
headers = request_headers.merge(
|
|
'Accept-Encoding' => 'gzip',
|
|
'Host' => uri.host
|
|
)
|
|
|
|
req = Net::HTTP::Get.new(uri.request_uri, headers)
|
|
|
|
http.request(req) do |resp|
|
|
if Net::HTTPRedirection === resp
|
|
result = :redirect, [resp['location'], resp['Set-Cookie']]
|
|
end
|
|
|
|
if Net::HTTPSuccess === resp
|
|
resp.decode_content = true
|
|
resp.read_body do |chunk|
|
|
read_next = true
|
|
|
|
catch(:done) do
|
|
if read_next
|
|
read_next = false
|
|
yield resp, chunk, uri
|
|
read_next = true
|
|
end
|
|
end
|
|
|
|
# no clean way of finishing abruptly cause
|
|
# response likes reading till the end
|
|
if !read_next
|
|
unsafe_close = true
|
|
http.finish
|
|
raise StandardError
|
|
end
|
|
end
|
|
result = :ok
|
|
else
|
|
catch(:done) do
|
|
yield resp, nil, nil
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
result
|
|
rescue StandardError
|
|
unsafe_close ? :ok : raise
|
|
end
|
|
|
|
def safe_session(uri)
|
|
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
|
|
http.read_timeout = timeout
|
|
http.open_timeout = timeout
|
|
yield http
|
|
end
|
|
end
|
|
|
|
private
|
|
|
|
def uri(location)
|
|
begin
|
|
URI.parse(location)
|
|
rescue URI::Error
|
|
end
|
|
end
|
|
|
|
end
|