2019-05-03 06:17:27 +08:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2017-10-17 13:22:38 +08:00
|
|
|
require 'socket'
|
|
|
|
require 'ipaddr'
|
2017-05-23 00:23:04 +08:00
|
|
|
require 'excon'
|
2017-05-24 03:03:04 +08:00
|
|
|
require 'rate_limiter'
|
2017-12-13 00:50:39 +08:00
|
|
|
require 'url_helper'
|
2017-05-23 00:23:04 +08:00
|
|
|
|
|
|
|
# Determine the final endpoint for a Web URI, following redirects
|
|
|
|
class FinalDestination
|
2022-02-14 09:11:09 +08:00
|
|
|
# Hard caps applied to every streamed response while resolving a URL;
# exceeding either aborts the request (see the response_block in #resolve).
MAX_REQUEST_TIME_SECONDS = 10
MAX_REQUEST_SIZE_BYTES = 1_048_576 # 1024 * 1024
|
2017-05-23 00:23:04 +08:00
|
|
|
|
2017-10-17 13:22:38 +08:00
|
|
|
# Drop the cached "this domain is reachable over HTTPS" marker for +domain+.
def self.clear_https_cache!(domain)
  Discourse.redis.without_namespace.del(redis_https_key(domain))
end
|
|
|
|
|
|
|
|
# Remember for one day that +domain+ redirects plain HTTP to HTTPS, so
# future lookups can skip the insecure hop (see #resolve and #get).
def self.cache_https_domain(domain)
  Discourse.redis.without_namespace.setex(redis_https_key(domain), 1.day.to_i, "1")
end
|
|
|
|
|
|
|
|
# True when +domain+ was previously cached as HTTPS-capable.
def self.is_https_domain?(domain)
  Discourse.redis.without_namespace.get(redis_https_key(domain)).present?
end
|
|
|
|
|
|
|
|
# Un-namespaced redis key used to mark +domain+ as HTTPS-capable.
def self.redis_https_key(domain)
  "HTTPS_DOMAIN_" + domain.to_s
end
|
|
|
|
|
2021-04-01 01:19:34 +08:00
|
|
|
# Safari-like user agent sent unless the caller supplies :default_user_agent
# or the host is in :force_custom_user_agent_hosts (see #initialize).
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"

attr_reader :status, :cookie, :status_code, :content_type, :ignored
|
2017-05-23 00:23:04 +08:00
|
|
|
|
2017-05-23 23:51:23 +08:00
|
|
|
# Build a resolver for +url+.
#
# url  - String URL whose final destination we want.
# opts - optional Hash of settings:
#   :force_get_hosts               - hosts that must be probed with GET, not HEAD
#   :preserve_fragment_url_hosts   - hosts whose #fragment survives redirects
#   :force_custom_user_agent_hosts - hosts fetched with the Onebox user agent
#   :default_user_agent            - UA used otherwise (DEFAULT_USER_AGENT)
#   :max_redirects                 - redirect budget, defaults to 5
#   :ignore_redirects              - extra hosts whose redirects stop resolution
#   :follow_canonical              - follow <link rel="canonical"> on 200s
#   :verbose                       - log resolution failures
#   :timeout                       - per-request socket timeout (seconds)
#   :validate_uri                  - run URI format validation (default true)
def initialize(url, opts = nil)
  @url = url
  # Escape first so URI.parse (inside #uri) doesn't reject raw characters.
  @uri = uri(escape_url) if @url

  @opts = opts || {}
  @force_get_hosts = @opts[:force_get_hosts] || []
  @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
  @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
  @default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
  @opts[:max_redirects] ||= 5

  @limit = @opts[:max_redirects]

  # Hostnames whose redirects terminate resolution immediately (our own
  # base URL plus any caller-supplied :ignore_redirects entries).
  @ignored = []
  if @limit > 0
    ignore_redirects = [Discourse.base_url_no_prefix]

    if @opts[:ignore_redirects]
      ignore_redirects.concat(@opts[:ignore_redirects])
    end

    ignore_redirects.each do |ignore_redirect|
      ignore_redirect = uri(ignore_redirect)
      if ignore_redirect.present? && ignore_redirect.hostname
        @ignored << ignore_redirect.hostname
      end
    end
  end

  @status = :ready
  @follow_canonical = @opts[:follow_canonical]
  @http_verb = http_verb(@force_get_hosts, @follow_canonical)
  @cookie = nil
  @limited_ips = []
  @verbose = @opts[:verbose] || false
  @timeout = @opts[:timeout] || nil
  @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
  @validate_uri = @opts.fetch(:validate_uri) { true }
  @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent
end
|
|
|
|
|
|
|
|
# Default socket timeout (seconds) used when no :timeout option was given.
def self.connection_timeout
  20
end
|
|
|
|
|
2022-11-02 00:34:12 +08:00
|
|
|
# One-shot convenience wrapper: resolve +url+ with default options.
def self.resolve(url)
  self.new(url).resolve
end
|
|
|
|
|
2021-10-01 23:48:21 +08:00
|
|
|
# Choose the HTTP method for the initial probe: GET when we need the body
# (canonical-URL following, or a force-GET host), HEAD otherwise.
def http_verb(force_get_hosts, follow_canonical)
  return :get if follow_canonical
  return :get if force_get_hosts.any? { |host| hostname_matches?(host) }

  :head
end
|
|
|
|
|
2018-01-29 12:36:52 +08:00
|
|
|
# Effective socket timeout: the per-instance override, else the class default.
def timeout
  return @timeout if @timeout

  FinalDestination.connection_timeout
end
|
|
|
|
|
2017-05-23 00:23:04 +08:00
|
|
|
# True once at least one redirect has been consumed from the budget.
def redirected?
  @opts[:max_redirects] > @limit
end
|
|
|
|
|
|
|
|
# Headers sent with every outbound request. A Cookie header is included
# once a previous response in this resolution set one.
def request_headers
  headers = {
    "User-Agent" => @user_agent,
    "Accept" => "*/*",
    "Accept-Language" => "*",
    "Host" => @uri.hostname
  }

  headers['Cookie'] = @cookie unless @cookie.nil?
  headers
end
|
|
|
|
|
2019-03-09 04:36:49 +08:00
|
|
|
# Issue a GET but read only the status line and headers, aborting before the
# body is consumed. Used as a fallback when HEAD is rejected (see #resolve).
# Returns [status_code, response_headers_hash] (both nil on no response).
def small_get(request_headers)
  status_code, response_headers = nil

  catch(:done) do
    FinalDestination::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS), open_timeout: timeout) do |http|
      http.read_timeout = timeout
      http.request_get(@uri.request_uri, request_headers) do |resp|
        status_code = resp.code.to_i
        response_headers = resp.to_hash

        # see: https://bugs.ruby-lang.org/issues/15624
        # if we allow response to return then body will be read
        # got to abort without reading body
        throw :done
      end
    end
  end

  [status_code, response_headers]
end
|
|
|
|
|
2018-01-29 12:36:52 +08:00
|
|
|
# Streaming interface: fetch +uri+ and yield (response, chunk, uri) to the
# block, following up to +redirects+ redirects, and return the final URL
# String (nil on failure / redirect loop / invalid URI).
#
# NOTE(review): +extra_headers+ is accepted and threaded through the
# recursive call (carrying the redirect's Set-Cookie), but it is never
# forwarded to safe_get, so cookies are not actually re-sent — confirm
# whether this is intentional before relying on cookie propagation here.
def get(uri = @uri, redirects = @limit, extra_headers: {}, &blk)
  raise "Must specify block" unless block_given?

  # If we already know this domain is HTTPS-capable, upgrade before fetching.
  if uri && uri.port == 80 && FinalDestination.is_https_domain?(uri.hostname)
    uri.scheme = "https"
    uri = URI(uri.to_s)
  end

  return nil unless validate_uri

  result, (location, cookie) = safe_get(uri, &blk)

  # Out of budget, or a redirect with no Location header: give up.
  if result == :redirect && (redirects == 0 || !location)
    return nil
  end

  if result == :redirect
    old_port = uri.port
    # Relative redirect: resolve against the current scheme/host.
    location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
    # `uri(...)` with parens calls the private #uri parser, then rebinds the
    # local `uri` to the parsed result.
    uri = uri(location)

    # https redirect, so just cache that whole new domain is https
    if old_port == 80 && uri&.port == 443 && (URI::HTTPS === uri)
      FinalDestination.cache_https_domain(uri.hostname)
    end

    return nil if !uri

    extra = nil
    extra = { 'Cookie' => cookie } if cookie

    get(uri, redirects - 1, extra_headers: extra, &blk)
  elsif result == :ok
    uri.to_s
  else
    nil
  end
end
|
|
|
|
|
2017-05-23 00:23:04 +08:00
|
|
|
# Follow redirects (and optionally canonical links) until the final URI is
# found. Returns the final URI on success and sets @status to :resolved;
# returns nil and sets @status / @status_code on failure.
def resolve
  # Upgrade to HTTPS up front when the domain was previously cached as
  # HTTPS-capable.
  if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
    @uri.scheme = "https"
    @uri = URI(@uri.to_s)
  end

  # Redirect budget exhausted (each recursion below decrements @limit).
  if @limit < 0
    @status = :too_many_redirects
    log(:warn, "FinalDestination could not resolve URL (too many redirects): #{@uri}") if @verbose
    return nil
  end

  unless validate_uri
    @status = :invalid_address
    log(:warn, "FinalDestination could not resolve URL (invalid URI): #{@uri}") if @verbose
    return nil
  end

  # Hosts in @ignored (our own base URL etc.) terminate resolution as-is.
  @ignored.each do |host|
    if @uri&.hostname&.match?(host)
      @status = :resolved
      return @uri
    end
  end

  # A cached Onebox body means we already resolved this URL recently.
  if Oneboxer.cached_response_body_exists?(@uri.to_s)
    @status = :resolved
    return @uri
  end

  headers = request_headers
  middlewares = Excon.defaults[:middlewares]
  middlewares << Excon::Middleware::Decompress if @http_verb == :get

  # Stream the response, enforcing the size/time hard caps as chunks arrive.
  request_start_time = Time.now
  response_body = +""
  request_validator = lambda do |chunk, _remaining_bytes, _total_bytes|
    response_body << chunk
    raise Excon::Errors::ExpectationFailed.new("response size too big: #{@uri.to_s}") if response_body.bytesize > MAX_REQUEST_SIZE_BYTES
    raise Excon::Errors::ExpectationFailed.new("connect timeout reached: #{@uri.to_s}") if Time.now - request_start_time > MAX_REQUEST_TIME_SECONDS
  end

  # This technique will only use the first resolved IP
  # TODO: Can we standardise this by using FinalDestination::HTTP?
  begin
    resolved_ip = SSRFDetector.lookup_and_filter_ips(@uri.hostname).first
  rescue SSRFDetector::DisallowedIpError, SocketError, Timeout::Error
    @status = :invalid_address
    return
  end
  # Connect to the pre-resolved IP (SSRF-filtered) while keeping the Host
  # header and TLS verification bound to the original hostname.
  request_uri = @uri.dup
  request_uri.hostname = resolved_ip unless Rails.env.test? # WebMock doesn't understand the IP-based requests

  response = Excon.public_send(@http_verb,
    request_uri.to_s,
    read_timeout: timeout,
    connect_timeout: timeout,
    headers: { "Host" => @uri.hostname }.merge(headers),
    middlewares: middlewares,
    response_block: request_validator,
    ssl_verify_peer_host: @uri.hostname
  )

  location = nil
  response_headers = nil
  response_status = response.status.to_i

  case response.status
  when 200
    # Cache body of successful `get` requests
    if @http_verb == :get
      if Oneboxer.cache_response_body?(@uri)
        Oneboxer.cache_response_body(@uri.to_s, response_body)
      end
    end

    # Optionally hop once to the page's <link rel="canonical"> target.
    if @follow_canonical
      next_url = fetch_canonical_url(response_body)

      if next_url.to_s.present? && next_url != @uri
        @follow_canonical = false
        @uri = next_url
        @http_verb = http_verb(@force_get_hosts, @follow_canonical)

        return resolve
      end
    end

    @content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type')
    @status = :resolved
    return @uri
  when 103, 400, 405, 406, 409, 500, 501
    # Some servers reject HEAD; retry with a headers-only GET.
    response_status, small_headers = small_get(request_headers)

    if response_status == 200
      @status = :resolved
      return @uri
    end

    # Net::HTTP#to_hash yields lowercase keys with array values.
    response_headers = {}
    if cookie_val = small_headers['set-cookie']
      response_headers[:cookies] = cookie_val
    end

    if location_val = small_headers['location']
      response_headers[:location] = location_val.join
    end
  end

  # Fall back to the Excon response's own cookie/location headers.
  unless response_headers
    response_headers = {
      cookies: response.data[:cookies] || response.headers[:"set-cookie"],
      location: response.headers[:location]
    }
  end

  if (300..399).include?(response_status)
    location = response_headers[:location]
  end

  # Carry cookies across redirects (name=value pairs only, attributes dropped).
  if cookies = response_headers[:cookies]
    @cookie = Array.wrap(cookies).map { |c| c.split(';').first.strip }.join('; ')
  end

  if location
    # A same-host redirect into /login or /session means the content is
    # behind auth; treat the current URI as final.
    redirect_uri = uri(location)
    if @uri.host == redirect_uri.host && (redirect_uri.path =~ /\/login/ || redirect_uri.path =~ /\/session/)
      @status = :resolved
      return @uri
    end

    old_port = @uri.port
    location = "#{location}##{@uri.fragment}" if @preserve_fragment_url && @uri.fragment.present?
    location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
    @uri = uri(location)
    @limit -= 1

    # https redirect, so just cache that whole new domain is https
    if old_port == 80 && @uri.port == 443 && (URI::HTTPS === @uri)
      FinalDestination.cache_https_domain(@uri.hostname)
    end

    return resolve
  end

  # this is weird an exception seems better
  @status = :failure
  @status_code = response.status

  log(:warn, "FinalDestination could not resolve URL (status #{response.status}): #{@uri}") if @verbose
  nil
rescue Excon::Errors::Timeout
  log(:warn, "FinalDestination could not resolve URL (timeout): #{@uri}") if @verbose
  nil
end
|
|
|
|
|
2022-11-02 00:34:12 +08:00
|
|
|
# True when the caller opted out of URI validation (:validate_uri => false).
def skip_validations?
  @validate_uri ? false : true
end
|
|
|
|
|
2017-05-23 00:23:04 +08:00
|
|
|
# URI acceptance gate: passes trivially when validations are skipped,
# otherwise defers to the format checks.
def validate_uri
  return true if skip_validations?

  validate_uri_format
end
|
|
|
|
|
|
|
|
# True when @uri is a well-formed http(s) URL on its default port whose
# hostname is a name, not a bare IP address.
def validate_uri_format
  return false if @uri.nil? || @uri.host.nil?

  scheme = @uri.scheme
  return false unless scheme == 'http' || scheme == 'https'

  expected_port = scheme == 'http' ? 80 : 443
  return false unless @uri.port == expected_port

  # Disallow IP based crawling: reject hostnames that parse as IP addresses.
  begin
    IPAddr.new(@uri.hostname)
    false
  rescue StandardError
    true
  end
end
|
|
|
|
|
2020-11-19 01:55:16 +08:00
|
|
|
# Hostname component of the URI currently being resolved.
def hostname
  @uri.hostname
end
|
|
|
|
|
2017-06-13 01:22:29 +08:00
|
|
|
# True when @uri's hostname matches +url+'s hostname. A hostname whose
# first label is "*" acts as a wildcard: "*.example.com" matches any
# hostname ending in ".example.com". Returns nil when either side lacks
# a hostname.
def hostname_matches?(url)
  candidate = uri(url)

  return unless @uri&.hostname.present?
  return unless candidate&.hostname.present?

  labels = candidate.hostname.split('.')
  if labels.first == '*'
    @uri.hostname.end_with?(labels[1..-1].join('.'))
  else
    @uri.hostname == candidate.hostname
  end
end
|
2017-05-24 04:32:54 +08:00
|
|
|
|
2017-07-30 00:42:04 +08:00
|
|
|
# @url with unsafe characters percent-escaped so it can be parsed as a URI.
def escape_url
  UrlHelper.escape_uri(@url)
end
|
|
|
|
|
2017-10-31 19:08:34 +08:00
|
|
|
# Log +message+ through Rails at +log_level+, prefixed with the current
# multisite database name. 404s are expected noise and are never logged.
def log(log_level, message)
  return if @status_code == 404

  db = RailsMultisite::ConnectionManagement.current_db
  Rails.logger.public_send(log_level, "#{db}: #{message}")
end
|
|
|
|
|
2018-01-29 12:36:52 +08:00
|
|
|
protected
|
|
|
|
|
|
|
|
# GET +uri+ inside a safe session, streaming body chunks to the caller's
# block as (response, chunk, uri). Returns :ok on success, or
# [:redirect, [location, set_cookie]] on a redirect, or nil.
#
# The caller's block may `throw :done` to stop reading early; since
# Net::HTTP insists on reading the body to the end, the only way to abort
# is to close the connection and raise, which the rescue below converts
# back into a normal :ok return (the `unsafe_close` flag distinguishes
# this deliberate abort from a genuine error).
def safe_get(uri)
  result = nil
  unsafe_close = false

  safe_session(uri) do |http|
    headers = request_headers.merge(
      'Accept-Encoding' => 'gzip',
      'Host' => uri.host
    )

    req = FinalDestination::HTTP::Get.new(uri.request_uri, headers)

    http.request(req) do |resp|
      if Net::HTTPRedirection === resp
        result = :redirect, [resp['location'], resp['Set-Cookie']]
      end

      if Net::HTTPSuccess === resp
        resp.decode_content = true
        resp.read_body do |chunk|
          read_next = true

          # `read_next` stays false iff the block threw :done mid-yield.
          catch(:done) do
            if read_next
              read_next = false
              yield resp, chunk, uri
              read_next = true
            end
          end

          # no clean way of finishing abruptly cause
          # response likes reading till the end
          if !read_next
            unsafe_close = true
            http.finish
            raise StandardError
          end
        end
        result = :ok
      else
        # Non-success, non-redirect: give the block one look at the
        # response (no chunk) so it can inspect status/headers.
        catch(:done) do
          yield resp, nil, nil
        end
      end
    end
  end

  result
rescue StandardError
  # A raise triggered by our own deliberate close is success, not failure.
  unsafe_close ? :ok : raise
end
|
|
|
|
|
|
|
|
# Open an SSRF-guarded HTTP session to +uri+'s host (TLS when the scheme
# is https) with this instance's timeouts, and yield the connection.
def safe_session(uri)
  FinalDestination::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https"), open_timeout: timeout) do |http|
    http.read_timeout = timeout
    yield http
  end
end
|
|
|
|
|
2018-03-28 16:20:08 +08:00
|
|
|
private
|
|
|
|
|
|
|
|
# Parse +location+ into a URI, returning nil when it is not parseable.
def uri(location)
  URI.parse(location)
rescue URI::Error
  nil
end
|
|
|
|
|
2021-10-01 23:48:21 +08:00
|
|
|
# Extract the <link rel="canonical"> target from an HTML +body+.
# Returns a URI (absolute when possible) or nil when there is no usable
# canonical link.
#
# NOTE(review): relative canonical hrefs that do not start with "/" are
# joined as "host/href", ignoring the current page's directory — this is
# not full RFC 3986 relative resolution; confirm whether that is acceptable
# for the canonical links encountered in practice.
def fetch_canonical_url(body)
  return if body.blank?

  canonical_element = Nokogiri::HTML5(body).at("link[rel='canonical']")
  return if canonical_element.nil?
  canonical_uri = uri(canonical_element['href'])
  return if canonical_uri.blank?

  # Already absolute: use as-is.
  return canonical_uri if canonical_uri.host.present?
  # Otherwise graft the relative href onto the current URI's host/scheme.
  parts = [@uri.host, canonical_uri.to_s]
  complete_url = canonical_uri.to_s.starts_with?('/') ? parts.join('') : parts.join('/')
  complete_url = "#{@uri.scheme}://#{complete_url}" if @uri.scheme
  uri(complete_url)
end
|
2017-05-23 00:23:04 +08:00
|
|
|
end
|