2017-05-23 00:23:04 +08:00
|
|
|
require "socket"
|
|
|
|
require "ipaddr"
|
|
|
|
require 'excon'
|
2017-05-24 03:03:04 +08:00
|
|
|
require 'rate_limiter'
|
2017-05-23 00:23:04 +08:00
|
|
|
|
|
|
|
# Determine the final endpoint for a Web URI, following redirects.
#
# Build an instance with a URL, call #resolve, then inspect #status, which is
# one of :ready, :resolved, :too_many_redirects or :invalid_address.
#
# Only http on port 80 and https on port 443 are accepted, hosts given as IP
# literals are rejected, and every hop is refused when its hostname resolves
# to a private or blacklisted address.
class FinalDestination
  # Redirect budget used when opts[:max_redirects] is not supplied.
  DEFAULT_MAX_REDIRECTS = 5

  # The only scheme/port pairs that may be crawled.
  STANDARD_PORTS = { 'http' => 80, 'https' => 443 }.freeze

  attr_reader :status

  # url  - String (or URI) to resolve; an unparsable value leaves the
  #        instance without a URI, so #resolve returns nil.
  # opts - optional Hash:
  #        :max_redirects - redirects allowed before giving up (default 5)
  #        :lookup_ip     - callable taking a hostname and returning an IP
  #                         address String, or nil when it cannot be resolved
  def initialize(url, opts = nil)
    @uri = parse_uri(url)

    # Copy so the defaults filled in below never leak into the caller's hash.
    @opts = (opts || {}).dup
    @opts[:max_redirects] ||= DEFAULT_MAX_REDIRECTS
    @opts[:lookup_ip] ||= lambda do |host|
      begin
        FinalDestination.lookup_ip(host)
      rescue SocketError
        nil
      end
    end

    @limit = @opts[:max_redirects]
    @status = :ready
  end

  # True once at least one redirect has been followed.
  def redirected?
    @limit < @opts[:max_redirects]
  end

  # Headers sent with every HEAD request (browser-like User-Agent).
  def request_headers
    { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
      "Accept" => "text/html",
      "Host" => @uri.hostname }
  end

  # Issues HEAD requests and follows Location headers until a 200 (or 405)
  # response is reached.
  #
  # Returns the final URI, or nil when the URI is invalid, the redirect budget
  # is exhausted, or a response is neither 200/405 nor a usable redirect.
  # Exceptions raised by Excon are not rescued here.
  def resolve
    if @limit < 0
      @status = :too_many_redirects
      return nil
    end

    return nil unless validate_uri

    # NOTE(review): the address vetted by validate_uri comes from a separate
    # lookup; Excon presumably resolves the hostname again - confirm whether
    # DNS rebinding needs to be handled.
    head = Excon.head(@uri.to_s, read_timeout: 20, headers: request_headers)

    # If the site does not allow HEAD, just try the url
    return @uri if head.status == 405

    if head.status == 200
      @status = :resolved
      return @uri
    end

    location = FinalDestination.header_for(head, 'location').to_s.strip
    return nil if location.empty?

    # An unusable Location leaves @uri nil, which fails validation on the
    # next pass and ends the walk with nil.
    @uri = redirect_target(location)
    @limit -= 1
    resolve
  end

  # True when the current URI is well formed and its destination is allowed.
  def validate_uri
    validate_uri_format && is_dest_valid?
  end

  # True when the current URI is http(s) on its standard port and its host is
  # a name rather than an IP literal.
  def validate_uri_format
    return false unless @uri

    expected_port = STANDARD_PORTS[@uri.scheme]
    return false unless expected_port && @uri.port == expected_port

    # Disallow IP based crawling
    !ip_literal?(@uri.hostname)
  end

  # True when the current host resolves to an address outside every private
  # or blacklisted range and the per-address rate limit has not been hit.
  # Sets status to :invalid_address when the address is rejected.
  def is_dest_valid?
    return false unless @uri && @uri.host

    address_s = @opts[:lookup_ip].call(@uri.hostname)
    return false unless address_s

    # Fold IPv4-mapped IPv6 answers (::ffff:a.b.c.d) back to IPv4 so they are
    # compared against the IPv4 ranges regardless of the ipaddr version.
    address = IPAddr.new(address_s).native

    if private_ranges.any? { |range| range.include?(address) }
      @status = :invalid_address
      return false
    end

    # Rate limit how often this IP can be crawled
    RateLimiter.new(nil, "crawl-destination-ip:#{address_s}", 100, 1.hour).performed!

    true
  rescue IPAddr::Error
    # The lookup returned something that is not an IP address: fail closed.
    @status = :invalid_address
    false
  rescue RateLimiter::LimitExceeded
    false
  end

  # Standard private ranges plus the '|'-separated blocks from
  # SiteSetting.blacklist_ip_blocks; entries that do not parse are skipped.
  def private_ranges
    blacklisted = SiteSetting.blacklist_ip_blocks.to_s.split('|').map do |block|
      parse_ip_block(block)
    end

    FinalDestination.standard_private_ranges + blacklisted.compact
  end

  # Address ranges that must never be crawled: loopback, RFC 1918 private
  # space, link-local, "this network", the unspecified IPv6 address and IPv6
  # unique-local. Memoized on the class and frozen.
  def self.standard_private_ranges
    @private_ranges ||= [
      IPAddr.new('0.0.0.0/8'),
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('127.0.0.0/8'),
      IPAddr.new('169.254.0.0/16'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('::'),
      IPAddr.new('::1'),
      IPAddr.new('fc00::/7'),
      IPAddr.new('fe80::/10')
    ].freeze
  end

  # Resolves host to an IP address String via IPSocket.getaddress.
  # The SocketError it raises on failure is left to the caller.
  def self.lookup_ip(host)
    IPSocket.getaddress(host)
  end

  # Returns the value of the response header called name (compared
  # case-insensitively), or nil when the header is absent.
  def self.header_for(head, name)
    wanted = name.to_s.downcase
    _key, value = head.headers.find { |key, _| key.to_s.downcase == wanted }
    value
  end

  private

  # Parses url into a URI; nil when it cannot be parsed. The rescue is
  # deliberately broad because the input is untrusted.
  def parse_uri(url)
    URI(url)
  rescue StandardError
    nil
  end

  # Resolves a Location header against the current URI, so absolute,
  # protocol-relative (//host/path), absolute-path and relative-path forms
  # are all handled. Returns nil when the value is unusable.
  def redirect_target(location)
    @uri.merge(location)
  rescue StandardError
    nil
  end

  # True when host parses as an IP address.
  def ip_literal?(host)
    IPAddr.new(host)
    true
  rescue StandardError
    false
  end

  # Parses one blacklist entry into an IPAddr; nil when it is not valid.
  def parse_ip_block(block)
    IPAddr.new(block.strip)
  rescue StandardError
    nil
  end
end
|