Helper to find the final destination for a URL

This commit is contained in:
Robin Ward 2017-05-22 12:23:04 -04:00
parent 6231318462
commit b23fc2bf84
4 changed files with 293 additions and 81 deletions

View File

@ -1,55 +1,13 @@
require 'open-uri'
require 'nokogiri'
require 'excon'
require 'final_destination'
module Jobs
class CrawlTopicLink < Jobs::Base
# Signal raised to abort a download once enough of the body has been read.
class ReadEnough < StandardError
end
# Fetch a response header without caring how the server capitalised its name.
# `name` is compared against the lowercased header keys, so pass it in
# lowercase. Returns the header value, or nil when no header matches.
def self.header_for(head, name)
  head.headers.each do |key, value|
    return value if name == key.downcase
  end
  nil
end
# Build the headers sent with every crawl request: a browser-like User-Agent,
# an HTML Accept header, and a Host header taken from the given URI.
def self.request_headers(uri)
  headers = {}
  headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
  headers["Accept"] = "text/html"
  headers["Host"] = uri.host
  headers
end
# Follow any redirects that might exist.
#
# Issues a HEAD request for `url` and follows `Location` headers for at most
# `limit` further hops.
#
# Returns the URI when the server answers 200 with a text/html content type,
# or when it rejects HEAD with a 405 (the caller then simply tries the URL).
# Returns nil when the URL is malformed, has no host, is not http/https on
# the default port of its scheme, points at non-HTML content, yields no
# usable redirect, or the redirect limit is exhausted.
def self.final_uri(url, limit=5)
  return if limit < 0

  uri = begin
    URI(url)
  rescue URI::InvalidURIError, ArgumentError
    # A malformed URL (typically a bad Location header) cannot be crawled;
    # treat it like any other unusable destination instead of raising.
    nil
  end
  return if uri.blank? || uri.host.blank?

  # Only http on port 80 and https on port 443 are allowed. Checking scheme
  # and port together rejects mismatched pairs such as http://host:443.
  expected_port = { 'http' => 80, 'https' => 443 }[uri.scheme]
  return unless expected_port && uri.port == expected_port

  headers = CrawlTopicLink.request_headers(uri)
  head = Excon.head(url, read_timeout: 20, headers: headers)

  # If the site does not allow HEAD, just try the url
  return uri if head.status == 405

  if head.status == 200
    return uri if header_for(head, 'content-type') =~ /text\/html/
    return nil
  end

  location = header_for(head, 'location')
  return unless location

  # Absolute-path redirects are relative to the host we just asked
  location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
  final_uri(location, limit - 1)
end
def self.max_chunk_size(uri)
# Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
# one host but amazon is a big one.
@ -64,7 +22,8 @@ module Jobs
# Never crawl in test mode
return if Rails.env.test?
uri = final_uri(url)
fd = FinalDestination.new(url)
uri = fd.resolve
return "" unless uri
result = ""
@ -76,7 +35,7 @@ module Jobs
# that matter!)
raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
end
Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri))
Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
result
rescue Excon::Errors::SocketError => ex

View File

@ -2,42 +2,6 @@
Some notes about testing Discourse:
## FakeWeb
We use the [FakeWeb](https://github.com/chrisk/fakeweb) gem to fake external web
requests.
For an example, check out the specs in `spec/components/oneboxer`.
This has several advantages over making real requests:
* We freeze the expected response from the remote server.
* We don't need a network connection to run the specs.
* It's faster.
So, if you need to define a spec that makes a web request, you'll have to record
the real response to a fixture file, and tell FakeWeb to respond with it for the
URI of your request.
Check out `spec/components/oneboxer/amazon_onebox_spec.rb` for an example of
this.
### Recording responses
To record the actual response from the remote server, you can use curl and save the response to a file. We use the `-i` option to include headers in the output:
curl -i http://en.m.wikipedia.org/wiki/Ruby > wikipedia.response
If you need to specify the User-Agent to send to the server, you can use `-A`:
curl -i -A 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A405 Safari/7534.48.3' http://en.m.wikipedia.org/wiki/Ruby > wikipedia.response
If the remote server is responding with a redirect, you'll need to fake both the
original request and the one for the destination. Check out the
`wikipedia.response` and `wikipedia_redirected.response` files in
`spec/fixtures/oneboxer` for an example. You can also consider working directly
with the final URL for simplicity.
## MailCatcher
Discourse depends heavily on (sending) email for notifications. We use [MailCatcher](http://mailcatcher.me/)

119
lib/final_destination.rb Normal file
View File

@ -0,0 +1,119 @@
require "socket"
require "ipaddr"
require 'excon'
# Determine the final endpoint for a Web URI, following redirects.
#
# Typical use:
#
#   fd = FinalDestination.new(url)
#   uri = fd.resolve   # => URI or nil
#   fd.status          # => :ready, :resolved, :too_many_redirects or :invalid_address
#
# Supported options:
#   :max_redirects - number of redirects that may be followed (default 5)
#   :lookup_ip     - callable that maps a hostname to an IP address string, or
#                    nil when it cannot be resolved (defaults to a real lookup
#                    through FinalDestination.lookup_ip)
class FinalDestination

  # Current state: :ready until resolve finishes, then :resolved,
  # :too_many_redirects or :invalid_address.
  attr_reader :status

  # url  - String (or URI) to start from; unparseable values make every
  #        validation fail instead of raising.
  # opts - optional Hash, see the class comment. It is copied, never mutated.
  def initialize(url, opts = nil)
    @uri = parse_uri(url)

    # Work on a copy so the defaults below never leak into the caller's hash
    # (and so a frozen hash can be passed in).
    @opts = (opts || {}).dup
    @opts[:max_redirects] ||= 5
    @opts[:lookup_ip] ||= lambda do |host|
      begin
        FinalDestination.lookup_ip(host)
      rescue SocketError
        # An unresolvable host is reported as "no address"
        nil
      end
    end

    @limit = @opts[:max_redirects]
    @status = :ready
  end

  # True once at least one redirect has been followed.
  def redirected?
    @limit < @opts[:max_redirects]
  end

  # Headers sent with each request. Reads the current URI, so it raises
  # NoMethodError if the URI could not be parsed or has been cleared.
  def request_headers
    { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
      "Accept" => "text/html",
      "Host" => @uri.hostname }
  end

  # Follows redirects with HEAD requests until a final answer is reached.
  #
  # Returns the final URI when the server answers 200 with a text/html
  # content type, or 405 (HEAD not allowed). Returns nil when the URI is
  # invalid or not public, the content is not HTML, the response is neither
  # a success nor a redirect, or the redirect limit is exceeded.
  # Errors raised by Excon are not rescued here.
  def resolve
    if @limit < 0
      @status = :too_many_redirects
      return nil
    end

    return nil unless validate_uri

    head = Excon.head(@uri.to_s, read_timeout: 20, headers: request_headers)

    # If the site does not allow HEAD, just try the url
    if head.status == 405
      @status = :resolved
      return @uri
    end

    if head.status == 200
      @uri = nil unless FinalDestination.header_for(head, 'content-type') =~ /text\/html/
      @status = :resolved
      return @uri
    end

    location = FinalDestination.header_for(head, 'location')
    if location
      @uri = redirect_target(location)
      @limit -= 1
      return resolve
    end

    nil
  end

  # True when the current URI is well formed and points at a public address.
  def validate_uri
    validate_uri_format && is_public?
  end

  # True for http URIs on port 80 and https URIs on port 443 only.
  def validate_uri_format
    return false unless @uri

    case @uri.scheme
    when 'http' then @uri.port == 80
    when 'https' then @uri.port == 443
    else false
    end
  end

  # True when the host resolves to an address outside every private range.
  # Sets status to :invalid_address when the address is private.
  def is_public?
    return false unless @uri && @uri.host

    address_s = @opts[:lookup_ip].call(@uri.hostname)
    return false unless address_s

    address = begin
      IPAddr.new(address_s)
    rescue IPAddr::Error
      # The lookup returned something that is not an IP address; we cannot
      # prove it is public, so refuse it.
      nil
    end
    return false unless address

    if FinalDestination.private_ranges.any? { |range| range.include?(address) }
      @status = :invalid_address
      return false
    end

    true
  end

  # Address ranges that must never be contacted: loopback, private-use,
  # link-local and "this network" blocks for IPv4 and IPv6.
  def self.private_ranges
    @private_ranges ||= [
      IPAddr.new('0.0.0.0/8'),
      IPAddr.new('127.0.0.0/8'),
      IPAddr.new('169.254.0.0/16'),
      IPAddr.new('172.16.0.0/12'),
      IPAddr.new('192.168.0.0/16'),
      IPAddr.new('10.0.0.0/8'),
      IPAddr.new('::1'),
      IPAddr.new('fe80::/10'),
      IPAddr.new('fc00::/7')
    ].freeze
  end

  # Resolves a hostname to an IP address string. Raises SocketError when the
  # host cannot be resolved.
  def self.lookup_ip(host)
    IPSocket::getaddress(host)
  end

  # Returns the value of the response header whose lowercased name equals
  # `name` (pass `name` in lowercase), or nil when there is none.
  def self.header_for(head, name)
    header = head.headers.detect do |k, _|
      name == k.downcase
    end
    header[1] if header
  end

  private

  # Parses a URL, returning nil for anything that cannot be parsed.
  def parse_uri(value)
    URI(value)
  rescue StandardError
    nil
  end

  # Resolves a Location header against the current URI. Handles absolute
  # URLs, protocol-relative URLs (//host/path), absolute paths and relative
  # paths. Returns nil when the header cannot be parsed.
  def redirect_target(location)
    URI.join(@uri.to_s, location)
  rescue StandardError
    nil
  end
end

View File

@ -0,0 +1,170 @@
require 'rails_helper'
require 'final_destination'
# Specs for FinalDestination. HTTP traffic is faked with stub_request and IP
# lookups are replaced through the :lookup_ip option, so no example touches
# the network.
describe FinalDestination do

  let(:opts) do
    { # avoid IP lookups in test
      lookup_ip: lambda do |host|
        case host
        when 'eviltrout.com' then '52.84.143.152'
        when 'codinghorror.com' then '91.146.108.148'
        when 'discourse.org' then '104.25.152.10'
        when 'private-host.com' then '192.168.10.1'
        else
          # Hosts that are already IP literals resolve to themselves
          host
        end
      end
    }
  end

  before do
    # Stub the class-level lookup helper so it can never hit real DNS
    FinalDestination.stubs(:lookup_ip)
  end

  let(:doc_response) do
    { body: "<html>document</html>",
      headers: { "Content-Type" => "text/html" } }
  end

  # Registers a HEAD stub that redirects `from` to `dest` with a 302
  def redirect_response(from, dest)
    stub_request(:head, from).to_return(
      status: 302,
      headers: { "Location" => dest }
    )
  end

  describe '#resolve' do

    it "has a ready status code before anything happens" do
      expect(FinalDestination.new('https://eviltrout.com').status).to eq(:ready)
    end

    it "returns nil for an invalid url" do
      expect(FinalDestination.new(nil, opts).resolve).to be_nil
      expect(FinalDestination.new('asdf', opts).resolve).to be_nil
    end

    context "without redirects" do
      before do
        stub_request(:head, "https://eviltrout.com").to_return(doc_response)
      end

      it "returns the final url" do
        fd = FinalDestination.new('https://eviltrout.com', opts)
        expect(fd.resolve.to_s).to eq('https://eviltrout.com')
        expect(fd.redirected?).to eq(false)
        expect(fd.status).to eq(:resolved)
      end
    end

    context "with a couple of redirects" do
      before do
        redirect_response("https://eviltrout.com", "https://codinghorror.com/blog")
        redirect_response("https://codinghorror.com/blog", "https://discourse.org")
        stub_request(:head, "https://discourse.org").to_return(doc_response)
      end

      it "returns the final url" do
        fd = FinalDestination.new('https://eviltrout.com', opts)
        expect(fd.resolve.to_s).to eq('https://discourse.org')
        expect(fd.redirected?).to eq(true)
        expect(fd.status).to eq(:resolved)
      end
    end

    context "with too many redirects" do
      before do
        redirect_response("https://eviltrout.com", "https://codinghorror.com/blog")
        redirect_response("https://codinghorror.com/blog", "https://discourse.org")
        stub_request(:head, "https://discourse.org").to_return(doc_response)
      end

      it "returns nil with a too_many_redirects status" do
        fd = FinalDestination.new('https://eviltrout.com', opts.merge(max_redirects: 1))
        expect(fd.resolve).to be_nil
        expect(fd.redirected?).to eq(true)
        expect(fd.status).to eq(:too_many_redirects)
      end
    end

    context "with a redirect to an internal IP" do
      before do
        redirect_response("https://eviltrout.com", "https://private-host.com")
        stub_request(:head, "https://private-host.com").to_return(doc_response)
      end

      it "returns nil with an invalid_address status" do
        fd = FinalDestination.new('https://eviltrout.com', opts)
        expect(fd.resolve).to be_nil
        expect(fd.redirected?).to eq(true)
        expect(fd.status).to eq(:invalid_address)
      end
    end
  end

  describe '#validate_uri' do
    context "host lookups" do
      it "works for various hosts" do
        expect(FinalDestination.new('https://private-host.com', opts).validate_uri).to eq(false)
        expect(FinalDestination.new('https://eviltrout.com:443', opts).validate_uri).to eq(true)
      end
    end
  end

  describe "#validate_uri_format" do
    it "supports http urls" do
      expect(FinalDestination.new('http://eviltrout.com', opts).validate_uri_format).to eq(true)
    end

    it "supports https urls" do
      expect(FinalDestination.new('https://eviltrout.com', opts).validate_uri_format).to eq(true)
    end

    it "doesn't support ftp urls" do
      expect(FinalDestination.new('ftp://eviltrout.com', opts).validate_uri_format).to eq(false)
    end

    it "returns false for schemeless URL" do
      expect(FinalDestination.new('eviltrout.com', opts).validate_uri_format).to eq(false)
    end

    it "returns false for nil URL" do
      expect(FinalDestination.new(nil, opts).validate_uri_format).to eq(false)
    end

    it "returns false for invalid ports" do
      expect(FinalDestination.new('http://eviltrout.com:21', opts).validate_uri_format).to eq(false)
      expect(FinalDestination.new('https://eviltrout.com:8000', opts).validate_uri_format).to eq(false)
    end

    it "returns true for valid ports" do
      expect(FinalDestination.new('http://eviltrout.com:80', opts).validate_uri_format).to eq(true)
      expect(FinalDestination.new('https://eviltrout.com:443', opts).validate_uri_format).to eq(true)
    end
  end

  describe "#is_public?" do
    it "returns true for public ipv4" do
      expect(FinalDestination.new("https://52.84.143.67", opts).is_public?).to eq(true)
      expect(FinalDestination.new("https://104.25.153.10", opts).is_public?).to eq(true)
    end

    it "returns false for private ipv4" do
      expect(FinalDestination.new("https://127.0.0.1", opts).is_public?).to eq(false)
      expect(FinalDestination.new("https://192.168.1.3", opts).is_public?).to eq(false)
      expect(FinalDestination.new("https://10.0.0.5", opts).is_public?).to eq(false)
      expect(FinalDestination.new("https://172.16.0.1", opts).is_public?).to eq(false)
    end

    it "returns true for public ipv6" do
      expect(FinalDestination.new("https://[2001:470:1:3a8::251]", opts).is_public?).to eq(true)
    end

    it "returns false for private ipv6" do
      expect(FinalDestination.new("https://[fdd7:b450:d4d1:6b44::1]", opts).is_public?).to eq(false)
    end
  end

end