mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 09:42:07 +08:00
Helper to find the final destination for a URL
This commit is contained in:
parent
6231318462
commit
b23fc2bf84
|
@ -1,55 +1,13 @@
|
|||
require 'open-uri'
|
||||
require 'nokogiri'
|
||||
require 'excon'
|
||||
require 'final_destination'
|
||||
|
||||
module Jobs
|
||||
class CrawlTopicLink < Jobs::Base
|
||||
|
||||
class ReadEnough < StandardError; end
|
||||
|
||||
# Retrieve a header regardless of case sensitivity
|
||||
def self.header_for(head, name)
|
||||
header = head.headers.detect do |k, _|
|
||||
name == k.downcase
|
||||
end
|
||||
header[1] if header
|
||||
end
|
||||
|
||||
def self.request_headers(uri)
|
||||
{ "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
|
||||
"Accept" => "text/html",
|
||||
"Host" => uri.host }
|
||||
end
|
||||
|
||||
# Follow any redirects that might exist
|
||||
def self.final_uri(url, limit=5)
|
||||
return if limit < 0
|
||||
|
||||
uri = URI(url)
|
||||
return if uri.blank? || uri.host.blank?
|
||||
return unless ['https', 'http'].include?(uri.scheme)
|
||||
return unless [80, 443].include?(uri.port)
|
||||
|
||||
headers = CrawlTopicLink.request_headers(uri)
|
||||
head = Excon.head(url, read_timeout: 20, headers: headers)
|
||||
|
||||
# If the site does not allow HEAD, just try the url
|
||||
return uri if head.status == 405
|
||||
|
||||
if head.status == 200
|
||||
uri = nil unless header_for(head, 'content-type') =~ /text\/html/
|
||||
return uri
|
||||
end
|
||||
|
||||
location = header_for(head, 'location')
|
||||
if location
|
||||
location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
|
||||
return final_uri(location, limit - 1)
|
||||
end
|
||||
|
||||
nil
|
||||
end
|
||||
|
||||
def self.max_chunk_size(uri)
|
||||
# Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
|
||||
# one host but amazon is a big one.
|
||||
|
@ -64,7 +22,8 @@ module Jobs
|
|||
# Never crawl in test mode
|
||||
return if Rails.env.test?
|
||||
|
||||
uri = final_uri(url)
|
||||
fd = FinalDestination.new(url)
|
||||
uri = fd.resolve
|
||||
return "" unless uri
|
||||
|
||||
result = ""
|
||||
|
@ -76,7 +35,7 @@ module Jobs
|
|||
# that matter!)
|
||||
raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
|
||||
end
|
||||
Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri))
|
||||
Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
|
||||
result
|
||||
|
||||
rescue Excon::Errors::SocketError => ex
|
||||
|
|
|
@ -2,42 +2,6 @@
|
|||
|
||||
Some notes about testing Discourse:
|
||||
|
||||
## FakeWeb
|
||||
|
||||
We use the [FakeWeb](https://github.com/chrisk/fakeweb) gem to fake external web
|
||||
requests.
|
||||
For example, check out the specs on `specs/components/oneboxer`.
|
||||
|
||||
This has several advantages to making real requests:
|
||||
|
||||
* We freeze the expected response from the remote server.
|
||||
* We don't need a network connection to run the specs.
|
||||
* It's faster.
|
||||
|
||||
So, if you need to define a spec that makes a web request, you'll have to record
|
||||
the real response to a fixture file, and tell FakeWeb to respond with it for the
|
||||
URI of your request.
|
||||
|
||||
Check out `spec/components/oneboxer/amazon_onebox_spec.rb` for an example on
|
||||
this.
|
||||
|
||||
### Recording responses
|
||||
|
||||
To record the actual response from the remote server, you can use curl and save the response to a file. We use the `-i` option to include headers in the output
|
||||
|
||||
curl -i http://en.m.wikipedia.org/wiki/Ruby > wikipedia.response
|
||||
|
||||
If you need to specify the User-Agent to send to the server, you can use `-A`:
|
||||
|
||||
curl -i -A 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A405 Safari/7534.48.3' http://en.m.wikipedia.org/wiki/Ruby > wikipedia.response
|
||||
|
||||
If the remote server is responding with a redirect, you'll need to fake both the
|
||||
original request and the one for the destination. Check out the
|
||||
`wikipedia.response` and `wikipedia_redirected.response` files in
|
||||
`spec/fixtures/oneboxer` for an example. You can also consider working directly
|
||||
with the final URL for simplicity.
|
||||
|
||||
|
||||
## MailCatcher
|
||||
|
||||
Discourse depends heavily on (sending) email for notifications. We use [MailCatcher](http://mailcatcher.me/)
|
||||
|
|
119
lib/final_destination.rb
Normal file
119
lib/final_destination.rb
Normal file
|
@ -0,0 +1,119 @@
|
|||
require "socket"
|
||||
require "ipaddr"
|
||||
require 'excon'
|
||||
|
||||
# Determine the final endpoint for a Web URI, following redirects
|
||||
class FinalDestination
|
||||
|
||||
attr_reader :status
|
||||
|
||||
def initialize(url, opts = nil)
|
||||
@uri = URI(url) rescue nil
|
||||
@opts = opts || {}
|
||||
@opts[:max_redirects] ||= 5
|
||||
@opts[:lookup_ip] ||= lambda do |host|
|
||||
begin
|
||||
IPSocket::getaddress(host)
|
||||
rescue SocketError
|
||||
nil
|
||||
end
|
||||
end
|
||||
@limit = @opts[:max_redirects]
|
||||
@status = :ready
|
||||
end
|
||||
|
||||
def redirected?
|
||||
@limit < @opts[:max_redirects]
|
||||
end
|
||||
|
||||
def request_headers
|
||||
{ "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
|
||||
"Accept" => "text/html",
|
||||
"Host" => @uri.hostname }
|
||||
end
|
||||
|
||||
def resolve
|
||||
if @limit < 0
|
||||
@status = :too_many_redirects
|
||||
return nil
|
||||
end
|
||||
|
||||
return nil unless validate_uri
|
||||
headers = request_headers
|
||||
head = Excon.head(@uri.to_s, read_timeout: 20, headers: headers)
|
||||
|
||||
# If the site does not allow HEAD, just try the url
|
||||
return @uri if head.status == 405
|
||||
|
||||
if head.status == 200
|
||||
@uri = nil unless FinalDestination.header_for(head, 'content-type') =~ /text\/html/
|
||||
@status = :resolved
|
||||
return @uri
|
||||
end
|
||||
|
||||
location = FinalDestination.header_for(head, 'location')
|
||||
if location
|
||||
location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/"
|
||||
@uri = URI(location) rescue nil
|
||||
@limit -= 1
|
||||
return resolve
|
||||
end
|
||||
|
||||
nil
|
||||
end
|
||||
|
||||
def validate_uri
|
||||
validate_uri_format && is_public?
|
||||
end
|
||||
|
||||
def validate_uri_format
|
||||
return false unless @uri
|
||||
return false unless ['https', 'http'].include?(@uri.scheme)
|
||||
|
||||
if @uri.scheme == 'http'
|
||||
return @uri.port == 80
|
||||
elsif @uri.scheme == 'https'
|
||||
return @uri.port == 443
|
||||
end
|
||||
|
||||
false
|
||||
end
|
||||
|
||||
def is_public?
|
||||
return false unless @uri && @uri.host
|
||||
|
||||
address_s = @opts[:lookup_ip].call(@uri.hostname)
|
||||
return false unless address_s
|
||||
|
||||
address = IPAddr.new(address_s)
|
||||
private_match = FinalDestination.private_ranges.any? {|r| r === address }
|
||||
|
||||
if private_match
|
||||
@status = :invalid_address
|
||||
return false
|
||||
end
|
||||
|
||||
true
|
||||
end
|
||||
|
||||
def self.private_ranges
|
||||
@private_ranges ||= [
|
||||
IPAddr.new('127.0.0.1'),
|
||||
IPAddr.new('172.16.0.0/12'),
|
||||
IPAddr.new('192.168.0.0/16'),
|
||||
IPAddr.new('10.0.0.0/8'),
|
||||
IPAddr.new('fc00::/7')
|
||||
]
|
||||
end
|
||||
|
||||
def self.lookup_ip(host)
|
||||
IPSocket::getaddress(host)
|
||||
end
|
||||
|
||||
def self.header_for(head, name)
|
||||
header = head.headers.detect do |k, _|
|
||||
name == k.downcase
|
||||
end
|
||||
header[1] if header
|
||||
end
|
||||
end
|
170
spec/components/final_destination_spec.rb
Normal file
170
spec/components/final_destination_spec.rb
Normal file
|
@ -0,0 +1,170 @@
|
|||
require 'rails_helper'
|
||||
require 'final_destination'
|
||||
|
||||
describe FinalDestination do
|
||||
|
||||
let(:opts) do
|
||||
{ # avoid IP lookups in test
|
||||
lookup_ip: lambda do |host|
|
||||
case host
|
||||
when 'eviltrout.com' then '52.84.143.152'
|
||||
when 'codinghorror.com' then '91.146.108.148'
|
||||
when 'discourse.org' then '104.25.152.10'
|
||||
when 'private-host.com' then '192.168.10.1'
|
||||
else
|
||||
host
|
||||
end
|
||||
end
|
||||
}
|
||||
end
|
||||
|
||||
before do
|
||||
FinalDestination.stubs(:lookup_ip) do |host|
|
||||
end
|
||||
end
|
||||
|
||||
let(:doc_response) do
|
||||
{ body: "<html>document</html>",
|
||||
headers: { "Content-Type" => "text/html" } }
|
||||
end
|
||||
|
||||
def redirect_response(from, dest)
|
||||
stub_request(:head, from).to_return(
|
||||
status: 302,
|
||||
headers: { "Location" => dest }
|
||||
)
|
||||
end
|
||||
|
||||
describe '.resolve' do
|
||||
|
||||
it "has a ready status code before anything happens" do
|
||||
expect(FinalDestination.new('https://eviltrout.com').status).to eq(:ready)
|
||||
end
|
||||
|
||||
it "returns nil an invalid url" do
|
||||
expect(FinalDestination.new(nil, opts).resolve).to be_nil
|
||||
expect(FinalDestination.new('asdf', opts).resolve).to be_nil
|
||||
end
|
||||
|
||||
context "without redirects" do
|
||||
before do
|
||||
stub_request(:head, "https://eviltrout.com").to_return(doc_response)
|
||||
end
|
||||
|
||||
it "returns the final url" do
|
||||
fd = FinalDestination.new('https://eviltrout.com', opts)
|
||||
expect(fd.resolve.to_s).to eq('https://eviltrout.com')
|
||||
expect(fd.redirected?).to eq(false)
|
||||
expect(fd.status).to eq(:resolved)
|
||||
end
|
||||
end
|
||||
|
||||
context "with a couple of redirects" do
|
||||
before do
|
||||
redirect_response("https://eviltrout.com", "https://codinghorror.com/blog")
|
||||
redirect_response("https://codinghorror.com/blog", "https://discourse.org")
|
||||
stub_request(:head, "https://discourse.org").to_return(doc_response)
|
||||
end
|
||||
|
||||
it "returns the final url" do
|
||||
fd = FinalDestination.new('https://eviltrout.com', opts)
|
||||
expect(fd.resolve.to_s).to eq('https://discourse.org')
|
||||
expect(fd.redirected?).to eq(true)
|
||||
expect(fd.status).to eq(:resolved)
|
||||
end
|
||||
end
|
||||
|
||||
context "with too many redirects" do
|
||||
before do
|
||||
redirect_response("https://eviltrout.com", "https://codinghorror.com/blog")
|
||||
redirect_response("https://codinghorror.com/blog", "https://discourse.org")
|
||||
stub_request(:head, "https://discourse.org").to_return(doc_response)
|
||||
end
|
||||
|
||||
it "returns the final url" do
|
||||
fd = FinalDestination.new('https://eviltrout.com', opts.merge(max_redirects: 1))
|
||||
expect(fd.resolve).to be_nil
|
||||
expect(fd.redirected?).to eq(true)
|
||||
expect(fd.status).to eq(:too_many_redirects)
|
||||
end
|
||||
end
|
||||
|
||||
context "with a redirect to an internal IP" do
|
||||
before do
|
||||
redirect_response("https://eviltrout.com", "https://private-host.com")
|
||||
stub_request(:head, "https://private-host.com").to_return(doc_response)
|
||||
end
|
||||
|
||||
it "returns the final url" do
|
||||
fd = FinalDestination.new('https://eviltrout.com', opts)
|
||||
expect(fd.resolve).to be_nil
|
||||
expect(fd.redirected?).to eq(true)
|
||||
expect(fd.status).to eq(:invalid_address)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '.validate_uri' do
|
||||
context "host lookups" do
|
||||
it "works for various hosts" do
|
||||
expect(FinalDestination.new('https://private-host.com', opts).validate_uri).to eq(false)
|
||||
expect(FinalDestination.new('https://eviltrout.com:443', opts).validate_uri).to eq(true)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe ".validate_url_format" do
|
||||
it "supports http urls" do
|
||||
expect(FinalDestination.new('http://eviltrout.com', opts).validate_uri_format).to eq(true)
|
||||
end
|
||||
|
||||
it "supports https urls" do
|
||||
expect(FinalDestination.new('https://eviltrout.com', opts).validate_uri_format).to eq(true)
|
||||
end
|
||||
|
||||
it "doesn't support ftp urls" do
|
||||
expect(FinalDestination.new('ftp://eviltrout.com', opts).validate_uri_format).to eq(false)
|
||||
end
|
||||
|
||||
it "returns false for schemeless URL" do
|
||||
expect(FinalDestination.new('eviltrout.com', opts).validate_uri_format).to eq(false)
|
||||
end
|
||||
|
||||
it "returns false for nil URL" do
|
||||
expect(FinalDestination.new(nil, opts).validate_uri_format).to eq(false)
|
||||
end
|
||||
|
||||
it "returns false for invalid ports" do
|
||||
expect(FinalDestination.new('http://eviltrout.com:21', opts).validate_uri_format).to eq(false)
|
||||
expect(FinalDestination.new('https://eviltrout.com:8000', opts).validate_uri_format).to eq(false)
|
||||
end
|
||||
|
||||
it "returns true for valid ports" do
|
||||
expect(FinalDestination.new('http://eviltrout.com:80', opts).validate_uri_format).to eq(true)
|
||||
expect(FinalDestination.new('https://eviltrout.com:443',opts).validate_uri_format).to eq(true)
|
||||
end
|
||||
end
|
||||
|
||||
describe ".is_public" do
|
||||
it "returns false for a valid ipv4" do
|
||||
expect(FinalDestination.new("https://52.84.143.67", opts).is_public?).to eq(true)
|
||||
expect(FinalDestination.new("https://104.25.153.10", opts).is_public?).to eq(true)
|
||||
end
|
||||
|
||||
it "returns true for private ipv4" do
|
||||
expect(FinalDestination.new("https://127.0.0.1", opts).is_public?).to eq(false)
|
||||
expect(FinalDestination.new("https://192.168.1.3", opts).is_public?).to eq(false)
|
||||
expect(FinalDestination.new("https://10.0.0.5", opts).is_public?).to eq(false)
|
||||
expect(FinalDestination.new("https://172.16.0.1", opts).is_public?).to eq(false)
|
||||
end
|
||||
|
||||
it "returns true for public ipv6" do
|
||||
expect(FinalDestination.new("https://[2001:470:1:3a8::251]", opts).is_public?).to eq(true)
|
||||
end
|
||||
|
||||
it "returns true for private ipv6" do
|
||||
expect(FinalDestination.new("https://[fdd7:b450:d4d1:6b44::1]", opts).is_public?).to eq(false)
|
||||
end
|
||||
end
|
||||
|
||||
end
|
Loading…
Reference in New Issue
Block a user