diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb index a5fb8907e00..05321081815 100644 --- a/app/jobs/regular/crawl_topic_link.rb +++ b/app/jobs/regular/crawl_topic_link.rb @@ -1,55 +1,13 @@ require 'open-uri' require 'nokogiri' require 'excon' +require 'final_destination' module Jobs class CrawlTopicLink < Jobs::Base class ReadEnough < StandardError; end - # Retrieve a header regardless of case sensitivity - def self.header_for(head, name) - header = head.headers.detect do |k, _| - name == k.downcase - end - header[1] if header - end - - def self.request_headers(uri) - { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36", - "Accept" => "text/html", - "Host" => uri.host } - end - - # Follow any redirects that might exist - def self.final_uri(url, limit=5) - return if limit < 0 - - uri = URI(url) - return if uri.blank? || uri.host.blank? - return unless ['https', 'http'].include?(uri.scheme) - return unless [80, 443].include?(uri.port) - - headers = CrawlTopicLink.request_headers(uri) - head = Excon.head(url, read_timeout: 20, headers: headers) - - # If the site does not allow HEAD, just try the url - return uri if head.status == 405 - - if head.status == 200 - uri = nil unless header_for(head, 'content-type') =~ /text\/html/ - return uri - end - - location = header_for(head, 'location') - if location - location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/" - return final_uri(location, limit - 1) - end - - nil - end - def self.max_chunk_size(uri) # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for # one host but amazon is a big one. @@ -64,7 +22,8 @@ module Jobs # Never crawl in test mode return if Rails.env.test? - uri = final_uri(url) + fd = FinalDestination.new(url) + uri = fd.resolve return "" unless uri result = "" @@ -76,7 +35,7 @@ module Jobs # that matter!) raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024) end - Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri)) + Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers) result rescue Excon::Errors::SocketError => ex diff --git a/docs/TESTING.md b/docs/TESTING.md index b2e97601f7e..f62818d5c92 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -2,42 +2,6 @@ Some notes about testing Discourse: -## FakeWeb - -We use the [FakeWeb](https://github.com/chrisk/fakeweb) gem to fake external web -requests. -For example, check out the specs on `specs/components/oneboxer`. - -This has several advantages to making real requests: - -* We freeze the expected response from the remote server. -* We don't need a network connection to run the specs. -* It's faster. - -So, if you need to define a spec that makes a web request, you'll have to record -the real response to a fixture file, and tell FakeWeb to respond with it for the -URI of your request. - -Check out `spec/components/oneboxer/amazon_onebox_spec.rb` for an example on -this. - -### Recording responses - -To record the actual response from the remote server, you can use curl and save the response to a file. We use the `-i` option to include headers in the output - - curl -i http://en.m.wikipedia.org/wiki/Ruby > wikipedia.response - -If you need to specify the User-Agent to send to the server, you can use `-A`: - - curl -i -A 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A405 Safari/7534.48.3' http://en.m.wikipedia.org/wiki/Ruby > wikipedia.response - -If the remote server is responding with a redirect, you'll need to fake both the -original request and the one for the destination. Check out the -`wikipedia.response` and `wikipedia_redirected.response` files in -`spec/fixtures/oneboxer` for an example. You can also consider working directly -with the final URL for simplicity. - - ## MailCatcher Discourse depends heavily on (sending) email for notifications. We use [MailCatcher](http://mailcatcher.me/) diff --git a/lib/final_destination.rb b/lib/final_destination.rb new file mode 100644 index 00000000000..23e76033a51 --- /dev/null +++ b/lib/final_destination.rb @@ -0,0 +1,119 @@ +require "socket" +require "ipaddr" +require 'excon' + +# Determine the final endpoint for a Web URI, following redirects +class FinalDestination + + attr_reader :status + + def initialize(url, opts = nil) + @uri = URI(url) rescue nil + @opts = opts || {} + @opts[:max_redirects] ||= 5 + @opts[:lookup_ip] ||= lambda do |host| + begin + IPSocket::getaddress(host) + rescue SocketError + nil + end + end + @limit = @opts[:max_redirects] + @status = :ready + end + + def redirected? + @limit < @opts[:max_redirects] + end + + def request_headers + { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", + "Accept" => "text/html", + "Host" => @uri.hostname } + end + + def resolve + if @limit < 0 + @status = :too_many_redirects + return nil + end + + return nil unless validate_uri + headers = request_headers + head = Excon.head(@uri.to_s, read_timeout: 20, headers: headers) + + # If the site does not allow HEAD, just try the url + return @uri if head.status == 405 + + if head.status == 200 + @uri = nil unless FinalDestination.header_for(head, 'content-type') =~ /text\/html/ + @status = :resolved + return @uri + end + + location = FinalDestination.header_for(head, 'location') + if location + location = "#{@uri.scheme}://#{@uri.host}#{location}" if location[0] == "/" + @uri = URI(location) rescue nil + @limit -= 1 + return resolve + end + + nil + end + + def validate_uri + validate_uri_format && is_public? + end + + def validate_uri_format + return false unless @uri + return false unless ['https', 'http'].include?(@uri.scheme) + + if @uri.scheme == 'http' + return @uri.port == 80 + elsif @uri.scheme == 'https' + return @uri.port == 443 + end + + false + end + + def is_public? + return false unless @uri && @uri.host + + address_s = @opts[:lookup_ip].call(@uri.hostname) + return false unless address_s + + address = IPAddr.new(address_s) + private_match = FinalDestination.private_ranges.any? {|r| r === address } + + if private_match + @status = :invalid_address + return false + end + + true + end + + def self.private_ranges + @private_ranges ||= [ + IPAddr.new('127.0.0.1'), + IPAddr.new('172.16.0.0/12'), + IPAddr.new('192.168.0.0/16'), + IPAddr.new('10.0.0.0/8'), + IPAddr.new('fc00::/7') + ] + end + + def self.lookup_ip(host) + IPSocket::getaddress(host) + end + + def self.header_for(head, name) + header = head.headers.detect do |k, _| + name == k.downcase + end + header[1] if header + end +end diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb new file mode 100644 index 00000000000..0b4aed46b14 --- /dev/null +++ b/spec/components/final_destination_spec.rb @@ -0,0 +1,170 @@ +require 'rails_helper' +require 'final_destination' + +describe FinalDestination do + + let(:opts) do + { # avoid IP lookups in test + lookup_ip: lambda do |host| + case host + when 'eviltrout.com' then '52.84.143.152' + when 'codinghorror.com' then '91.146.108.148' + when 'discourse.org' then '104.25.152.10' + when 'private-host.com' then '192.168.10.1' + else + host + end + end + } + end + + before do + FinalDestination.stubs(:lookup_ip) do |host| + end + end + + let(:doc_response) do + { body: "document", + headers: { "Content-Type" => "text/html" } } + end + + def redirect_response(from, dest) + stub_request(:head, from).to_return( + status: 302, + headers: { "Location" => dest } + ) + end + + describe '.resolve' do + + it "has a ready status code before anything happens" do + expect(FinalDestination.new('https://eviltrout.com').status).to eq(:ready) + end + + it "returns nil an invalid url" do + expect(FinalDestination.new(nil, opts).resolve).to be_nil + expect(FinalDestination.new('asdf', opts).resolve).to be_nil + end + + context "without redirects" do + before do + stub_request(:head, "https://eviltrout.com").to_return(doc_response) + end + + it "returns the final url" do + fd = FinalDestination.new('https://eviltrout.com', opts) + expect(fd.resolve.to_s).to eq('https://eviltrout.com') + expect(fd.redirected?).to eq(false) + expect(fd.status).to eq(:resolved) + end + end + + context "with a couple of redirects" do + before do + redirect_response("https://eviltrout.com", "https://codinghorror.com/blog") + redirect_response("https://codinghorror.com/blog", "https://discourse.org") + stub_request(:head, "https://discourse.org").to_return(doc_response) + end + + it "returns the final url" do + fd = FinalDestination.new('https://eviltrout.com', opts) + expect(fd.resolve.to_s).to eq('https://discourse.org') + expect(fd.redirected?).to eq(true) + expect(fd.status).to eq(:resolved) + end + end + + context "with too many redirects" do + before do + redirect_response("https://eviltrout.com", "https://codinghorror.com/blog") + redirect_response("https://codinghorror.com/blog", "https://discourse.org") + stub_request(:head, "https://discourse.org").to_return(doc_response) + end + + it "returns the final url" do + fd = FinalDestination.new('https://eviltrout.com', opts.merge(max_redirects: 1)) + expect(fd.resolve).to be_nil + expect(fd.redirected?).to eq(true) + expect(fd.status).to eq(:too_many_redirects) + end + end + + context "with a redirect to an internal IP" do + before do + redirect_response("https://eviltrout.com", "https://private-host.com") + stub_request(:head, "https://private-host.com").to_return(doc_response) + end + + it "returns the final url" do + fd = FinalDestination.new('https://eviltrout.com', opts) + expect(fd.resolve).to be_nil + expect(fd.redirected?).to eq(true) + expect(fd.status).to eq(:invalid_address) + end + end + end + + describe '.validate_uri' do + context "host lookups" do + it "works for various hosts" do + expect(FinalDestination.new('https://private-host.com', opts).validate_uri).to eq(false) + expect(FinalDestination.new('https://eviltrout.com:443', opts).validate_uri).to eq(true) + end + end + end + + describe ".validate_url_format" do + it "supports http urls" do + expect(FinalDestination.new('http://eviltrout.com', opts).validate_uri_format).to eq(true) + end + + it "supports https urls" do + expect(FinalDestination.new('https://eviltrout.com', opts).validate_uri_format).to eq(true) + end + + it "doesn't support ftp urls" do + expect(FinalDestination.new('ftp://eviltrout.com', opts).validate_uri_format).to eq(false) + end + + it "returns false for schemeless URL" do + expect(FinalDestination.new('eviltrout.com', opts).validate_uri_format).to eq(false) + end + + it "returns false for nil URL" do + expect(FinalDestination.new(nil, opts).validate_uri_format).to eq(false) + end + + it "returns false for invalid ports" do + expect(FinalDestination.new('http://eviltrout.com:21', opts).validate_uri_format).to eq(false) + expect(FinalDestination.new('https://eviltrout.com:8000', opts).validate_uri_format).to eq(false) + end + + it "returns true for valid ports" do + expect(FinalDestination.new('http://eviltrout.com:80', opts).validate_uri_format).to eq(true) + expect(FinalDestination.new('https://eviltrout.com:443',opts).validate_uri_format).to eq(true) + end + end + + describe ".is_public" do + it "returns false for a valid ipv4" do + expect(FinalDestination.new("https://52.84.143.67", opts).is_public?).to eq(true) + expect(FinalDestination.new("https://104.25.153.10", opts).is_public?).to eq(true) + end + + it "returns true for private ipv4" do + expect(FinalDestination.new("https://127.0.0.1", opts).is_public?).to eq(false) + expect(FinalDestination.new("https://192.168.1.3", opts).is_public?).to eq(false) + expect(FinalDestination.new("https://10.0.0.5", opts).is_public?).to eq(false) + expect(FinalDestination.new("https://172.16.0.1", opts).is_public?).to eq(false) + end + + it "returns true for public ipv6" do + expect(FinalDestination.new("https://[2001:470:1:3a8::251]", opts).is_public?).to eq(true) + end + + it "returns true for private ipv6" do + expect(FinalDestination.new("https://[fdd7:b450:d4d1:6b44::1]", opts).is_public?).to eq(false) + end + end + +end