discourse/spec/components/crawler_detection_spec.rb

# frozen_string_literal: true

require 'rails_helper'

describe CrawlerDetection do

  # Asserts that +user_agent+ (optionally combined with the +via+ header
  # value) is classified as a crawler by CrawlerDetection.crawler?.
  #
  # Raises a RuntimeError naming the user agent when it is not.
  def crawler!(user_agent, via = nil)
    return if CrawlerDetection.crawler?(user_agent, via)

    raise "#{user_agent} should be a crawler!"
  end

  # Asserts that the user agent string +s+ is NOT classified as a crawler
  # by CrawlerDetection.crawler?.
  #
  # Raises a RuntimeError naming the user agent when it is.
  def not_crawler!(s)
    raise "#{s} should not be a crawler!" if CrawlerDetection.crawler?(s)
  end

  describe "crawler?" do

    it "can be amended via site settings" do
      SiteSetting.crawler_user_agents = 'Mooble|Kaboodle+*'

      # User agents naming either configured entry must be flagged.
      [
        "Mozilla/5.0 Safari (compatible; Kaboodle+*/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 Safari (compatible; Mooble+*/2.1; +http://www.google.com/bot.html)"
      ].each { |user_agent| crawler!(user_agent) }

      # "Gooble" is not listed in the setting, so it must not be flagged.
      not_crawler!("Mozilla/5.0 Safari (compatible; Gooble+*/2.1; +http://www.google.com/bot.html)")
    end

    it "returns true for crawler user agents" do
      # https://support.google.com/webmasters/answer/1061943?hl=en
      crawler_user_agents = [
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Googlebot/2.1 (+http://www.google.com/bot.html)",
        "Googlebot-News",
        "Googlebot-Image/1.0",
        "Googlebot-Video/1.0",
        "(compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "(compatible; Mediapartners-Google/2.1; +http://www.google.com/bot.html)",
        "Mediapartners-Google",
        "AdsBot-Google (+http://www.google.com/adsbot.html)",
        "Twitterbot",
        "facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)",
        "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
        "Baiduspider+(+http://www.baidu.com/search/spider.htm)",
        "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
        "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)",
        "LogicMonitor SiteMonitor/1.0",
        "Java/1.8.0_151",
        "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
      ]

      crawler_user_agents.each { |user_agent| crawler!(user_agent) }
    end

    it "returns true when VIA header contains 'web.archive.org'" do
      # The user agent is an ordinary desktop Chrome; only the VIA header
      # identifies the request as coming from the Wayback Machine.
      browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
      via_header = "HTTP/1.0 web.archive.org (Wayback Save Page)"

      crawler!(browser_user_agent, via_header)
    end

    it "returns false for non-crawler user agents" do
      regular_user_agents = [
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)",
        "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
        "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
        "Mozilla/5.0 (Linux; Android 6.0; CUBOT DINOSAUR Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Mobile Safari/537.36+",
        "DiscourseAPI Ruby Gem 0.19.0"
      ]

      regular_user_agents.each { |user_agent| not_crawler!(user_agent) }
    end

  end

  describe 'allow_crawler?' do
    # Shorthand for the predicate under test.
    def allowed?(user_agent)
      CrawlerDetection.allow_crawler?(user_agent)
    end

    it 'returns true if whitelist and blacklist are blank' do
      expect(allowed?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
    end

    context 'whitelist is set' do
      before { SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot' }

      it 'returns true for matching user agents' do
        [
          'Googlebot/2.1 (+http://www.google.com/bot.html)',
          'Googlebot-Image/1.0',
          'Twitterbot'
        ].each do |user_agent|
          expect(allowed?(user_agent)).to eq(true)
        end
      end

      it 'returns false for user agents that do not match' do
        # The empty user agent is included: with a whitelist in place it is
        # expected to be rejected as well.
        [
          'facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)',
          'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
          ''
        ].each do |user_agent|
          expect(allowed?(user_agent)).to eq(false)
        end
      end

      context 'and blacklist is set' do
        before { SiteSetting.blacklisted_crawler_user_agents = 'Googlebot-Image' }

        it 'ignores the blacklist' do
          # Matches both lists; the whitelist match is expected to win.
          expect(allowed?('Googlebot-Image/1.0')).to eq(true)
        end
      end
    end

    context 'blacklist is set' do
      before { SiteSetting.blacklisted_crawler_user_agents = 'Googlebot|Twitterbot' }

      it 'returns true for crawlers that do not match' do
        # The empty user agent is included: with only a blacklist in place it
        # is expected to be allowed.
        [
          'Mediapartners-Google',
          'facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)',
          ''
        ].each do |user_agent|
          expect(allowed?(user_agent)).to eq(true)
        end
      end

      it 'returns false for user agents that match' do
        [
          'Googlebot/2.1 (+http://www.google.com/bot.html)',
          'Googlebot-Image/1.0',
          'Twitterbot'
        ].each do |user_agent|
          expect(allowed?(user_agent)).to eq(false)
        end
      end
    end
  end

  # Specs for CrawlerDetection.is_blocked_crawler? under each combination of
  # crawler / non-crawler / missing user agent and the whitelist / blacklist
  # site settings.
  describe 'is_blocked_crawler?' do
    # An ordinary desktop Chrome user agent, used wherever a non-crawler is needed.
    let(:browser_user_agent) do
      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    end

    it 'is false if user agent is a crawler and no whitelist or blacklist is defined' do
      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false)
    end

    it 'is false if user agent is not a crawler and no whitelist or blacklist is defined' do
      expect(CrawlerDetection.is_blocked_crawler?(browser_user_agent)).to eq(false)
    end

    it 'is true if user agent is a crawler and is not whitelisted' do
      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
    end

    it 'is false if user agent is not a crawler and there is a whitelist' do
      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
      expect(CrawlerDetection.is_blocked_crawler?(browser_user_agent)).to eq(false)
    end

    it 'is true if user agent is a crawler and is blacklisted' do
      SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
    end

    # The title previously read "is true", contradicting the eq(false) below.
    it 'is false if user agent is a crawler and is not blacklisted' do
      SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
      expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false)
    end

    it 'is false if user agent is not a crawler and blacklist is defined' do
      # The blacklist entry appears in the browser user agent text, yet the
      # browser must still not be reported as a blocked crawler.
      SiteSetting.blacklisted_crawler_user_agents = 'Mozilla'
      expect(CrawlerDetection.is_blocked_crawler?(browser_user_agent)).to eq(false)
    end

    it 'is true if user agent is missing and whitelist is defined' do
      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
      expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true)
      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)
    end

    it 'is false if user agent is missing and blacklist is defined' do
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
      expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false)
      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)
    end
  end
end
DEV: use #frozen_string_literal: true on all specs. This change both speeds up specs (fewer strings to allocate) and helps catch cases where methods in Discourse are mutating inputs. Overall, we will be migrating everything to use #frozen_string_literal: true. It will take a while, but this is the first and safest move in this direction. 2019-04-30 08:27:42 +08:00			`# frozen_string_literal: true`

Prepare for separation of RSpec helper files Since rspec-rails 3, the default installation creates two helper files: * `spec_helper.rb` * `rails_helper.rb` `spec_helper.rb` is intended as a way of running specs that do not require Rails, whereas `rails_helper.rb` loads Rails (as Discourse's current `spec_helper.rb` does). For more information: https://www.relishapp.com/rspec/rspec-rails/docs/upgrade#default-helper-files In this commit, I've simply replaced all instances of `spec_helper` with `rails_helper`, and renamed the original `spec_helper.rb`. This brings the Discourse project closer to the standard usage of RSpec in a Rails app. At present, every spec relies on loading Rails, but there are likely many that don't need to. In a future pull request, I hope to introduce a separate, minimal `spec_helper.rb` which can be used in tests which don't rely on Rails. 2015-10-11 17:41:23 +08:00			`require 'rails_helper'`
REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00
			`describe CrawlerDetection do`
FIX: cubot android devices were detected as crawlers 2018-06-21 08:56:46 +08:00
FIX: use crawler layout when saving url in Wayback Machine (#7667) 2019-06-03 10:13:32 +08:00			`def crawler!(user_agent, via = nil)`
			`if (!CrawlerDetection.crawler?(user_agent, via))`
			`raise "#{user_agent} should be a crawler!"`
FIX: cubot android devices were detected as crawlers 2018-06-21 08:56:46 +08:00			`end`
			`end`

			`def not_crawler!(s)`
			`if CrawlerDetection.crawler?(s)`
			`raise "#{s} should not be a crawler!"`
			`end`
			`end`

REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00			`describe "crawler?" do`
FEATURE: flexible crawler detection You can use the crawler user agents site setting to amend what user agents are considered crawlers based on a string match in the user agent Also improves performance of crawler detection slightly 2017-09-29 10:31:50 +08:00
			`it "can be amended via site settings" do`
			`SiteSetting.crawler_user_agents = 'Mooble\|Kaboodle+*'`
FIX: cubot android devices were detected as crawlers 2018-06-21 08:56:46 +08:00
			`crawler! "Mozilla/5.0 Safari (compatible; Kaboodle+*/2.1; +http://www.google.com/bot.html)"`
			`crawler! "Mozilla/5.0 Safari (compatible; Mooble+*/2.1; +http://www.google.com/bot.html)"`
			`not_crawler! "Mozilla/5.0 Safari (compatible; Gooble+*/2.1; +http://www.google.com/bot.html)"`
FEATURE: flexible crawler detection You can use the crawler user agents site setting to amend what user agents are considered crawlers based on a string match in the user agent Also improves performance of crawler detection slightly 2017-09-29 10:31:50 +08:00			`end`

REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00			`it "returns true for crawler user agents" do`
			`# https://support.google.com/webmasters/answer/1061943?hl=en`
FIX: cubot android devices were detected as crawlers 2018-06-21 08:56:46 +08:00			`crawler! "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"`
			`crawler! "Googlebot/2.1 (+http://www.google.com/bot.html)"`
			`crawler! "Googlebot-News"`
			`crawler! "Googlebot-Image/1.0"`
			`crawler! "Googlebot-Video/1.0"`
			`crawler! "(compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)"`
			`crawler! "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"`
			`crawler! "(compatible; Mediapartners-Google/2.1; +http://www.google.com/bot.html)"`
			`crawler! "Mediapartners-Google"`
			`crawler! "AdsBot-Google (+http://www.google.com/adsbot.html)"`
			`crawler! "Twitterbot"`
			`crawler! "facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)"`
			`crawler! "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"`
			`crawler! "Baiduspider+(+http://www.baidu.com/search/spider.htm)"`
			`crawler! "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"`
			`crawler! "Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)"`
			`crawler! "LogicMonitor SiteMonitor/1.0"`
			`crawler! "Java/1.8.0_151"`
			`crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"`
REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00			`end`

FIX: use crawler layout when saving url in Wayback Machine (#7667) 2019-06-03 10:13:32 +08:00			`it "returns true when VIA header contains 'web.archive.org'" do`
			`crawler!("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)")`
			`end`

REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00			`it "returns false for non-crawler user agents" do`
FIX: cubot android devices were detected as crawlers 2018-06-21 08:56:46 +08:00			`not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"`
			`not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"`
			`not_crawler! "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)"`
			`not_crawler! "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25"`
			`not_crawler! "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0"`
			`not_crawler! "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"`
			`not_crawler! "Mozilla/5.0 (Linux; Android 6.0; CUBOT DINOSAUR Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Mobile Safari/537.36+"`
Fix tests 2019-05-08 21:58:47 +08:00			`not_crawler! "DiscourseAPI Ruby Gem 0.19.0"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`end`
FEATURE: much improved and simplified crawler detection. Phase one: does it match 'trident\|webkit\|gecko\|chrome\|safari\|msie\|opera'? If yes, it is possibly a browser. Phase two: does it match 'rss\|bot\|spider\|crawler\|facebook\|archive\|wayback\|ping\|monitor'? Then it is probably a crawler. Based off: https://gist.github.com/SamSaffron/6cfad7ea3e6df321ffb7a84f93720a53 2018-01-16 12:41:13 +08:00
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`end`

			`describe 'allow_crawler?' do`
			`it 'returns true if whitelist and blacklist are blank' do`
			`expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)`
			`end`

			`context 'whitelist is set' do`
			`before do`
			`SiteSetting.whitelisted_crawler_user_agents = 'Googlebot\|Twitterbot'`
			`end`

			`it 'returns true for matching user agents' do`
			`expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)`
			`expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)`
			`expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(true)`
			`end`

			`it 'returns false for user agents that do not match' do`
			`expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(false)`
			`expect(CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')).to eq(false)`
			`expect(CrawlerDetection.allow_crawler?('')).to eq(false)`
			`end`

			`context 'and blacklist is set' do`
			`before do`
			`SiteSetting.blacklisted_crawler_user_agents = 'Googlebot-Image'`
			`end`

			`it 'ignores the blacklist' do`
			`expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)`
			`end`
			`end`
			`end`

			`context 'blacklist is set' do`
			`before do`
			`SiteSetting.blacklisted_crawler_user_agents = 'Googlebot\|Twitterbot'`
			`end`

			`it 'returns true for crawlers that do not match' do`
			`expect(CrawlerDetection.allow_crawler?('Mediapartners-Google')).to eq(true)`
			`expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(true)`
			`expect(CrawlerDetection.allow_crawler?('')).to eq(true)`
			`end`

			`it 'returns false for user agents that match' do`
			`expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(false)`
			`expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(false)`
			`expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(false)`
			`end`
			`end`
			`end`

			`describe 'is_blocked_crawler?' do`
			`it 'is false if user agent is a crawler and no whitelist or blacklist is defined' do`
			`expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false)`
REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00			`end`

FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`it 'is false if user agent is not a crawler and no whitelist or blacklist is defined' do`
			`expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)`
			`end`

			`it 'is true if user agent is a crawler and is not whitelisted' do`
			`SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'`
			`expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)`
			`end`

			`it 'is false if user agent is not a crawler and there is a whitelist' do`
			`SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'`
			`expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)`
			`end`

			`it 'is true if user agent is a crawler and is blacklisted' do`
			`SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'`
			`expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)`
			`end`

			`it 'is true if user agent is a crawler and is not blacklisted' do`
			`SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'`
			`expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false)`
			`end`

			`it 'is false if user agent is not a crawler and blacklist is defined' do`
			`SiteSetting.blacklisted_crawler_user_agents = 'Mozilla'`
			`expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)`
			`end`

			`it 'is true if user agent is missing and whitelist is defined' do`
			`SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'`
			`expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true)`
			`expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)`
			`end`

			`it 'is false if user agent is missing and blacklist is defined' do`
			`SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'`
			`expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false)`
			`expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)`
			`end`
REFACTOR: Rename `GooglebotDetection` to `CrawlerDetection` because we will likely whitelist more crawlers in the future. 2014-02-21 05:07:02 +08:00			`end`
			`end`