discourse/spec/requests/robots_txt_controller_spec.rb

# frozen_string_literal: true

require 'rails_helper'

RSpec.describe RobotsTxtController do
  describe '#builder' do
    # The builder endpoint must respond successfully and expose the pieces
    # (header and per-agent rules) used for building a robots.txt.
    it "returns json information for building a robots.txt" do
      get "/robots-builder.json"
      # Check the status before parsing so a failed request is reported as a
      # status mismatch rather than as a JSON parse error.
      expect(response.status).to eq(200)
      json = ::JSON.parse(response.body)
      expect(json).to be_present
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
    end

    # With an override configured, the builder payload must still carry the
    # header and agents keys and additionally return the override verbatim
    # under the "overridden" key.
    it "includes overridden content if robots.txt is overridden" do
      SiteSetting.overridden_robots_txt = "something"

      get "/robots-builder.json"
      expect(response.status).to eq(200)
      json = ::JSON.parse(response.body)
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
      expect(json['overridden']).to eq("something")
    end
  end

  describe '#index' do

    context "header for when the content is overridden" do
      # No override is configured here, so the override header must be
      # absent even for a signed-in admin.
      it "is not prepended if there are no overrides" do
        admin = Fabricate(:admin)
        sign_in(admin)

        get "/robots.txt"

        body = response.body
        expect(body).to_not start_with(RobotsTxtController::OVERRIDDEN_HEADER)
      end

      # Override configured + admin signed in: the response body must begin
      # with the override header.
      it "is prepended if there are overrides and the user is admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"
        admin = Fabricate(:admin)
        sign_in(admin)

        get "/robots.txt"

        expected_prefix = RobotsTxtController::OVERRIDDEN_HEADER
        expect(response.body).to start_with(expected_prefix)
      end

      # Override configured but nobody is signed in: the override header
      # must not lead the response body.
      it "is not prepended if the user is not admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"

        get "/robots.txt"

        body = response.body
        expect(body).to_not start_with(RobotsTxtController::OVERRIDDEN_HEADER)
      end
    end

    context 'subfolder' do
      # With the base URI stubbed to '/forum', both the Disallow and the
      # Noindex rule for the admin path must carry that prefix.
      it 'prefixes the rules with the directory' do
        Discourse.stubs(:base_uri).returns('/forum')

        get "/robots.txt"

        body = response.body
        %w[Disallow Noindex].each do |directive|
          expect(body).to include("\n#{directive}: /forum/admin")
        end
      end
    end

    context 'crawl delay' do
      # Each user agent listed in slow_down_crawler_user_agents must get its
      # own entry followed directly by the configured Crawl-delay value.
      it 'allows you to set crawl delay on particular bots' do
        slowed_bots = %w[bingbot googlebot]

        SiteSetting.allow_index_in_robots_txt = true
        SiteSetting.slow_down_crawler_rate = 17
        # The setting is a pipe-delimited list: "bingbot|googlebot".
        SiteSetting.slow_down_crawler_user_agents = slowed_bots.join('|')

        get "/robots.txt"

        slowed_bots.each do |bot|
          expect(response.body).to include("\nUser-agent: #{bot}\nCrawl-delay: 17")
        end
      end
    end

    context 'allow_index_in_robots_txt is true' do

      # Splits the response body into two sections using the given offsets
      # and checks each one:
      #   * the section starting at +allow_index+ must contain
      #     'Disallow: /u/' and must not contain a bare "Disallow: /\n";
      #   * the section starting at +disallow_index+ must contain a bare
      #     "Disallow: /\n".
      # Whichever offset comes first ends where the other begins; the later
      # one runs to the end of the body.
      #
      # allow_index    - offset in response.body where the allowed section starts
      # disallow_index - offset in response.body where the disallowed section starts
      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
        expect(allow_index).to be_present
        expect(disallow_index).to be_present

        body = response.body
        if allow_index < disallow_index
          allow_section = body[allow_index...disallow_index]
          disallowed_section = body[disallow_index..-1]
        else
          allow_section = body[allow_index..-1]
          disallowed_section = body[disallow_index...allow_index]
        end

        expect(allow_section).to include('Disallow: /u/')
        expect(allow_section).to_not include("Disallow: /\n")
        expect(disallowed_section).to include("Disallow: /\n")
      end

      it "returns index when indexing is allowed" do
        SiteSetting.allow_index_in_robots_txt = true
        get '/robots.txt'

        i = response.body.index('User-agent: *')
        expect(i).to be_present
        expect(response.body[i..-1]).to include("Disallow: /u/")
      end

      # Whitelisted agents must each have an entry; the earliest of those
      # entries marks the allowed section and the wildcard entry marks the
      # disallowed section.
      it "can whitelist user agents" do
        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
        get "/robots.txt"

        body = response.body
        agent_headers = ['User-agent: Googlebot', 'User-agent: Twitterbot']
        agent_headers.each { |header| expect(body).to include(header) }

        first_allowed = agent_headers.map { |header| body.index(header) }.min
        catch_all = body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(first_allowed, catch_all)
      end

      # Blacklisted agents must each have an entry; the earliest of those
      # entries marks the disallowed section and the wildcard entry marks
      # the allowed section.
      it "can blacklist user agents" do
        SiteSetting.blacklisted_crawler_user_agents = "Googlebot|Twitterbot"
        get "/robots.txt"

        body = response.body
        agent_headers = ['User-agent: Googlebot', 'User-agent: Twitterbot']
        agent_headers.each { |header| expect(body).to include(header) }

        first_blocked = agent_headers.map { |header| body.index(header) }.min
        catch_all = body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(catch_all, first_blocked)
      end

      # When both lists are set, the blacklisted agent must not appear in
      # the output while every whitelisted agent still gets an entry.
      it "ignores blacklist if whitelist is set" do
        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
        SiteSetting.blacklisted_crawler_user_agents = "Bananabot"

        get "/robots.txt"

        body = response.body
        expect(body).not_to include('Bananabot')
        %w[Googlebot Twitterbot].each do |bot|
          expect(body).to include("User-agent: #{bot}")
        end
      end
    end

    # With indexing switched off, the '/u/' disallow rule must not appear
    # anywhere in the response body.
    it "returns noindex when indexing is disallowed" do
      SiteSetting.allow_index_in_robots_txt = false

      get "/robots.txt"

      body = response.body
      expect(body).not_to include("Disallow: /u/")
    end

    # With an override configured and no user signed in, the response must
    # succeed and the body must equal the stored override.
    it "returns overridden robots.txt if the file is overridden" do
      SiteSetting.overridden_robots_txt = "blah whatever"

      get "/robots.txt"

      status = response.status
      expect(status).to eq(200)

      # Compare against the value read back from the setting.
      body = response.body
      expect(body).to eq(SiteSetting.overridden_robots_txt)
    end
  end
end
DEV: use #frozen_string_literal: true on all specs. This change both speeds up specs (fewer strings to allocate) and helps catch cases where methods in Discourse are mutating inputs. Overall we will be migrating everything to use #frozen_string_literal: true; it will take a while, but this is the first and safest move in this direction. 2019-04-30 08:27:42 +08:00			`# frozen_string_literal: true`

Prepare for separation of RSpec helper files Since rspec-rails 3, the default installation creates two helper files: * `spec_helper.rb` * `rails_helper.rb` `spec_helper.rb` is intended as a way of running specs that do not require Rails, whereas `rails_helper.rb` loads Rails (as Discourse's current `spec_helper.rb` does). For more information: https://www.relishapp.com/rspec/rspec-rails/docs/upgrade#default-helper-files In this commit, I've simply replaced all instances of `spec_helper` with `rails_helper`, and renamed the original `spec_helper.rb`. This brings the Discourse project closer to the standard usage of RSpec in a Rails app. At present, every spec relies on loading Rails, but there are likely many that don't need to. In a future pull request, I hope to introduce a separate, minimal `spec_helper.rb` which can be used in tests which don't rely on Rails. 2015-10-11 17:41:23 +08:00			`require 'rails_helper'`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-11 08:02:57 +08:00
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`RSpec.describe RobotsTxtController do`
FEATURE: An API to help sites build robots.txt files programmatically. This is mainly useful for subfolder sites that need to expose their robots.txt contents to a parent site. 2018-04-17 03:43:20 +08:00			`describe '#builder' do`
			`it "returns json information for building a robots.txt" do`
			`get "/robots-builder.json"`
			`json = ::JSON.parse(response.body)`
			`expect(json).to be_present`
			`expect(json['header']).to be_present`
			`expect(json['agents']).to be_present`
			`end`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00
			`it "includes overridden content if robots.txt is is overridden" do`
			`SiteSetting.overridden_robots_txt = "something"`

			`get "/robots-builder.json"`
			`expect(response.status).to eq(200)`
			`json = ::JSON.parse(response.body)`
			`expect(json['header']).to be_present`
			`expect(json['agents']).to be_present`
			`expect(json['overridden']).to eq("something")`
			`end`
FEATURE: An API to help sites build robots.txt files programmatically. This is mainly useful for subfolder sites that need to expose their robots.txt contents to a parent site. 2018-04-17 03:43:20 +08:00			`end`

Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`describe '#index' do`
FEATURE: allow for setting crawl delay per user agent Also moved to default crawl delay bing so no more than a req every 5 seconds is allowed New site settings: "slow_down_crawler_user_agents" - list of crawlers that will be slowed down "slow_down_crawler_rate" - how many seconds to wait between requests Not enforced server side yet 2018-04-06 08:15:23 +08:00
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00			`context "header for when the content is overridden" do`
			`it "is not prepended if there are no overrides" do`
			`sign_in(Fabricate(:admin))`
			`get '/robots.txt'`
			`expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`

			`it "is prepended if there are overrides and the user is admin" do`
			`SiteSetting.overridden_robots_txt = "overridden_content"`
			`sign_in(Fabricate(:admin))`
			`get '/robots.txt'`
			`expect(response.body).to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`

			`it "is not prepended if the user is not admin" do`
			`SiteSetting.overridden_robots_txt = "overridden_content"`
			`get '/robots.txt'`
			`expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`
			`end`

prefix the robots.txt rules with the directory when using subfolder 2018-04-12 04:05:02 +08:00			`context 'subfolder' do`
			`it 'prefixes the rules with the directory' do`
			`Discourse.stubs(:base_uri).returns('/forum')`
			`get '/robots.txt'`
			`expect(response.body).to include("\nDisallow: /forum/admin")`
FEATURE: add Noindex to robots.txt for disallowed routes This strips pages out of indexes that should not exist see: https://meta.discourse.org/t/pages-listed-in-the-robots-txt-are-crawled-and-indexed-by-google/100309/11?u=sam 2018-11-02 13:39:47 +08:00			`expect(response.body).to include("\nNoindex: /forum/admin")`
prefix the robots.txt rules with the directory when using subfolder 2018-04-12 04:05:02 +08:00			`end`
			`end`

FEATURE: allow for setting crawl delay per user agent Also moved to default crawl delay bing so no more than a req every 5 seconds is allowed New site settings: "slow_down_crawler_user_agents" - list of crawlers that will be slowed down "slow_down_crawler_rate" - how many seconds to wait between requests Not enforced server side yet 2018-04-06 08:15:23 +08:00			`context 'crawl delay' do`
			`it 'allows you to set crawl delay on particular bots' do`
			`SiteSetting.allow_index_in_robots_txt = true`
			`SiteSetting.slow_down_crawler_rate = 17`
			`SiteSetting.slow_down_crawler_user_agents = 'bingbot\|googlebot'`
			`get '/robots.txt'`
			`expect(response.body).to include("\nUser-agent: bingbot\nCrawl-delay: 17")`
			`expect(response.body).to include("\nUser-agent: googlebot\nCrawl-delay: 17")`
			`end`
			`end`

FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`context 'allow_index_in_robots_txt is true' do`

			`def expect_allowed_and_disallowed_sections(allow_index, disallow_index)`
			`expect(allow_index).to be_present`
			`expect(disallow_index).to be_present`

			`allow_section = allow_index < disallow_index ?`
			`response.body[allow_index...disallow_index] : response.body[allow_index..-1]`

			`expect(allow_section).to include('Disallow: /u/')`
			`expect(allow_section).to_not include("Disallow: /\n")`

			`disallowed_section = allow_index < disallow_index ?`
			`response.body[disallow_index..-1] : response.body[disallow_index...allow_index]`
			`expect(disallowed_section).to include("Disallow: /\n")`
			`end`

			`it "returns index when indexing is allowed" do`
			`SiteSetting.allow_index_in_robots_txt = true`
			`get '/robots.txt'`

			`i = response.body.index('User-agent: *')`
			`expect(i).to be_present`
			`expect(response.body[i..-1]).to include("Disallow: /u/")`
			`end`

			`it "can whitelist user agents" do`
			`SiteSetting.whitelisted_crawler_user_agents = "Googlebot\|Twitterbot"`
			`get '/robots.txt'`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`

			`allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min`
			`disallow_all_index = response.body.index('User-agent: *')`

			`expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)`
			`end`

			`it "can blacklist user agents" do`
			`SiteSetting.blacklisted_crawler_user_agents = "Googlebot\|Twitterbot"`
			`get '/robots.txt'`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`

			`disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min`
			`allow_index = response.body.index('User-agent: *')`

			`expect_allowed_and_disallowed_sections(allow_index, disallow_index)`
			`end`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`it "ignores blacklist if whitelist is set" do`
			`SiteSetting.whitelisted_crawler_user_agents = "Googlebot\|Twitterbot"`
			`SiteSetting.blacklisted_crawler_user_agents = "Bananabot"`
			`get '/robots.txt'`
			`expect(response.body).to_not include('Bananabot')`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`
			`end`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-11 08:02:57 +08:00			`end`

Merge branch 'whitespace-cleanese' of git://github.com/goshakkk/discourse Conflicts: lib/oneboxer.rb lib/oneboxer/whitelist.rb spec/controllers/robots_txt_controller_spec.rb 2013-02-26 23:42:49 +08:00			`it "returns noindex when indexing is disallowed" do`
Remove site setting stubbing (Round 1) 2015-06-03 18:14:00 +08:00			`SiteSetting.allow_index_in_robots_txt = false`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`get '/robots.txt'`
remove trailing whitespaces :heart: 2013-02-26 00:42:20 +08:00
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`expect(response.body).to_not include("Disallow: /u/")`
			`end`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00
			`it "returns overridden robots.txt if the file is overridden" do`
			`SiteSetting.overridden_robots_txt = "blah whatever"`
			`get '/robots.txt'`
			`expect(response.status).to eq(200)`
			`expect(response.body).to eq(SiteSetting.overridden_robots_txt)`
			`end`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-11 08:02:57 +08:00			`end`
			`end`