discourse/spec/requests/robots_txt_controller_spec.rb

# frozen_string_literal: true

RSpec.describe RobotsTxtController do
  # Exercises GET /robots-builder.json, which returns the pieces needed to
  # build a robots.txt as JSON (`header`, `agents` and, when set, `overridden`).
  describe '#builder' do
    it "returns json information for building a robots.txt" do
      get "/robots-builder.json"
      expect(response.status).to eq(200)
      json = response.parsed_body
      expect(json).to be_present
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
    end

    it "includes overridden content if robots.txt is overridden" do
      SiteSetting.overridden_robots_txt = "something"

      get "/robots-builder.json"
      expect(response.status).to eq(200)
      json = response.parsed_body
      # The generated pieces are still returned alongside the override.
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
      expect(json['overridden']).to eq("something")
    end
  end

  describe '#index' do
    # Checks when RobotsTxtController::OVERRIDDEN_HEADER appears at the start
    # of the served robots.txt, depending on the override setting and on
    # whether an admin is signed in.
    context "when the content is overridden" do
      let(:overridden_header) { RobotsTxtController::OVERRIDDEN_HEADER }

      it "is not prepended if there are no overrides" do
        admin = Fabricate(:admin)
        sign_in(admin)

        get '/robots.txt'

        expect(response.body).not_to start_with(overridden_header)
      end

      it "is prepended if there are overrides and the user is admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"
        admin = Fabricate(:admin)
        sign_in(admin)

        get '/robots.txt'

        expect(response.body).to start_with(overridden_header)
      end

      it "is not prepended if the user is not admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"

        # No sign_in here: the request is made anonymously.
        get '/robots.txt'

        expect(response.body).not_to start_with(overridden_header)
      end
    end

    # When the site is configured under a subfolder, the rule paths in
    # robots.txt are expected to carry that subfolder as a prefix.
    context 'with subfolder' do
      it 'prefixes the rules with the directory' do
        set_subfolder "/forum"

        get '/robots.txt'

        robots_txt = response.body
        # The leading newline pins the rule to the start of a line.
        expect(robots_txt).to include("\nDisallow: /forum/email/")
      end
    end

    context 'when allow_index_in_robots_txt is true' do
      # Asserts that the robots.txt in `response.body` has one section that
      # permits crawling (contains the `/auth/` rule but no blanket
      # "Disallow: /") and one section that forbids everything.
      #
      # allow_index    - offset in the body where the permitting section starts.
      # disallow_index - offset in the body where the forbidding section starts.
      #
      # The section that starts first ends where the other begins; the section
      # that starts second runs to the end of the body.
      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
        expect(allow_index).to be_present
        expect(disallow_index).to be_present

        body = response.body

        if allow_index < disallow_index
          allow_section = body[allow_index...disallow_index]
          disallowed_section = body[disallow_index..-1]
        else
          allow_section = body[allow_index..-1]
          disallowed_section = body[disallow_index...allow_index]
        end

        expect(allow_section).to include('Disallow: /auth/')
        expect(allow_section).to_not include("Disallow: /\n")
        expect(disallowed_section).to include("Disallow: /\n")
      end

      # With indexing allowed, the wildcard section and everything after it
      # must carry the regular disallow rules and a Googlebot section.
      it "returns index when indexing is allowed" do
        SiteSetting.allow_index_in_robots_txt = true
        get '/robots.txt'

        wildcard_offset = response.body.index('User-agent: *')
        expect(wildcard_offset).to be_present

        # Everything from the wildcard user agent to the end of the file.
        remainder = response.body[wildcard_offset..-1]
        expect(remainder).to include("Disallow: /auth/")
        # we have to insert Googlebot for special handling
        expect(remainder).to include("User-agent: Googlebot")
      end

      # Allowlisted agents get the permitting section; the wildcard agent is
      # expected to be the fully disallowed one.
      it "can allowlist user agents" do
        SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot"
        get '/robots.txt'

        body = response.body
        expect(body).to include('User-agent: Googlebot')
        expect(body).to include('User-agent: Twitterbot')

        # The allowed section starts at whichever listed agent appears first.
        agent_offsets = ['User-agent: Googlebot', 'User-agent: Twitterbot'].map do |agent_line|
          response.body.index(agent_line)
        end
        wildcard_offset = response.body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(agent_offsets.min, wildcard_offset)
      end

      # Blocklisted agents get the fully disallowed section; the wildcard
      # agent is expected to be the permitting one.
      it "can blocklist user agents" do
        SiteSetting.blocked_crawler_user_agents = "Googlebot|Twitterbot"
        get '/robots.txt'

        body = response.body
        expect(body).to include('User-agent: Googlebot')
        expect(body).to include('User-agent: Twitterbot')

        # The disallowed section starts at whichever listed agent appears first.
        agent_offsets = ['User-agent: Googlebot', 'User-agent: Twitterbot'].map do |agent_line|
          response.body.index(agent_line)
        end
        wildcard_offset = response.body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(wildcard_offset, agent_offsets.min)
      end

      # When both settings are populated, only the allowlisted agents are
      # expected to show up in the output.
      it "ignores blocklist if allowlist is set" do
        SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot"
        SiteSetting.blocked_crawler_user_agents = "Bananabot"

        get '/robots.txt'

        robots_txt = response.body
        expect(robots_txt).to_not include('Bananabot')
        ['User-agent: Googlebot', 'User-agent: Twitterbot'].each do |agent_line|
          expect(robots_txt).to include(agent_line)
        end
      end
    end

    # With indexing turned off, the regular disallow rules are absent and a
    # lowercase `googlebot` section with an Allow rule is present.
    it "returns noindex when indexing is disallowed" do
      SiteSetting.allow_index_in_robots_txt = false

      get '/robots.txt'

      robots_txt = response.body
      expect(robots_txt).to_not include("Disallow: /auth/")
      expect(robots_txt).to include("User-agent: googlebot\nAllow")
    end

    # For an anonymous request the override is served verbatim as the body.
    it "returns overridden robots.txt if the file is overridden" do
      SiteSetting.overridden_robots_txt = "blah whatever"

      get '/robots.txt'

      expect(response.status).to eq(200)
      expected_body = SiteSetting.overridden_robots_txt
      expect(response.body).to eq(expected_body)
    end

    # The Sitemap line is expected only when `enable_sitemap` is on and
    # `login_required` is off.
    describe 'sitemap' do
      # Absolute URL of the sitemap as it should appear in robots.txt.
      let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" }

      it 'includes sitemap location when enabled' do
        SiteSetting.enable_sitemap = true
        SiteSetting.login_required = false

        get '/robots.txt'

        expect(response.body).to include(sitemap_line)
      end

      it "doesn't include sitemap location when disabled" do
        SiteSetting.enable_sitemap = false
        SiteSetting.login_required = false

        get '/robots.txt'

        expect(response.body).not_to include(sitemap_line)
      end

      it "doesn't include sitemap location when site has login_required enabled" do
        SiteSetting.enable_sitemap = true
        SiteSetting.login_required = true

        get '/robots.txt'

        expect(response.body).not_to include(sitemap_line)
      end
    end

    # Verifies that a handler registered for the :robots_info event can add
    # an agent entry that ends up in the rendered robots.txt.
    describe 'plugins' do
      # Appends a "Test" agent with a single disallowed path to the agents list.
      let(:event_handler) do
        Proc.new { |robots_info| robots_info[:agents] << { name: 'Test', disallow: ['/test/'] } }
      end

      before do
        DiscourseEvent.on(:robots_info, &event_handler)
      end

      # Unregister the handler so it does not leak into other examples.
      after do
        DiscourseEvent.off(:robots_info, &event_handler)
      end

      it 'can add to robots.txt' do
        get '/robots.txt'

        # robots.txt is plain text, so assert on the raw body like the other
        # robots.txt examples instead of going through parsed_body.
        expect(response.body).to include("User-agent: Test\nDisallow: /test/")
      end
    end
  end
end
DEV: use #frozen_string_literal: true on all specs. This change both speeds up specs (fewer strings to allocate) and helps catch cases where methods in Discourse are mutating inputs. Overall we will be migrating everything to use #frozen_string_literal: true; it will take a while, but this is the first and safest move in this direction. 2019-04-30 08:27:42 +08:00			`# frozen_string_literal: true`

Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`RSpec.describe RobotsTxtController do`
FEATURE: An API to help sites build robots.txt files programmatically. This is mainly useful for subfolder sites, which need to expose their robots.txt contents to a parent site. 2018-04-17 03:43:20 +08:00			`describe '#builder' do`
			`it "returns json information for building a robots.txt" do`
			`get "/robots-builder.json"`
DEV: Use `response.parsed_body` in specs (#9615) Most of it was autofixed with rubocop-discourse 2.1.1. 2020-05-07 23:04:12 +08:00			`json = response.parsed_body`
FEATURE: An API to help sites build robots.txt files programatically This is mainly useful for subfolder sites, who need to expose their robots.txt contents to a parent site. 2018-04-17 03:43:20 +08:00			`expect(json).to be_present`
			`expect(json['header']).to be_present`
			`expect(json['agents']).to be_present`
			`end`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00
			`it "includes overridden content if robots.txt is is overridden" do`
			`SiteSetting.overridden_robots_txt = "something"`

			`get "/robots-builder.json"`
			`expect(response.status).to eq(200)`
DEV: Use `response.parsed_body` in specs (#9615) Most of it was autofixed with rubocop-discourse 2.1.1. 2020-05-07 23:04:12 +08:00			`json = response.parsed_body`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00			`expect(json['header']).to be_present`
			`expect(json['agents']).to be_present`
			`expect(json['overridden']).to eq("something")`
			`end`
FEATURE: An API to help sites build robots.txt files programatically This is mainly useful for subfolder sites, who need to expose their robots.txt contents to a parent site. 2018-04-17 03:43:20 +08:00			`end`

Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`describe '#index' do`
DEV: Use proper wording for contexts in specs 2022-07-28 00:14:14 +08:00			`context "when the content is overridden" do`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00			`it "is not prepended if there are no overrides" do`
			`sign_in(Fabricate(:admin))`
			`get '/robots.txt'`
			`expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`

			`it "is prepended if there are overrides and the user is admin" do`
			`SiteSetting.overridden_robots_txt = "overridden_content"`
			`sign_in(Fabricate(:admin))`
			`get '/robots.txt'`
			`expect(response.body).to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`

			`it "is not prepended if the user is not admin" do`
			`SiteSetting.overridden_robots_txt = "overridden_content"`
			`get '/robots.txt'`
			`expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)`
			`end`
			`end`

DEV: Use proper wording for contexts in specs 2022-07-28 00:14:14 +08:00			`context 'with subfolder' do`
prefix the robots.txt rules with the directory when using subfolder 2018-04-12 04:05:02 +08:00			`it 'prefixes the rules with the directory' do`
DEV: improve usability of subfolder specs Previously people were not consistent about mocking which left internals in a fragile state when running subfolder specs. This introduces a simple helper `set_subfolder` which you can use to set the subfolder for the spec. It takes care of proper configuration of subfolder and teardown. ``` # usage set_subfolder "/my_amazing_subfolder" ``` You should no longer stub base_uri or global_settings 2019-11-15 13:48:24 +08:00			`set_subfolder "/forum"`

prefix the robots.txt rules with the directory when using subfolder 2018-04-12 04:05:02 +08:00			`get '/robots.txt'`
Fix spec (#10539) 2020-08-27 05:31:02 +08:00			`expect(response.body).to include("\nDisallow: /forum/email/")`
prefix the robots.txt rules with the directory when using subfolder 2018-04-12 04:05:02 +08:00			`end`
			`end`

DEV: Use proper wording for contexts in specs 2022-07-28 00:14:14 +08:00			`context 'when allow_index_in_robots_txt is true' do`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`def expect_allowed_and_disallowed_sections(allow_index, disallow_index)`
			`expect(allow_index).to be_present`
			`expect(disallow_index).to be_present`

			`allow_section = allow_index < disallow_index ?`
			`response.body[allow_index...disallow_index] : response.body[allow_index..-1]`

FIX: Broken specs `/u/` is no longer in robots.txt, so don't test for it 2020-06-26 02:30:57 +08:00			`expect(allow_section).to include('Disallow: /auth/')`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`expect(allow_section).to_not include("Disallow: /\n")`

			`disallowed_section = allow_index < disallow_index ?`
			`response.body[disallow_index..-1] : response.body[disallow_index...allow_index]`
			`expect(disallowed_section).to include("Disallow: /\n")`
			`end`

			`it "returns index when indexing is allowed" do`
			`SiteSetting.allow_index_in_robots_txt = true`
			`get '/robots.txt'`

			`i = response.body.index('User-agent: *')`
			`expect(i).to be_present`
FIX: Broken specs `/u/` is no longer in robots.txt, so don't test for it 2020-06-26 02:30:57 +08:00			`expect(response.body[i..-1]).to include("Disallow: /auth/")`
FEATURE: explicitly ban outlier traffic sources in robots.txt (#11553) Googlebot handles no-index headers very elegantly. It advises to leave as many routes as possible open and uses headers for high fidelity rules regarding indexes. Discourse adds special `x-robot-tags` noindex headers to users, badges, groups, search and tag routes. Following up on b52143feff8c32f2 we now have it so Googlebot gets special handling. Rest of the crawlers get a far more aggressive disallow list to protect against excessive crawling. 2020-12-23 05:51:14 +08:00			`# we have to insert Googlebot for special handling`
			`expect(response.body[i..-1]).to include("User-agent: Googlebot")`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`end`

FIX: use allowlist and blocklist terminology (#10209) This is a PR of the renaming whitelist to allowlist and blacklist to the blocklist. 2020-07-27 08:23:54 +08:00			`it "can allowlist user agents" do`
			`SiteSetting.allowed_crawler_user_agents = "Googlebot\|Twitterbot"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`get '/robots.txt'`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`

			`allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min`
			`disallow_all_index = response.body.index('User-agent: *')`

			`expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)`
			`end`

FIX: use allowlist and blocklist terminology (#10209) This is a PR of the renaming whitelist to allowlist and blacklist to the blocklist. 2020-07-27 08:23:54 +08:00			`it "can blocklist user agents" do`
			`SiteSetting.blocked_crawler_user_agents = "Googlebot\|Twitterbot"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`get '/robots.txt'`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`

			`disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min`
			`allow_index = response.body.index('User-agent: *')`

			`expect_allowed_and_disallowed_sections(allow_index, disallow_index)`
			`end`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00
FIX: use allowlist and blocklist terminology (#10209) This is a PR of the renaming whitelist to allowlist and blacklist to the blocklist. 2020-07-27 08:23:54 +08:00			`it "ignores blocklist if allowlist is set" do`
			`SiteSetting.allowed_crawler_user_agents = "Googlebot\|Twitterbot"`
			`SiteSetting.blocked_crawler_user_agents = "Bananabot"`
FEATURE: control which web crawlers can access using a whitelist or blacklist 2018-03-16 05:10:45 +08:00			`get '/robots.txt'`
			`expect(response.body).to_not include('Bananabot')`
			`expect(response.body).to include('User-agent: Googlebot')`
			`expect(response.body).to include('User-agent: Twitterbot')`
			`end`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-11 08:02:57 +08:00			`end`

Merge branch 'whitespace-cleanese' of git://github.com/goshakkk/discourse Conflicts: lib/oneboxer.rb lib/oneboxer/whitelist.rb spec/controllers/robots_txt_controller_spec.rb 2013-02-26 23:42:49 +08:00			`it "returns noindex when indexing is disallowed" do`
Remove site setting stubbing (Round 1) 2015-06-03 18:14:00 +08:00			`SiteSetting.allow_index_in_robots_txt = false`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`get '/robots.txt'`
remove trailing whitespaces :heart: 2013-02-26 00:42:20 +08:00
FIX: Broken specs `/u/` is no longer in robots.txt, so don't test for it 2020-06-26 02:30:57 +08:00			`expect(response.body).to_not include("Disallow: /auth/")`
FEATURE: let Google index pages so it can remove them. Google insists on indexing pages so it can figure out if they can be removed from the index. See: https://support.google.com/webmasters/answer/6332384?hl=en This change ensures that we have special behavior for Googlebot where we allow indexing, but block the actual indexing via X-Robots-Tag. 2020-05-11 10:14:21 +08:00			`expect(response.body).to include("User-agent: googlebot\nAllow")`
Fix all the errors to get our tests green on Rails 5.1. 2017-08-31 12:06:56 +08:00			`end`
FEATURE: Allow customization of robots.txt (#7884) * FEATURE: Allow customization of robots.txt This allows admins to customize/override the content of the robots.txt file at /admin/customize/robots. That page is not linked to anywhere in the UI -- admins have to manually type the URL to access that page. * use Ember.computed.not * Jeff feedback * Feedback * Remove unused import 2019-07-16 01:47:44 +08:00
			`it "returns overridden robots.txt if the file is overridden" do`
			`SiteSetting.overridden_robots_txt = "blah whatever"`
			`get '/robots.txt'`
			`expect(response.status).to eq(200)`
			`expect(response.body).to eq(SiteSetting.overridden_robots_txt)`
			`end`
FEATURE: Let sites add a sitemap.xml file. (#16357) * FEATURE: Let sites add a sitemap.xml file. This PR adds the same features discourse-sitemap provides to core. Sitemaps are only added to the robots.txt file if the `enable_sitemap` setting is enabled and `login_required` disabled. After merging discourse/discourse-sitemap#34, this change will take priority over the sitemap plugin because it will disable itself. We're also using the same sitemaps table, so our migration won't try to create it again using `if_not_exists: true`. 2022-04-12 21:33:59 +08:00
			`describe 'sitemap' do`
			`let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" }`

			`it 'include sitemap location when enabled' do`
			`SiteSetting.enable_sitemap = true`
			`SiteSetting.login_required = false`

			`get '/robots.txt'`

			`expect(response.body).to include(sitemap_line)`
			`end`

			`it "doesn't include sitemap location when disabled" do`
			`SiteSetting.enable_sitemap = false`
			`SiteSetting.login_required = false`

			`get '/robots.txt'`

			`expect(response.body).not_to include(sitemap_line)`
			`end`

			`it "doesn't include sitemap location when site has login_required enabled" do`
			`SiteSetting.enable_sitemap = true`
			`SiteSetting.login_required = true`

			`get '/robots.txt'`

			`expect(response.body).not_to include(sitemap_line)`
			`end`
			`end`
DEV: Add plugin API to add to robots.txt (#17378) This plugin API can be used to add to robots.txt. The event handler receives the complete robots information before it is converted into robots.txt. 2022-07-13 01:52:55 +08:00
			`describe 'plugins' do`
			`let(:event_handler) do`
			`Proc.new { \|robots_info\| robots_info[:agents] << { name: 'Test', disallow: ['/test/'] } }`
			`end`

			`before do`
			`DiscourseEvent.on(:robots_info, &event_handler)`
			`end`

			`after do`
			`DiscourseEvent.off(:robots_info, &event_handler)`
			`end`

			`it 'can add to robots.txt' do`
			`get '/robots.txt'`

			`expect(response.parsed_body).to include("User-agent: Test\nDisallow: /test/")`
			`end`
			`end`
added support for disabling indexing by google using SiteSetting.allow_index_in_robots_txt = false 2013-02-11 08:02:57 +08:00			`end`
			`end`