# frozen_string_literal: true

# Request specs for RobotsTxtController: the JSON builder endpoint
# (/robots-builder.json) and the rendered /robots.txt.
RSpec.describe RobotsTxtController do
  describe '#builder' do
    it "returns json information for building a robots.txt" do
      get "/robots-builder.json"

      json = response.parsed_body
      expect(json).to be_present
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
    end

    it "includes overridden content if robots.txt is overridden" do
      SiteSetting.overridden_robots_txt = "something"

      get "/robots-builder.json"
      expect(response.status).to eq(200)

      json = response.parsed_body
      expect(json['header']).to be_present
      expect(json['agents']).to be_present
      expect(json['overridden']).to eq("something")
    end
  end

  describe '#index' do
    # OVERRIDDEN_HEADER is expected at the start of the body only when an
    # override is configured AND the requesting user is an admin.
    context "header for when the content is overridden" do
      it "is not prepended if there are no overrides" do
        sign_in(Fabricate(:admin))

        get '/robots.txt'

        expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)
      end

      it "is prepended if there are overrides and the user is admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"
        sign_in(Fabricate(:admin))

        get '/robots.txt'

        expect(response.body).to start_with(RobotsTxtController::OVERRIDDEN_HEADER)
      end

      it "is not prepended if the user is not admin" do
        SiteSetting.overridden_robots_txt = "overridden_content"

        get '/robots.txt'

        expect(response.body).not_to start_with(RobotsTxtController::OVERRIDDEN_HEADER)
      end
    end

    context 'subfolder' do
      it 'prefixes the rules with the directory' do
        set_subfolder "/forum"

        get '/robots.txt'

        expect(response.body).to include("\nDisallow: /forum/email/")
      end
    end

    context 'allow_index_in_robots_txt is true' do
      # Splits response.body into an "allowed" section and a "disallowed"
      # section and checks the rules found in each.
      #
      # allow_index    - offset in response.body where the allowed section starts.
      # disallow_index - offset in response.body where the disallowed section starts.
      #
      # Whichever section starts first ends where the other begins; the later
      # one runs to the end of the body. The allowed section must contain
      # 'Disallow: /auth/' but no blanket "Disallow: /\n"; the disallowed
      # section must contain the blanket "Disallow: /\n".
      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
        expect(allow_index).to be_present
        expect(disallow_index).to be_present

        body = response.body

        if allow_index < disallow_index
          allow_section = body[allow_index...disallow_index]
          disallowed_section = body[disallow_index..-1]
        else
          allow_section = body[allow_index..-1]
          disallowed_section = body[disallow_index...allow_index]
        end

        expect(allow_section).to include('Disallow: /auth/')
        expect(allow_section).not_to include("Disallow: /\n")

        expect(disallowed_section).to include("Disallow: /\n")
      end

      it "returns index when indexing is allowed" do
        SiteSetting.allow_index_in_robots_txt = true

        get '/robots.txt'

        i = response.body.index('User-agent: *')
        expect(i).to be_present

        # Everything from the wildcard user-agent entry onwards.
        wildcard_section = response.body[i..-1]
        expect(wildcard_section).to include("Disallow: /auth/")
        # we have to insert Googlebot for special handling
        expect(wildcard_section).to include("User-agent: Googlebot")
      end

      it "can allowlist user agents" do
        SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot"

        get '/robots.txt'

        expect(response.body).to include('User-agent: Googlebot')
        expect(response.body).to include('User-agent: Twitterbot')

        # The allowed section starts at whichever listed agent appears first.
        googlebot_index = response.body.index('User-agent: Googlebot')
        twitterbot_index = response.body.index('User-agent: Twitterbot')
        allowed_index = [googlebot_index, twitterbot_index].min
        disallow_all_index = response.body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)
      end

      it "can blocklist user agents" do
        SiteSetting.blocked_crawler_user_agents = "Googlebot|Twitterbot"

        get '/robots.txt'

        expect(response.body).to include('User-agent: Googlebot')
        expect(response.body).to include('User-agent: Twitterbot')

        # The disallowed section starts at whichever listed agent appears first.
        googlebot_index = response.body.index('User-agent: Googlebot')
        twitterbot_index = response.body.index('User-agent: Twitterbot')
        disallow_index = [googlebot_index, twitterbot_index].min
        allow_index = response.body.index('User-agent: *')

        expect_allowed_and_disallowed_sections(allow_index, disallow_index)
      end

      it "ignores blocklist if allowlist is set" do
        SiteSetting.allowed_crawler_user_agents = "Googlebot|Twitterbot"
        SiteSetting.blocked_crawler_user_agents = "Bananabot"

        get '/robots.txt'

        expect(response.body).not_to include('Bananabot')
        expect(response.body).to include('User-agent: Googlebot')
        expect(response.body).to include('User-agent: Twitterbot')
      end
    end

    it "returns noindex when indexing is disallowed" do
      SiteSetting.allow_index_in_robots_txt = false

      get '/robots.txt'

      expect(response.body).not_to include("Disallow: /auth/")
      expect(response.body).to include("User-agent: googlebot\nAllow")
    end

    it "returns overridden robots.txt if the file is overridden" do
      SiteSetting.overridden_robots_txt = "blah whatever"

      get '/robots.txt'

      expect(response.status).to eq(200)
      expect(response.body).to eq(SiteSetting.overridden_robots_txt)
    end

    describe 'sitemap' do
      # The exact "Sitemap:" line looked for in the rendered robots.txt.
      let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" }

      it 'includes sitemap location when enabled' do
        SiteSetting.enable_sitemap = true
        SiteSetting.login_required = false

        get '/robots.txt'

        expect(response.body).to include(sitemap_line)
      end

      it "doesn't include sitemap location when disabled" do
        SiteSetting.enable_sitemap = false
        SiteSetting.login_required = false

        get '/robots.txt'

        expect(response.body).not_to include(sitemap_line)
      end

      it "doesn't include sitemap location when site has login_required enabled" do
        SiteSetting.enable_sitemap = true
        SiteSetting.login_required = true

        get '/robots.txt'

        expect(response.body).not_to include(sitemap_line)
      end
    end
  end
end