prefix the robots.txt rules with the directory when using a subfolder

Régis Hanol 2018-04-11 22:05:02 +02:00
parent 3c8b43bb01
commit df7970a6f6
4 changed files with 49 additions and 35 deletions
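
In a subfolder install, Discourse serves the whole site under a directory such as /forum, so root-relative robots.txt rules like Disallow: /admin no longer match the real URLs. After this change the rules are built from Discourse.base_uri, so the generated file looks roughly like this (an abridged sketch; /forum is the base URI the spec below stubs in):

    User-agent: *
    Disallow: /forum/auth/cas
    Disallow: /forum/admin
    Disallow: /forum/groups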

View File

@@ -2,14 +2,46 @@ class RobotsTxtController < ApplicationController
   layout false
   skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required
 
+  # NOTE: order is important!
+  DISALLOWED_PATHS ||= %w{
+    /auth/cas
+    /auth/facebook/callback
+    /auth/twitter/callback
+    /auth/google/callback
+    /auth/yahoo/callback
+    /auth/github/callback
+    /auth/cas/callback
+    /assets/browser-update*.js
+    /users/
+    /u/
+    /badges/
+    /search
+    /search/
+    /tags
+    /tags/
+    /email/
+    /session
+    /session/
+    /admin
+    /admin/
+    /user-api-key
+    /user-api-key/
+    /*?api_key*
+    /*?*api_key*
+    /groups
+    /groups/
+    /t/*/*.rss
+    /tags/*.rss
+    /c/*.rss
+  }
+
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
-      @crawler_delayed_agents = []
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
-      end
+      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
+        [agent, SiteSetting.slow_down_crawler_rate]
+      }
 
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
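
The controller now only declares the path list; the prefixing happens in the view by concatenating Discourse.base_uri onto each entry. A minimal standalone sketch of that logic (plain Ruby, with a stub base_uri argument in place of the real Discourse method):

    # Sketch: build Disallow rules relative to an optional subfolder prefix.
    DISALLOWED_PATHS = %w{/admin /search /u/}  # abridged from the real constant

    def disallow_rules(base_uri)
      DISALLOWED_PATHS.map { |path| "Disallow: #{base_uri}#{path}" }
    end

    disallow_rules("")        #=> ["Disallow: /admin", "Disallow: /search", "Disallow: /u/"]
    disallow_rules("/forum")  #=> ["Disallow: /forum/admin", "Disallow: /forum/search", "Disallow: /forum/u/"]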

View File

@@ -3,40 +3,14 @@
 <% @allowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
 <% end %>
-Disallow: /auth/cas
-Disallow: /auth/facebook/callback
-Disallow: /auth/twitter/callback
-Disallow: /auth/google/callback
-Disallow: /auth/yahoo/callback
-Disallow: /auth/github/callback
-Disallow: /auth/cas/callback
-Disallow: /assets/browser-update*.js
-Disallow: /users/
-Disallow: /u/
-Disallow: /badges/
-Disallow: /search
-Disallow: /search/
-Disallow: /tags
-Disallow: /tags/
-Disallow: /email/
-Disallow: /session
-Disallow: /session/
-Disallow: /admin
-Disallow: /admin/
-Disallow: /user-api-key
-Disallow: /user-api-key/
-Disallow: /*?api_key*
-Disallow: /*?*api_key*
-Disallow: /groups
-Disallow: /groups/
-Disallow: /t/*/*.rss
-Disallow: /tags/*.rss
-Disallow: /c/*.rss
+<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
+Disallow: <%= Discourse.base_uri + path %>
+<% end %>
 <% if @disallowed_user_agents %>
 <% @disallowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
 <% end %>
 <% end %>
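
With a base URI of /forum, the loop above expands each constant entry into a prefixed rule, e.g. (an abridged sketch of the rendered output):

    Disallow: /forum/auth/cas
    Disallow: /forum/assets/browser-update*.js
    Disallow: /forum/t/*/*.rss

On a root install Discourse.base_uri is empty, so the rendered rules stay identical to the old hardcoded list.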

View File

@@ -1,4 +1,4 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
 User-agent: *
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
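
Under the same /forum base URI the no-index template now renders (sketch):

    User-agent: *
    Disallow: /forum/

which blocks only the Discourse subfolder rather than everything else the host serves.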

View File

@@ -3,6 +3,14 @@ require 'rails_helper'
 RSpec.describe RobotsTxtController do
   describe '#index' do
+    context 'subfolder' do
+      it 'prefixes the rules with the directory' do
+        Discourse.stubs(:base_uri).returns('/forum')
+        get '/robots.txt'
+        expect(response.body).to include("\nDisallow: /forum/admin")
+      end
+    end
+
     context 'crawl delay' do
       it 'allows you to set crawl delay on particular bots' do
         SiteSetting.allow_index_in_robots_txt = true
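
To verify locally, the new example can be run on its own; a hedged sketch, assuming the spec file sits at the conventional requests-spec path:

    bundle exec rspec spec/requests/robots_txt_controller_spec.rb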