From df7970a6f6e0d21b7e5514b7174896d6d6dafe54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Hanol?= <regis@hanol.fr>
Date: Wed, 11 Apr 2018 22:05:02 +0200
Subject: [PATCH] prefix the robots.txt rules with the directory when using
 subfolder

---
 app/controllers/robots_txt_controller.rb    | 40 ++++++++++++++++++---
 app/views/robots_txt/index.erb              | 34 +++---------------
 app/views/robots_txt/no_index.erb           |  2 +-
 spec/requests/robots_txt_controller_spec.rb |  8 +++++
 4 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index fe838b885b4..b5dfdb3e626 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -2,14 +2,46 @@ class RobotsTxtController < ApplicationController
   layout false
   skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required
 
+  # NOTE: order is important!
+  DISALLOWED_PATHS ||= %w{
+    /auth/cas
+    /auth/facebook/callback
+    /auth/twitter/callback
+    /auth/google/callback
+    /auth/yahoo/callback
+    /auth/github/callback
+    /auth/cas/callback
+    /assets/browser-update*.js
+    /users/
+    /u/
+    /badges/
+    /search
+    /search/
+    /tags
+    /tags/
+    /email/
+    /session
+    /session/
+    /admin
+    /admin/
+    /user-api-key
+    /user-api-key/
+    /*?api_key*
+    /*?*api_key*
+    /groups
+    /groups/
+    /t/*/*.rss
+    /tags/*.rss
+    /c/*.rss
+  }
+
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
 
-      @crawler_delayed_agents = []
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
-      end
+      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
+        [agent, SiteSetting.slow_down_crawler_rate]
+      }
 
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index daf5d1758cf..f81e8128ce8 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -3,40 +3,14 @@
 <% @allowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
 <% end %>
-Disallow: /auth/cas
-Disallow: /auth/facebook/callback
-Disallow: /auth/twitter/callback
-Disallow: /auth/google/callback
-Disallow: /auth/yahoo/callback
-Disallow: /auth/github/callback
-Disallow: /auth/cas/callback
-Disallow: /assets/browser-update*.js
-Disallow: /users/
-Disallow: /u/
-Disallow: /badges/
-Disallow: /search
-Disallow: /search/
-Disallow: /tags
-Disallow: /tags/
-Disallow: /email/
-Disallow: /session
-Disallow: /session/
-Disallow: /admin
-Disallow: /admin/
-Disallow: /user-api-key
-Disallow: /user-api-key/
-Disallow: /*?api_key*
-Disallow: /*?*api_key*
-Disallow: /groups
-Disallow: /groups/
-Disallow: /t/*/*.rss
-Disallow: /tags/*.rss
-Disallow: /c/*.rss
+<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
+Disallow: <%= Discourse.base_uri + path %>
+<% end %>
 
 <% if @disallowed_user_agents %>
 <% @disallowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
 <% end %>
 <% end %>
 
diff --git a/app/views/robots_txt/no_index.erb b/app/views/robots_txt/no_index.erb
index 09e665facdf..7697afcf260 100644
--- a/app/views/robots_txt/no_index.erb
+++ b/app/views/robots_txt/no_index.erb
@@ -1,4 +1,4 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
 User-agent: *
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index fa5c62e3205..87232518cb6 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -3,6 +3,14 @@ require 'rails_helper'
 RSpec.describe RobotsTxtController do
   describe '#index' do
 
+    context 'subfolder' do
+      it 'prefixes the rules with the directory' do
+        Discourse.stubs(:base_uri).returns('/forum')
+        get '/robots.txt'
+        expect(response.body).to include("\nDisallow: /forum/admin")
+      end
+    end
+
     context 'crawl delay' do
       it 'allows you to set crawl delay on particular bots' do
         SiteSetting.allow_index_in_robots_txt = true
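
Illustration (not part of the patch): a minimal Ruby sketch of the prefixing
behaviour introduced above, assuming Discourse.base_uri returns "/forum" on a
subfolder install and "" on a root install. The path list is a small subset of
DISALLOWED_PATHS from the controller; the local variables are hypothetical
stand-ins for the real controller/template wiring.

    # Hypothetical stand-in for what Discourse.base_uri returns on a subfolder install.
    base_uri = "/forum"

    # A subset of RobotsTxtController::DISALLOWED_PATHS (see the diff above).
    paths = %w{/auth/cas /users/ /admin /t/*/*.rss}

    # Same per-path prefixing the index.erb template now performs.
    rules = paths.map { |path| "Disallow: #{base_uri + path}" }

    puts rules
    # Disallow: /forum/auth/cas
    # Disallow: /forum/users/
    # Disallow: /forum/admin
    # Disallow: /forum/t/*/*.rss

On a root install base_uri is an empty string, so the emitted rules remain
"Disallow: /auth/cas" and so on, unchanged for non-subfolder sites.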