From df7970a6f6e0d21b7e5514b7174896d6d6dafe54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Hanol?= <regis@hanol.fr>
Date: Wed, 11 Apr 2018 22:05:02 +0200
Subject: [PATCH] prefix the robots.txt rules with the directory when using a
 subfolder

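Discourse can be served from a subfolder (Discourse.base_uri), but the
Disallow rules in robots.txt were hardcoded to root-level paths, so on
subfolder installs crawlers received directives that did not match any
actual route. Move the list of paths into a DISALLOWED_PATHS constant on
the controller and prefix every rule with Discourse.base_uri when
rendering the index and no_index templates.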
---
 app/controllers/robots_txt_controller.rb    | 40 ++++++++++++++++++---
 app/views/robots_txt/index.erb              | 34 +++---------------
 app/views/robots_txt/no_index.erb           |  2 +-
 spec/requests/robots_txt_controller_spec.rb |  8 +++++
 4 files changed, 49 insertions(+), 35 deletions(-)
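Notes:
    Discourse.base_uri returns the subfolder prefix on subfolder installs
    and an empty string on root installs, so root installs keep serving
    exactly the same rules as before. Assuming a forum served from
    /forum, the rendered robots.txt should now come out roughly like this
    sketch (abridged, not verbatim output):

        User-agent: *
        Disallow: /forum/auth/cas
        Disallow: /forum/auth/facebook/callback
        ...
        Disallow: /forum/admin
        Disallow: /forum/c/*.rss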

diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index fe838b885b4..b5dfdb3e626 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -2,14 +2,46 @@ class RobotsTxtController < ApplicationController
   layout false
   skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required
 
+  # NOTE: order is important!
+  DISALLOWED_PATHS ||= %w{
+    /auth/cas
+    /auth/facebook/callback
+    /auth/twitter/callback
+    /auth/google/callback
+    /auth/yahoo/callback
+    /auth/github/callback
+    /auth/cas/callback
+    /assets/browser-update*.js
+    /users/
+    /u/
+    /badges/
+    /search
+    /search/
+    /tags
+    /tags/
+    /email/
+    /session
+    /session/
+    /admin
+    /admin/
+    /user-api-key
+    /user-api-key/
+    /*?api_key*
+    /*?*api_key*
+    /groups
+    /groups/
+    /t/*/*.rss
+    /tags/*.rss
+    /c/*.rss
+  }
+
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
-      @crawler_delayed_agents = []
 
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
-      end
+      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
+        [agent, SiteSetting.slow_down_crawler_rate]
+      }
 
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index daf5d1758cf..f81e8128ce8 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -3,40 +3,14 @@
 <% @allowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
 <% end %>
-Disallow: /auth/cas
-Disallow: /auth/facebook/callback
-Disallow: /auth/twitter/callback
-Disallow: /auth/google/callback
-Disallow: /auth/yahoo/callback
-Disallow: /auth/github/callback
-Disallow: /auth/cas/callback
-Disallow: /assets/browser-update*.js
-Disallow: /users/
-Disallow: /u/
-Disallow: /badges/
-Disallow: /search
-Disallow: /search/
-Disallow: /tags
-Disallow: /tags/
-Disallow: /email/
-Disallow: /session
-Disallow: /session/
-Disallow: /admin
-Disallow: /admin/
-Disallow: /user-api-key
-Disallow: /user-api-key/
-Disallow: /*?api_key*
-Disallow: /*?*api_key*
-Disallow: /groups
-Disallow: /groups/
-Disallow: /t/*/*.rss
-Disallow: /tags/*.rss
-Disallow: /c/*.rss
+<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
+Disallow: <%= Discourse.base_uri + path %>
+<% end %>
 
 <% if @disallowed_user_agents %>
   <% @disallowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
 
   <% end %>
 <% end %>
diff --git a/app/views/robots_txt/no_index.erb b/app/views/robots_txt/no_index.erb
index 09e665facdf..7697afcf260 100644
--- a/app/views/robots_txt/no_index.erb
+++ b/app/views/robots_txt/no_index.erb
@@ -1,4 +1,4 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
 User-agent: *
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index fa5c62e3205..87232518cb6 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -3,6 +3,14 @@ require 'rails_helper'
 RSpec.describe RobotsTxtController do
   describe '#index' do
 
+    context 'subfolder' do
+      it 'prefixes the rules with the directory' do
+        Discourse.stubs(:base_uri).returns('/forum')
+        get '/robots.txt'
+        expect(response.body).to include("\nDisallow: /forum/admin")
+      end
+    end
+
     context 'crawl delay' do
       it 'allows you to set crawl delay on particular bots' do
         SiteSetting.allow_index_in_robots_txt = true