discourse/app/controllers/robots_txt_controller.rb
Natalie Tay 038a391cfe
FIX: Remove /u/ from robots (#30782)
Follow up from https://github.com/discourse/discourse/pull/27712.

Currently, we already add a `noindex` header to /u routes. However, because
robots.txt blocks those routes, search engines are not able to see the header.

This commit removes /u from our robots.txt so that search engines can see the
header. It also includes a migration that removes the /u entry for sites where
admins have overridden the file. I had contemplated not including this
migration, but since there are existing site admins trying to clear errors
from their dashboards, they would probably welcome this change.
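
Once robots.txt no longer blocks these routes, crawlers can actually observe the `noindex` signal. A quick spot-check from a console (a sketch only; it assumes the signal is sent as an `X-Robots-Tag` response header, and the host and username below are placeholders):

```ruby
require "net/http"

# Fetch a user profile page and inspect the robots directive on the response.
# Host and username are placeholders; adjust them for your site.
res = Net::HTTP.get_response(URI("https://forum.example.com/u/some-user"))
puts res["x-robots-tag"] # expected to include "noindex"
```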

The migration replaces the overridden text in that block (the screenshot attached to the PR shows the exact region) and will not modify it if additional routes have been added in between; a sketch of that approach follows.
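
The "replace the whole block" approach could look roughly like this. It is an illustrative sketch, not the migration shipped in the commit; the class name, Rails version, and the literal before/after blocks are placeholders, whereas the real migration operates on the exact default text from before and after this change.

```ruby
# frozen_string_literal: true

class RemoveUFromOverriddenRobotsTxt < ActiveRecord::Migration[7.1]
  # Placeholder blocks; the real migration uses the full default disallow
  # lists as they existed before and after this change.
  OLD_BLOCK = "Disallow: /u/\nDisallow: /my/\nDisallow: /search\n"
  NEW_BLOCK = "Disallow: /my/\nDisallow: /search\n"

  def up
    # REPLACE only matches the contiguous default block, so overrides where an
    # admin inserted extra routes in between are left untouched.
    execute(<<~SQL)
      UPDATE site_settings
      SET value = REPLACE(value, #{connection.quote(OLD_BLOCK)}, #{connection.quote(NEW_BLOCK)})
      WHERE name = 'overridden_robots_txt'
    SQL
  end

  def down
    # Data-only change; nothing sensible to restore.
  end
end
```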


Side effect note: this may result in more pageviews from Googlebot (for
example) for a period of time, as Google starts visiting the user routes it
was previously denied.
2025-01-15 13:42:08 +08:00


# frozen_string_literal: true

class RobotsTxtController < ApplicationController
  layout false
  skip_before_action :preload_json,
                     :check_xhr,
                     :redirect_to_login_if_required,
                     :redirect_to_profile_if_required

  OVERRIDDEN_HEADER = "# This robots.txt file has been customized at /admin/customize/robots\n"

  # NOTE: order is important!
  DISALLOWED_PATHS = %w[
    /admin/
    /auth/
    /assets/browser-update*.js
    /email/
    /session
    /user-api-key
    /*?api_key*
    /*?*api_key*
  ]

  # These routes already serve a noindex signal, so they stay crawlable for
  # Googlebot (which respects it) but remain disallowed for other crawlers.
  DISALLOWED_WITH_HEADER_PATHS = %w[/badges /my /search /tag/*/l /g /t/*/*.rss /c/*.rss]

  def index
    if (overridden = SiteSetting.overridden_robots_txt.dup).present?
      overridden.prepend(OVERRIDDEN_HEADER) if guardian.is_admin? && !is_api?
      render plain: overridden
      return
    end

    if SiteSetting.allow_index_in_robots_txt?
      @robots_info = self.class.fetch_default_robots_info
      render :index, content_type: "text/plain"
    else
      render :no_index, content_type: "text/plain"
    end
  end

  # If you are hosting Discourse in a subfolder, you will need to create your robots.txt
  # in the root of your web server with the appropriate paths. This method will return
  # JSON that can be used by a script to create a robots.txt that works well with your
  # existing site.
  def builder
    result = self.class.fetch_default_robots_info
    overridden = SiteSetting.overridden_robots_txt
    result[:overridden] = overridden if overridden.present?
    render json: result
  end

  def self.fetch_default_robots_info
    deny_paths_googlebot = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
    deny_paths =
      deny_paths_googlebot + DISALLOWED_WITH_HEADER_PATHS.map { |p| Discourse.base_path + p }
    deny_all = ["#{Discourse.base_path}/"]

    result = {
      header:
        "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
      agents: [],
    }

    if SiteSetting.allowed_crawler_user_agents.present?
      SiteSetting
        .allowed_crawler_user_agents
        .split("|")
        .each do |agent|
          paths = agent == "Googlebot" ? deny_paths_googlebot : deny_paths
          result[:agents] << { name: agent, disallow: paths }
        end

      result[:agents] << { name: "*", disallow: deny_all }
    else
      if SiteSetting.blocked_crawler_user_agents.present?
        SiteSetting
          .blocked_crawler_user_agents
          .split("|")
          .each { |agent| result[:agents] << { name: agent, disallow: deny_all } }
      end
      result[:agents] << { name: "*", disallow: deny_paths }
      result[:agents] << { name: "Googlebot", disallow: deny_paths_googlebot }
    end

    DiscourseEvent.trigger(:robots_info, result)
    result
  end
end
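
For a subfolder install, a small script along these lines could consume the builder output and write a robots.txt at the web root. This is a sketch under two assumptions: that the action above is routed at `/robots-builder.json`, and that the example host and output path suit your server. The JSON keys themselves mirror what `builder` and `fetch_default_robots_info` return.

```ruby
require "net/http"
require "json"

# Fetch the builder JSON from the forum (hosted under /forum in this example).
info = JSON.parse(Net::HTTP.get(URI("https://example.com/forum/robots-builder.json")))

# An admin-customized robots.txt, if present, is used verbatim; otherwise the
# default header and per-agent disallow lists are rendered into robots.txt form.
content =
  info["overridden"] ||
    begin
      lines = [info["header"]]
      info["agents"].each do |agent|
        lines << "" << "User-agent: #{agent["name"]}"
        agent["disallow"].each { |path| lines << "Disallow: #{path}" }
      end
      lines.join("\n") + "\n"
    end

File.write("/var/www/html/robots.txt", content)
```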