Mirror of https://github.com/discourse/discourse.git
Commit 038a391cfe
Follow-up to https://github.com/discourse/discourse/pull/27712. We already send a `noindex` header on /u routes, but because robots.txt blocks those routes, search engines never get to see it. This commit removes /u from our robots.txt so crawlers can reach the routes and honor the header.

It also includes a migration that removes /u for admins who have overridden the file. I had considered leaving the migration out, but existing site admins are trying to clear these errors from their dashboards, so they will likely welcome the change. The migration only replaces the overridden text where the stock disallow block still appears; it does not modify the setting if additional routes have been added in between.

Side-effect note: this may temporarily increase pageviews from crawlers such as Googlebot as they start visiting the user routes they were previously denied.
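For illustration only, here is a minimal sketch of what such a data migration could look like. It assumes the customized file is stored in the `overridden_robots_txt` site setting (which is what the controller below reads) and that the stock rule appears as a `Disallow: /u/` line; the class name, Rails version tag, and matching logic are hypothetical and simpler than the actual migration, which only rewrites the original block of rules:

```ruby
# frozen_string_literal: true

# Hypothetical sketch, not the migration shipped in this commit: strip a stock
# "Disallow: /u/" line from a customized robots.txt stored in site settings.
class RemoveUserRouteFromOverriddenRobotsTxt < ActiveRecord::Migration[7.1]
  def up
    # Only the overridden_robots_txt setting name is taken from the source; the
    # replacement assumes the rule appears exactly as "Disallow: /u/" followed by
    # a newline, and uses a PostgreSQL E'' escape string for that newline.
    execute <<~'SQL'
      UPDATE site_settings
      SET value = REPLACE(value, E'Disallow: /u/\n', '')
      WHERE name = 'overridden_robots_txt'
    SQL
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
```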
# frozen_string_literal: true

class RobotsTxtController < ApplicationController
  layout false
  skip_before_action :preload_json,
                     :check_xhr,
                     :redirect_to_login_if_required,
                     :redirect_to_profile_if_required

  OVERRIDDEN_HEADER = "# This robots.txt file has been customized at /admin/customize/robots\n"

  # NOTE: order is important!
  DISALLOWED_PATHS = %w[
    /admin/
    /auth/
    /assets/browser-update*.js
    /email/
    /session
    /user-api-key
    /*?api_key*
    /*?*api_key*
  ]

  DISALLOWED_WITH_HEADER_PATHS = %w[/badges /my /search /tag/*/l /g /t/*/*.rss /c/*.rss]

  # Serves the admin-customized robots.txt when one exists; otherwise renders the
  # default file, or a "no index" file when indexing is disabled for the site.
  def index
    if (overridden = SiteSetting.overridden_robots_txt.dup).present?
      overridden.prepend(OVERRIDDEN_HEADER) if guardian.is_admin? && !is_api?
      render plain: overridden
      return
    end
    if SiteSetting.allow_index_in_robots_txt?
      @robots_info = self.class.fetch_default_robots_info
      render :index, content_type: "text/plain"
    else
      render :no_index, content_type: "text/plain"
    end
  end
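
  # Illustrative example (not output captured from a real site): with no allowed or
  # blocked crawler settings and a site served at the root path, the index view turns
  # @robots_info into something roughly like:
  #
  #   # See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
  #   User-agent: *
  #   Disallow: /admin/
  #   Disallow: /auth/
  #   ...
  #   Disallow: /badges
  #   Disallow: /my
  #
  #   User-agent: Googlebot
  #   Disallow: /admin/
  #   ...
  #
  # The exact formatting depends on the robots_txt index view template, which is not
  # shown in this file.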

  # If you are hosting Discourse in a subfolder, you will need to create your robots.txt
  # in the root of your web server with the appropriate paths. This method will return
  # JSON that can be used by a script to create a robots.txt that works well with your
  # existing site.
  def builder
    result = self.class.fetch_default_robots_info
    overridden = SiteSetting.overridden_robots_txt
    result[:overridden] = overridden if overridden.present?
    render json: result
  end
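
  # Illustrative shape of the JSON returned by #builder for a site hosted under a
  # "/forum" subfolder (the subfolder prefix and values are assumptions, not output
  # from a real site):
  #
  #   {
  #     "header": "# See http://www.robotstxt.org/robotstxt.html for documentation ...",
  #     "agents": [
  #       { "name": "*", "disallow": ["/forum/admin/", "/forum/auth/", "..."] },
  #       { "name": "Googlebot", "disallow": ["/forum/admin/", "..."] }
  #     ],
  #     "overridden": "..."   # present only when an admin has customized the file
  #   }
  #
  # An external script can turn each agent entry into a "User-agent:" block in the
  # root robots.txt of the parent site.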

  # Builds the data used by both #index and #builder: the default header plus a
  # per-crawler list of disallowed paths, honoring the allowed/blocked crawler
  # site settings. Plugins can adjust the result via the :robots_info event.
  def self.fetch_default_robots_info
    deny_paths_googlebot = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
    deny_paths =
      deny_paths_googlebot + DISALLOWED_WITH_HEADER_PATHS.map { |p| Discourse.base_path + p }
    deny_all = ["#{Discourse.base_path}/"]

    result = {
      header:
        "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
      agents: [],
    }

    if SiteSetting.allowed_crawler_user_agents.present?
      SiteSetting
        .allowed_crawler_user_agents
        .split("|")
        .each do |agent|
          paths = agent == "Googlebot" ? deny_paths_googlebot : deny_paths
          result[:agents] << { name: agent, disallow: paths }
        end

      result[:agents] << { name: "*", disallow: deny_all }
    else
      if SiteSetting.blocked_crawler_user_agents.present?
        SiteSetting
          .blocked_crawler_user_agents
          .split("|")
          .each { |agent| result[:agents] << { name: agent, disallow: deny_all } }
      end

      result[:agents] << { name: "*", disallow: deny_paths }

      result[:agents] << { name: "Googlebot", disallow: deny_paths_googlebot }
    end

    DiscourseEvent.trigger(:robots_info, result)

    result
  end
end