mirror of
https://github.com/discourse/discourse.git
synced 2025-01-18 16:52:45 +08:00
FIX: move crawler blocking to app controller
We need access to site settings in multisite, we do not have access yet if we attempt to get them in request tracker middleware
This commit is contained in:
parent
d1b21aa73b
commit
7f98ed69cd
|
@ -40,6 +40,7 @@ class ApplicationController < ActionController::Base
|
|||
end
|
||||
end
|
||||
|
||||
before_action :block_crawlers
|
||||
before_action :check_readonly_mode
|
||||
before_action :handle_theme
|
||||
before_action :set_current_user_for_logs
|
||||
|
@ -464,6 +465,19 @@ class ApplicationController < ActionController::Base
|
|||
|
||||
private
|
||||
|
||||
def block_crawlers
|
||||
if (
|
||||
request.get? &&
|
||||
!request.xhr? &&
|
||||
!request.path.ends_with?('robots.txt') &&
|
||||
CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
|
||||
)
|
||||
|
||||
request.env["discourse.request_tracker.skip"] = true
|
||||
raise Discourse::InvalidAccess, 'Crawler not allowed'
|
||||
end
|
||||
end
|
||||
|
||||
def check_readonly_mode
|
||||
@readonly_mode = Discourse.readonly_mode?
|
||||
end
|
||||
|
|
|
@ -175,12 +175,6 @@ class Middleware::RequestTracker
|
|||
return result
|
||||
end
|
||||
|
||||
# if block_crawler(request)
|
||||
# log_request = false
|
||||
# result = [403, { 'Content-Type' => 'text/plain' }, ["Crawler is not allowed."]]
|
||||
# return result
|
||||
# end
|
||||
|
||||
env["discourse.request_tracker"] = self
|
||||
MethodProfiler.start
|
||||
result = @app.call(env)
|
||||
|
@ -287,13 +281,6 @@ class Middleware::RequestTracker
|
|||
end
|
||||
end
|
||||
|
||||
def block_crawler(request)
|
||||
request.get? &&
|
||||
!request.xhr? &&
|
||||
!request.path.ends_with?('robots.txt') &&
|
||||
CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
|
||||
end
|
||||
|
||||
def log_later(data, host)
|
||||
Scheduler::Defer.later("Track view", _db = nil) do
|
||||
self.class.log_request_on_site(data, host)
|
||||
|
|
|
@ -32,4 +32,72 @@ RSpec.describe ApplicationController do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
context "crawler blocking" do
|
||||
let :non_crawler do
|
||||
{
|
||||
"HTTP_USER_AGENT" =>
|
||||
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
||||
}
|
||||
end
|
||||
it "applies whitelisted_crawler_user_agents correctly" do
|
||||
SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
|
||||
|
||||
get '/srv/status', headers: {
|
||||
'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
|
||||
}
|
||||
|
||||
expect(response.status).to eq(200)
|
||||
|
||||
get '/srv/status', headers: {
|
||||
'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
|
||||
}
|
||||
|
||||
expect(response.status).to eq(403)
|
||||
|
||||
get '/srv/status', headers: non_crawler
|
||||
expect(response.status).to eq(200)
|
||||
end
|
||||
|
||||
it "applies blacklisted_crawler_user_agents correctly" do
|
||||
SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
|
||||
|
||||
get '/srv/status', headers: non_crawler
|
||||
expect(response.status).to eq(200)
|
||||
|
||||
get '/srv/status', headers: {
|
||||
'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
|
||||
}
|
||||
|
||||
expect(response.status).to eq(403)
|
||||
|
||||
get '/srv/status', headers: {
|
||||
'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
|
||||
}
|
||||
|
||||
expect(response.status).to eq(200)
|
||||
end
|
||||
|
||||
it "blocked crawlers shouldn't log page views" do
|
||||
ApplicationRequest.clear_cache!
|
||||
SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
|
||||
expect {
|
||||
get '/srv/status', headers: {
|
||||
'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
|
||||
}
|
||||
ApplicationRequest.write_cache!
|
||||
}.to_not change { ApplicationRequest.count }
|
||||
end
|
||||
|
||||
it "blocks json requests" do
|
||||
SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
|
||||
|
||||
get '/srv/status.json', headers: {
|
||||
'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
|
||||
}
|
||||
|
||||
expect(response.status).to eq(403)
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue
Block a user