FIX: move crawler blocking to app controller

We need access to site settings in multisite; they are not available yet
when we try to read them in the request tracker middleware.
Sam 2018-07-04 10:07:14 +10:00
parent d1b21aa73b
commit 7f98ed69cd
3 changed files with 82 additions and 13 deletions
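
The ordering problem, roughly: Middleware::RequestTracker sits very early in the Rack stack, before the multisite machinery has picked the per-hostname database, so any site setting read there would not be scoped to the requesting site. A simplified sketch of that ordering (illustrative, not the literal Discourse configuration; it assumes RailsMultisite::ConnectionManagement is the piece that selects the connection):

# Illustrative Rack stack ordering, not the literal Discourse config.
# RequestTracker runs before the multisite middleware has chosen the per-host
# database, so SiteSetting lookups made inside it cannot be trusted; by the time
# a controller before_action runs, the right connection is established.
use Middleware::RequestTracker              # runs first: no per-site settings yet
use RailsMultisite::ConnectionManagement    # assumption: selects the per-host DB here
run Rails.application                       # controllers run last, settings available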


@@ -40,6 +40,7 @@ class ApplicationController < ActionController::Base
    end
  end

  before_action :block_crawlers
  before_action :check_readonly_mode
  before_action :handle_theme
  before_action :set_current_user_for_logs
@@ -464,6 +465,19 @@ class ApplicationController < ActionController::Base
  private

  def block_crawlers
    if (
      request.get? &&
      !request.xhr? &&
      !request.path.ends_with?('robots.txt') &&
      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
    )
      request.env["discourse.request_tracker.skip"] = true
      raise Discourse::InvalidAccess, 'Crawler not allowed'
    end
  end

  def check_readonly_mode
    @readonly_mode = Discourse.readonly_mode?
  end
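
Raising Discourse::InvalidAccess rather than rendering directly lets the controller reuse the application's existing error handling; the specs below only rely on the resulting 403. A minimal sketch of that kind of handler (assumed shape, not the exact Discourse rescue path):

# Minimal sketch of the assumed rescue path, not the exact Discourse handler.
# Raising Discourse::InvalidAccess aborts the filter chain and is rendered as a
# 403, which is what the crawler specs below assert against.
class ApplicationController < ActionController::Base
  rescue_from Discourse::InvalidAccess do |e|
    render plain: e.message, status: 403
  end
end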


@@ -175,12 +175,6 @@ class Middleware::RequestTracker
      return result
    end

    # if block_crawler(request)
    #   log_request = false
    #   result = [403, { 'Content-Type' => 'text/plain' }, ["Crawler is not allowed."]]
    #   return result
    # end

    env["discourse.request_tracker"] = self
    MethodProfiler.start
    result = @app.call(env)
@@ -287,13 +281,6 @@ class Middleware::RequestTracker
    end
  end

  def block_crawler(request)
    request.get? &&
    !request.xhr? &&
    !request.path.ends_with?('robots.txt') &&
    CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
  end

  def log_later(data, host)
    Scheduler::Defer.later("Track view", _db = nil) do
      self.class.log_request_on_site(data, host)
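
With the crawler check gone from the middleware, the only coupling left is the env flag the controller sets, which is what the "blocked crawlers shouldn't log page views" spec below exercises. A minimal sketch of that handshake, under the assumption that the tracker consults the flag before counting a page view (the real RequestTracker logging path is deferred and more involved):

# Minimal sketch of the env-flag handshake; assumes the tracker checks
# "discourse.request_tracker.skip" before counting a page view.
module Middleware
  class RequestTracker
    def initialize(app)
      @app = app
    end

    def call(env)
      result = @app.call(env) # ApplicationController#block_crawlers may set the flag here

      # a request flagged as a blocked crawler is never counted as a page view
      track_view(env) unless env["discourse.request_tracker.skip"]

      result
    end

    def track_view(env)
      # hypothetical stand-in for the real deferred request logging
    end
  end
end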


@@ -32,4 +32,72 @@ RSpec.describe ApplicationController do
      end
    end
  end

  context "crawler blocking" do
    let :non_crawler do
      {
        "HTTP_USER_AGENT" =>
          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
      }
    end

    it "applies whitelisted_crawler_user_agents correctly" do
      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
      }
      expect(response.status).to eq(200)

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
      }
      expect(response.status).to eq(403)

      get '/srv/status', headers: non_crawler
      expect(response.status).to eq(200)
    end

    it "applies blacklisted_crawler_user_agents correctly" do
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

      get '/srv/status', headers: non_crawler
      expect(response.status).to eq(200)

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
      }
      expect(response.status).to eq(403)

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
      }
      expect(response.status).to eq(200)
    end

    it "blocked crawlers shouldn't log page views" do
      ApplicationRequest.clear_cache!
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

      expect {
        get '/srv/status', headers: {
          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
        }
        ApplicationRequest.write_cache!
      }.to_not change { ApplicationRequest.count }
    end

    it "blocks json requests" do
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

      get '/srv/status.json', headers: {
        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
      }
      expect(response.status).to eq(403)
    end
  end
end
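
The same behaviour can be exercised directly from a console; a hedged sketch mirroring the assertions above (return values are inferred from the specs, not re-verified):

# Console sketch mirroring the specs above; results inferred from the assertions.
SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

CrawlerDetection.is_blocked_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')
# => true  (such a request is rejected with a 403 by block_crawlers)

CrawlerDetection.is_blocked_crawler?('Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)')
# => false (not blacklisted, so the request is served normally)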