FIX: move crawler blocking to app controller

We need access to site settings in multisite; they are not available yet
when we try to read them in the request tracker middleware.
Sam 2018-07-04 10:07:14 +10:00
parent d1b21aa73b
commit 7f98ed69cd
3 changed files with 82 additions and 13 deletions
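
The ordering problem, roughly: Middleware::RequestTracker sits very early in the Rack stack, before the multisite machinery has picked the per-hostname database, so any site setting read there would not be scoped to the requesting site. A simplified sketch of that ordering (illustrative, not the literal Discourse configuration; it assumes RailsMultisite::ConnectionManagement is the piece that selects the connection):

# Illustrative Rack stack ordering, not the literal Discourse config.
# RequestTracker runs before the multisite middleware has chosen the per-host
# database, so SiteSetting lookups made inside it cannot be trusted; by the time
# a controller before_action runs, the right connection is established.
use Middleware::RequestTracker              # runs first: no per-site settings yet
use RailsMultisite::ConnectionManagement    # assumption: selects the per-host DB here
run Rails.application                       # controllers run last, settings available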


@@ -40,6 +40,7 @@ class ApplicationController < ActionController::Base
    end
  end

  before_action :block_crawlers
  before_action :check_readonly_mode
  before_action :handle_theme
  before_action :set_current_user_for_logs
@@ -464,6 +465,19 @@ class ApplicationController < ActionController::Base
  private

  def block_crawlers
    if (
      request.get? &&
      !request.xhr? &&
      !request.path.ends_with?('robots.txt') &&
      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
    )
      request.env["discourse.request_tracker.skip"] = true
      raise Discourse::InvalidAccess, 'Crawler not allowed'
    end
  end

  def check_readonly_mode
    @readonly_mode = Discourse.readonly_mode?
  end
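
Raising Discourse::InvalidAccess rather than rendering directly lets the controller reuse the application's existing error handling; the specs below only rely on the resulting 403. A minimal sketch of that kind of handler (assumed shape, not the exact Discourse rescue path):

# Minimal sketch of the assumed rescue path, not the exact Discourse handler.
# Raising Discourse::InvalidAccess aborts the filter chain and is rendered as a
# 403, which is what the crawler specs below assert against.
class ApplicationController < ActionController::Base
  rescue_from Discourse::InvalidAccess do |e|
    render plain: e.message, status: 403
  end
end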


@@ -175,12 +175,6 @@ class Middleware::RequestTracker
      return result
    end

    # if block_crawler(request)
    #   log_request = false
    #   result = [403, { 'Content-Type' => 'text/plain' }, ["Crawler is not allowed."]]
    #   return result
    # end

    env["discourse.request_tracker"] = self
    MethodProfiler.start
    result = @app.call(env)
@@ -287,13 +281,6 @@ class Middleware::RequestTracker
    end
  end

  def block_crawler(request)
    request.get? &&
    !request.xhr? &&
    !request.path.ends_with?('robots.txt') &&
    CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
  end

  def log_later(data, host)
    Scheduler::Defer.later("Track view", _db = nil) do
      self.class.log_request_on_site(data, host)
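
With the crawler check gone from the middleware, the only coupling left is the env flag the controller sets, which is what the "blocked crawlers shouldn't log page views" spec below exercises. A minimal sketch of that handshake, under the assumption that the tracker consults the flag before counting a page view (the real RequestTracker logging path is deferred and more involved):

# Minimal sketch of the env-flag handshake; assumes the tracker checks
# "discourse.request_tracker.skip" before counting a page view.
module Middleware
  class RequestTracker
    def initialize(app)
      @app = app
    end

    def call(env)
      result = @app.call(env) # ApplicationController#block_crawlers may set the flag here

      # a request flagged as a blocked crawler is never counted as a page view
      track_view(env) unless env["discourse.request_tracker.skip"]

      result
    end

    def track_view(env)
      # hypothetical stand-in for the real deferred request logging
    end
  end
end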


@@ -32,4 +32,72 @@ RSpec.describe ApplicationController do
      end
    end
  end

  context "crawler blocking" do
    let :non_crawler do
      {
        "HTTP_USER_AGENT" =>
          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
      }
    end

    it "applies whitelisted_crawler_user_agents correctly" do
      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
      }
      expect(response.status).to eq(200)

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
      }
      expect(response.status).to eq(403)

      get '/srv/status', headers: non_crawler
      expect(response.status).to eq(200)
    end

    it "applies blacklisted_crawler_user_agents correctly" do
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

      get '/srv/status', headers: non_crawler
      expect(response.status).to eq(200)

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
      }
      expect(response.status).to eq(403)

      get '/srv/status', headers: {
        'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
      }
      expect(response.status).to eq(200)
    end

    it "blocked crawlers shouldn't log page views" do
      ApplicationRequest.clear_cache!
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

      expect {
        get '/srv/status', headers: {
          'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
        }
        ApplicationRequest.write_cache!
      }.to_not change { ApplicationRequest.count }
    end

    it "blocks json requests" do
      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

      get '/srv/status.json', headers: {
        'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
      }
      expect(response.status).to eq(403)
    end
  end
end
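
The same behaviour can be exercised directly from a console; a hedged sketch mirroring the assertions above (return values are inferred from the specs, not re-verified):

# Console sketch mirroring the specs above; results inferred from the assertions.
SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

CrawlerDetection.is_blocked_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')
# => true  (such a request is rejected with a 403 by block_crawlers)

CrawlerDetection.is_blocked_crawler?('Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)')
# => false (not blacklisted, so the request is served normally)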