FEATURE: control which web crawlers can access using a whitelist or blacklist

2024-11-26 01:53:39 +08:00 · 2018-03-15 17:10:45 -04:00 · 2018-03-15 17:10:45 -04:00 · ced7e9a691
commit ced7e9a691
parent cbbeedf53b
22 changed files with 722 additions and 97 deletions
--- a/app/assets/javascripts/admin/models/report.js.es6
+++ b/app/assets/javascripts/admin/models/report.js.es6
@ -2,6 +2,7 @@ import { ajax } from 'discourse/lib/ajax';
 import round from "discourse/lib/round";
 import { fmt } from 'discourse/lib/computed';
 import { fillMissingDates } from 'discourse/lib/utilities';
+import computed from 'ember-addons/ember-computed-decorators';

 const Report = Discourse.Model.extend({
  reportUrl: fmt("type", "/admin/reports/%@"),
@ -42,7 +43,8 @@ const Report = Discourse.Model.extend({
  lastSevenDaysCount:  function() { return this.valueFor(1, 7); }.property("data"),
  lastThirtyDaysCount: function() { return this.valueFor(1, 30); }.property("data"),

-  yesterdayTrend: function() {
+  @computed('data')
+  yesterdayTrend() {
    const yesterdayVal = this.valueAt(1);
    const twoDaysAgoVal = this.valueAt(2);
    if (yesterdayVal > twoDaysAgoVal) {
@ -52,9 +54,10 @@ const Report = Discourse.Model.extend({
    } else {
      return "no-change";
    }
-  }.property("data"),
+  },

-  sevenDayTrend: function() {
+  @computed('data')
+  sevenDayTrend() {
    const currentPeriod = this.valueFor(1, 7);
    const prevPeriod = this.valueFor(8, 14);
    if (currentPeriod > prevPeriod) {
@ -64,36 +67,39 @@ const Report = Discourse.Model.extend({
    } else {
      return "no-change";
    }
-  }.property("data"),
+  },

-  thirtyDayTrend: function() {
-    if (this.get("prev30Days")) {
+  @computed('prev30Days', 'data')
+  thirtyDayTrend(prev30Days) {
+    if (prev30Days) {
      const currentPeriod = this.valueFor(1, 30);
      if (currentPeriod > this.get("prev30Days")) {
        return "trending-up";
-      } else if (currentPeriod < this.get("prev30Days")) {
+      } else if (currentPeriod < prev30Days) {
        return "trending-down";
      }
    }
    return "no-change";
-  }.property("data", "prev30Days"),
+  },

-  icon: function() {
-    switch (this.get("type")) {
+  @computed('type')
+  icon(type) {
+    switch (type) {
      case "flags": return "flag";
      case "likes": return "heart";
      case "bookmarks": return "bookmark";
      default: return null;
    }
-  }.property("type"),
+  },

-  method: function() {
-    if (this.get("type") === "time_to_first_response") {
+  @computed('type')
+  method(type) {
+    if (type === "time_to_first_response") {
      return "average";
    } else {
      return "sum";
    }
-  }.property("type"),
+  },

  percentChangeString(val1, val2) {
    const val = ((val1 - val2) / val2) * 100;
@ -114,21 +120,31 @@ const Report = Discourse.Model.extend({
    return title;
  },

-  yesterdayCountTitle: function() {
+  @computed('data')
+  yesterdayCountTitle() {
    return this.changeTitle(this.valueAt(1), this.valueAt(2), "two days ago");
-  }.property("data"),
+  },

-  sevenDayCountTitle: function() {
+  @computed('data')
+  sevenDayCountTitle() {
    return this.changeTitle(this.valueFor(1, 7), this.valueFor(8, 14), "two weeks ago");
-  }.property("data"),
+  },

-  thirtyDayCountTitle: function() {
-    return this.changeTitle(this.valueFor(1, 30), this.get("prev30Days"), "in the previous 30 day period");
-  }.property("data"),
+  @computed('prev30Days', 'data')
+  thirtyDayCountTitle(prev30Days) {
+    return this.changeTitle(this.valueFor(1, 30), prev30Days, "in the previous 30 day period");
+  },

-  dataReversed: function() {
-    return this.get("data").toArray().reverse();
-  }.property("data")
+  @computed('data')
+  sortedData(data) {
+    return this.get('xaxisIsDate') ? data.toArray().reverse() : data.toArray();
+  },
+
+  @computed('data')
+  xaxisIsDate() {
+    if (!this.data[0]) return false;
+    return this.data && moment(this.data[0].x, 'YYYY-MM-DD').isValid();
+  }

 });

@ -152,6 +168,14 @@ Report.reopenClass({

      const model = Report.create({ type: type });
      model.setProperties(json.report);
+
+      if (json.report.related_report) {
+        // TODO: fillMissingDates if xaxis is date
+        const related = Report.create({ type: json.report.related_report.type });
+        related.setProperties(json.report.related_report);
+        model.set('relatedReport', related);
+      }
+
      return model;
    });
  }
--- a/app/assets/javascripts/admin/templates/components/admin-table-report.hbs
+++ b/app/assets/javascripts/admin/templates/components/admin-table-report.hbs
@ -0,0 +1,17 @@
+{{#if model.sortedData}}
+  <table class="table report {{model.type}}">
+    <tr>
+      <th>{{model.xaxis}}</th>
+      <th>{{model.yaxis}}</th>
+    </tr>
+
+    {{#each model.sortedData as |row|}}
+      <tr>
+        <td class="x-value">{{row.x}}</td>
+        <td>
+          {{row.y}}
+        </td>
+      </tr>
+    {{/each}}
+  </table>
+{{/if}}
--- a/app/assets/javascripts/admin/templates/reports.hbs
+++ b/app/assets/javascripts/admin/templates/reports.hbs
@ -31,20 +31,10 @@
  {{#if viewingGraph}}
    {{admin-graph model=model}}
  {{else}}
-  <table class='table report'>
-    <tr>
-      <th>{{model.xaxis}}</th>
-      <th>{{model.yaxis}}</th>
-    </tr>
+    {{admin-table-report model=model}}
+  {{/if}}

-    {{#each model.dataReversed as |row|}}
-      <tr>
-        <td>{{row.x}}</td>
-        <td>
-          {{row.y}}
-        </td>
-      </tr>
-    {{/each}}
-  </table>
+  {{#if model.relatedReport}}
+    {{admin-table-report model=model.relatedReport}}
  {{/if}}
 {{/conditional-loading-spinner}}
--- a/app/assets/stylesheets/common/admin/admin_base.scss
+++ b/app/assets/stylesheets/common/admin/admin_base.scss
@ -167,6 +167,20 @@ $mobile-breakpoint: 700px;
      }
    }

+    &.web_crawlers {
+      tr {
+        th:nth-of-type(1) {
+          width: 60%;
+        }
+      }
+      td.x-value {
+        max-width: 0;
+        overflow: hidden;
+        text-overflow: ellipsis;
+        white-space: nowrap;
+      }
+    }
+
    .bar-container {
      float: left;
      width: 300px;
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@ -3,7 +3,21 @@ class RobotsTxtController < ApplicationController
  skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required

  def index
-    path = SiteSetting.allow_index_in_robots_txt ? :index : :no_index
+    if SiteSetting.allow_index_in_robots_txt
+      path = :index
+      if SiteSetting.whitelisted_crawler_user_agents.present?
+        @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
+        @disallowed_user_agents = ['*']
+      elsif SiteSetting.blacklisted_crawler_user_agents.present?
+        @allowed_user_agents = ['*']
+        @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
+      else
+        @allowed_user_agents = ['*']
+      end
+    else
+      path = :no_index
+    end
+
    render path, content_type: 'text/plain'
  end
 end
--- a/app/jobs/scheduled/clean_up_crawler_stats.rb
+++ b/app/jobs/scheduled/clean_up_crawler_stats.rb
@ -0,0 +1,26 @@
+module Jobs
+
+  class CleanUpCrawlerStats < Jobs::Scheduled
+    every 1.day
+
+    def execute(args)
+      WebCrawlerRequest.where('date < ?', WebCrawlerRequest.max_record_age.ago).delete_all
+
+      # keep count of only the top user agents
+      WebCrawlerRequest.exec_sql <<~SQL
+        WITH ranked_requests AS (
+          SELECT row_number() OVER (ORDER BY count DESC) as row_number, id
+            FROM web_crawler_requests
+           WHERE date = '#{1.day.ago.strftime("%Y-%m-%d")}'
+        )
+        DELETE FROM web_crawler_requests
+        WHERE id IN (
+          SELECT ranked_requests.id
+            FROM ranked_requests
+           WHERE row_number > #{WebCrawlerRequest.max_records_per_day}
+        )
+      SQL
+    end
+  end
+
+end
--- a/app/models/application_request.rb
+++ b/app/models/application_request.rb
@ -1,5 +1,6 @@
 # frozen_string_literal: true
 class ApplicationRequest < ActiveRecord::Base
+
  enum req_type: %i(http_total
                    http_2xx
                    http_background
@ -12,41 +13,12 @@ class ApplicationRequest < ActiveRecord::Base
                    page_view_logged_in_mobile
                    page_view_anon_mobile)

-  cattr_accessor :autoflush, :autoflush_seconds, :last_flush
-  # auto flush if backlog is larger than this
-  self.autoflush = 2000
-
-  # auto flush if older than this
-  self.autoflush_seconds = 5.minutes
-  self.last_flush = Time.now.utc
+  include CachedCounting

  def self.increment!(type, opts = nil)
-    key = redis_key(type)
-    val = $redis.incr(key).to_i
-
-    # readonly mode it is going to be 0, skip
-    return if val == 0
-
-    # 3.days, see: https://github.com/rails/rails/issues/21296
-    $redis.expire(key, 259200)
-
-    autoflush = (opts && opts[:autoflush]) || self.autoflush
-    if autoflush > 0 && val >= autoflush
-      write_cache!
-      return
-    end
-
-    if (Time.now.utc - last_flush).to_i > autoflush_seconds
-      write_cache!
-    end
+    perform_increment!(redis_key(type), opts)
  end

-  GET_AND_RESET = <<~LUA
-    local val = redis.call('get', KEYS[1])
-    redis.call('set', KEYS[1], '0')
-    return val
-  LUA
-
  def self.write_cache!(date = nil)
    if date.nil?
      write_cache!(Time.now.utc)
@ -58,13 +30,9 @@ class ApplicationRequest < ActiveRecord::Base

    date = date.to_date

-    # this may seem a bit fancy but in so it allows
-    # for concurrent calls without double counting
    req_types.each do |req_type, _|
-      key = redis_key(req_type, date)
+      val = get_and_reset(redis_key(req_type, date))

-      namespaced_key = $redis.namespace_key(key)
-      val = $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i
      next if val == 0

      id = req_id(date, req_type)
--- a/app/models/concerns/cached_counting.rb
+++ b/app/models/concerns/cached_counting.rb
@ -0,0 +1,67 @@
+module CachedCounting
+  extend ActiveSupport::Concern
+
+  included do
+    class << self
+      attr_accessor :autoflush, :autoflush_seconds, :last_flush
+    end
+
+    # auto flush if backlog is larger than this
+    self.autoflush = 2000
+
+    # auto flush if older than this
+    self.autoflush_seconds = 5.minutes
+
+    self.last_flush = Time.now.utc
+  end
+
+  class_methods do
+    def perform_increment!(key, opts = nil)
+      val = $redis.incr(key).to_i
+
+      # readonly mode it is going to be 0, skip
+      return if val == 0
+
+      # 3.days, see: https://github.com/rails/rails/issues/21296
+      $redis.expire(key, 259200)
+
+      autoflush = (opts && opts[:autoflush]) || self.autoflush
+      if autoflush > 0 && val >= autoflush
+        write_cache!
+        return
+      end
+
+      if (Time.now.utc - last_flush).to_i > autoflush_seconds
+        write_cache!
+      end
+    end
+
+    def write_cache!(date = nil)
+      raise NotImplementedError
+    end
+
+    GET_AND_RESET = <<~LUA
+      local val = redis.call('get', KEYS[1])
+      redis.call('set', KEYS[1], '0')
+      return val
+    LUA
+
+    # this may seem a bit fancy but in so it allows
+    # for concurrent calls without double counting
+    def get_and_reset(key)
+      namespaced_key = $redis.namespace_key(key)
+      $redis.without_namespace.eval(GET_AND_RESET, keys: [namespaced_key]).to_i
+    end
+
+    def request_id(query_params, retries = 0)
+      id = where(query_params).pluck(:id).first
+      id ||= create!(query_params.merge(count: 0)).id
+    rescue # primary key violation
+      if retries == 0
+        request_id(query_params, 1)
+      else
+        raise
+      end
+    end
+  end
+end
--- a/app/models/report.rb
+++ b/app/models/report.rb
@ -27,7 +27,11 @@ class Report
     category_id: category_id,
     group_id: group_id,
     prev30Days: self.prev30Days
-    }
+    }.tap do |json|
+      if type == 'page_view_crawler_reqs'
+        json[:related_report] = Report.find('web_crawlers', start_date: start_date, end_date: end_date)&.as_json
+      end
+    end
  end

  def Report.add_report(name, &block)
@ -225,4 +229,12 @@ class Report
  def self.report_notify_user_private_messages(report)
    private_messages_report report, TopicSubtype.notify_user
  end
+
+  def self.report_web_crawlers(report)
+    report.data = WebCrawlerRequest.where('date >= ? and date <= ?', report.start_date, report.end_date)
+      .limit(200)
+      .order('sum_count DESC')
+      .group(:user_agent).sum(:count)
+      .map { |ua, count| { x: ua, y: count } }
+  end
 end
--- a/app/models/web_crawler_request.rb
+++ b/app/models/web_crawler_request.rb
@ -0,0 +1,76 @@
+class WebCrawlerRequest < ActiveRecord::Base
+  include CachedCounting
+
+  # auto flush if older than this
+  self.autoflush_seconds = 1.hour
+
+  cattr_accessor :max_record_age, :max_records_per_day
+
+  # only keep the top records based on request count
+  self.max_records_per_day = 200
+
+  # delete records older than this
+  self.max_record_age = 30.days
+
+  def self.increment!(user_agent, opts = nil)
+    ua_list_key = user_agent_list_key
+    $redis.sadd(ua_list_key, user_agent)
+    $redis.expire(ua_list_key, 259200) # 3.days
+
+    perform_increment!(redis_key(user_agent), opts)
+  end
+
+  def self.write_cache!(date = nil)
+    if date.nil?
+      write_cache!(Time.now.utc)
+      write_cache!(Time.now.utc.yesterday)
+      return
+    end
+
+    self.last_flush = Time.now.utc
+
+    date = date.to_date
+
+    $redis.smembers(user_agent_list_key(date)).each do |user_agent, _|
+
+      val = get_and_reset(redis_key(user_agent, date))
+
+      next if val == 0
+
+      self.where(id: req_id(date, user_agent)).update_all(["count = count + ?", val])
+    end
+  rescue Redis::CommandError => e
+    raise unless e.message =~ /READONLY/
+    nil
+  end
+
+  def self.clear_cache!(date = nil)
+    if date.nil?
+      clear_cache!(Time.now.utc)
+      clear_cache!(Time.now.utc.yesterday)
+      return
+    end
+
+    list_key = user_agent_list_key(date)
+
+    $redis.smembers(list_key).each do |user_agent, _|
+      $redis.del redis_key(user_agent, date)
+    end
+
+    $redis.del list_key
+  end
+
+  protected
+
+  def self.user_agent_list_key(time = Time.now.utc)
+    "crawl_ua_list:#{time.strftime('%Y%m%d')}"
+  end
+
+  def self.redis_key(user_agent, time = Time.now.utc)
+    "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}"
+  end
+
+  def self.req_id(date, user_agent)
+    request_id(date: date, user_agent: user_agent)
+  end
+end
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@ -1,6 +1,8 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
-User-agent: *
+<% @allowed_user_agents.each do |user_agent| %>
+User-agent: <%= user_agent %>
+<% end %>
 Disallow: /auth/cas
 Disallow: /auth/facebook/callback
 Disallow: /auth/twitter/callback
@ -29,4 +31,12 @@ Disallow: /groups
 Disallow: /groups/
 Disallow: /uploads/

+<% if @disallowed_user_agents %>
+  <% @disallowed_user_agents.each do |user_agent| %>
+User-agent: <%= user_agent %>
+Disallow: /
+
+  <% end %>
+<% end %>
+
 <%= server_plugin_outlet "robots_txt_index" %>
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -962,6 +962,10 @@ en:
      title: "User Visits"
      xaxis: "Day"
      yaxis: "Number of visits"
+    web_crawlers:
+      title: "Web Crawler Requests"
+      xaxis: "User Agent"
+      yaxis: "Pageviews"

  dashboard:
    rails_env_warning: "Your server is running in %{env} mode."
@ -1113,6 +1117,8 @@ en:
    blacklist_ip_blocks: "A list of private IP blocks that should never be crawled by Discourse"
    whitelist_internal_hosts: "A list of internal hosts that discourse can safely crawl for oneboxing and other purposes"
    allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts"
+    whitelisted_crawler_user_agents: 'User agents of web crawlers that should be allowed to access the site.'
+    blacklisted_crawler_user_agents: 'User agents of web crawlers that should not be allowed to access the site. Does not apply if whitelist is defined.'
    top_menu: "Determine which items appear in the homepage navigation, and in what order. Example latest|new|unread|categories|top|read|posted|bookmarks"
    post_menu: "Determine which items appear on the post menu, and in what order. Example like|edit|flag|delete|share|bookmark|reply"
    post_menu_hidden_items: "The menu items to hide by default in the post menu unless an expansion ellipsis is clicked on."
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@ -1018,6 +1018,12 @@ security:
    default: 'https://www.google.com/maps/embed?|https://www.openstreetmap.org/export/embed.html?|https://calendar.google.com/calendar/embed?'
    type: list
    client: true
+  whitelisted_crawler_user_agents:
+    type: list
+    default: ''
+  blacklisted_crawler_user_agents:
+    type: list
+    default: ''

 onebox:
  enable_flash_video_onebox: false
--- a/db/migrate/20180320190339_create_web_crawler_requests.rb
+++ b/db/migrate/20180320190339_create_web_crawler_requests.rb
@ -0,0 +1,11 @@
+class CreateWebCrawlerRequests < ActiveRecord::Migration[5.1]
+  def change
+    create_table :web_crawler_requests do |t|
+      t.date :date, null: false
+      t.string :user_agent, null: false
+      t.integer :count, null: false, default: 0
+    end
+
+    add_index :web_crawler_requests, [:date, :user_agent], unique: true
+  end
+end
--- a/lib/crawler_detection.rb
+++ b/lib/crawler_detection.rb
@ -28,4 +28,25 @@ module CrawlerDetection
    end

  end
+
+  # Given a user_agent that returns true from crawler?, should its request be allowed?
+  def self.allow_crawler?(user_agent)
+    return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
+      SiteSetting.blacklisted_crawler_user_agents.blank?
+
+    @whitelisted_matchers ||= {}
+    @blacklisted_matchers ||= {}
+
+    if SiteSetting.whitelisted_crawler_user_agents.present?
+      whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
+      !user_agent.nil? && user_agent.match?(whitelisted)
+    else
+      blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
+      user_agent.nil? || !user_agent.match?(blacklisted)
+    end
+  end
+
+  def self.is_blocked_crawler?(user_agent)
+    crawler?(user_agent) && !allow_crawler?(user_agent)
+  end
 end
--- a/lib/middleware/request_tracker.rb
+++ b/lib/middleware/request_tracker.rb
@ -83,6 +83,7 @@ class Middleware::RequestTracker
    if track_view
      if data[:is_crawler]
        ApplicationRequest.increment!(:page_view_crawler)
+        WebCrawlerRequest.increment!(data[:user_agent])
      elsif data[:has_auth_cookie]
        ApplicationRequest.increment!(:page_view_logged_in)
        ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
@ -129,8 +130,9 @@ class Middleware::RequestTracker
      is_mobile: helper.is_mobile?,
      track_view: track_view,
      timing: timing
-    }
-
+    }.tap do |h|
+      h[:user_agent] = env['HTTP_USER_AGENT'] if h[:is_crawler]
+    end
  end

  def log_request_info(env, result, info)
@ -155,12 +157,20 @@ class Middleware::RequestTracker

  def call(env)
    result = nil
+    log_request = true
+    request = Rack::Request.new(env)

-    if rate_limit(env)
+    if rate_limit(request)
      result = [429, {}, ["Slow down, too Many Requests from this IP Address"]]
      return result
    end

+    if block_crawler(request)
+      log_request = false
+      result = [403, {}, []]
+      return result
+    end
+
    env["discourse.request_tracker"] = self
    MethodProfiler.start
    result = @app.call(env)
@ -186,7 +196,7 @@ class Middleware::RequestTracker
        end
      end
    end
-    log_request_info(env, result, info) unless env["discourse.request_tracker.skip"]
+    log_request_info(env, result, info) unless !log_request || env["discourse.request_tracker.skip"]
  end

  PRIVATE_IP ||= /^(127\.)|(192\.168\.)|(10\.)|(172\.1[6-9]\.)|(172\.2[0-9]\.)|(172\.3[0-1]\.)|(::1$)|([fF][cCdD])/
@ -196,7 +206,7 @@ class Middleware::RequestTracker
    !!(ip && ip.to_s.match?(PRIVATE_IP))
  end

-  def rate_limit(env)
+  def rate_limit(request)

    if (
      GlobalSetting.max_reqs_per_ip_mode == "block" ||
@ -204,7 +214,7 @@ class Middleware::RequestTracker
      GlobalSetting.max_reqs_per_ip_mode == "warn+block"
    )

-      ip = Rack::Request.new(env).ip
+      ip = request.ip

      if !GlobalSetting.max_reqs_rate_limit_on_private
        return false if is_private_ip?(ip)
@ -236,15 +246,15 @@ class Middleware::RequestTracker
        global: true
      )

-      env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
-      env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]
+      request.env['DISCOURSE_RATE_LIMITERS'] = [limiter10, limiter60]
+      request.env['DISCOURSE_ASSET_RATE_LIMITERS'] = [limiter_assets10]

      warn = GlobalSetting.max_reqs_per_ip_mode == "warn" ||
        GlobalSetting.max_reqs_per_ip_mode == "warn+block"

      if !limiter_assets10.can_perform?
        if warn
-          Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{env["REQUEST_URI"]}")
+          Rails.logger.warn("Global asset IP rate limit exceeded for #{ip}: 10 second rate limit, uri: #{request.env["REQUEST_URI"]}")
        end

        return !(GlobalSetting.max_reqs_per_ip_mode == "warn")
@ -257,7 +267,7 @@ class Middleware::RequestTracker
        limiter60.performed!
      rescue RateLimiter::LimitExceeded
        if warn
-          Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{env["REQUEST_URI"]}")
+          Rails.logger.warn("Global IP rate limit exceeded for #{ip}: #{type} second rate limit, uri: #{request.env["REQUEST_URI"]}")
          !(GlobalSetting.max_reqs_per_ip_mode == "warn")
        else
          true
@ -266,6 +276,11 @@ class Middleware::RequestTracker
    end
  end

+  def block_crawler(request)
+    !request.path.ends_with?('robots.txt') &&
+      CrawlerDetection.is_blocked_crawler?(request.env['HTTP_USER_AGENT'])
+  end
+
  def log_later(data, host)
    Scheduler::Defer.later("Track view", _db = nil) do
      self.class.log_request_on_site(data, host)
--- a/spec/components/crawler_detection_spec.rb
+++ b/spec/components/crawler_detection_spec.rb
@ -28,6 +28,12 @@ describe CrawlerDetection do
      expect(described_class.crawler?("Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)")).to eq(true)
      expect(described_class.crawler?("Baiduspider+(+http://www.baidu.com/search/spider.htm)")).to eq(true)
      expect(described_class.crawler?("Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")).to eq(true)
+
+      expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true)
+      expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true)
+      expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true)
+      expect(described_class.crawler?("Java/1.8.0_151")).to eq(true)
+      expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true)
    end

    it "returns false for non-crawler user agents" do
@ -37,13 +43,106 @@ describe CrawlerDetection do
      expect(described_class.crawler?("Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25")).to eq(false)
      expect(described_class.crawler?("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")).to eq(false)
      expect(described_class.crawler?("Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30")).to eq(false)
-
-      expect(described_class.crawler?("DiscourseAPI Ruby Gem 0.19.0")).to eq(true)
-      expect(described_class.crawler?("Pingdom.com_bot_version_1.4_(http://www.pingdom.com/)")).to eq(true)
-      expect(described_class.crawler?("LogicMonitor SiteMonitor/1.0")).to eq(true)
-      expect(described_class.crawler?("Java/1.8.0_151")).to eq(true)
-      expect(described_class.crawler?("Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)")).to eq(true)
    end

  end
+
+  describe 'allow_crawler?' do
+    it 'returns true if whitelist and blacklist are blank' do
+      expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
+    end
+
+    context 'whitelist is set' do
+      before do
+        SiteSetting.whitelisted_crawler_user_agents = 'Googlebot|Twitterbot'
+      end
+
+      it 'returns true for matching user agents' do
+        expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(true)
+      end
+
+      it 'returns false for user agents that do not match' do
+        expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('')).to eq(false)
+      end
+
+      context 'and blacklist is set' do
+        before do
+          SiteSetting.blacklisted_crawler_user_agents = 'Googlebot-Image'
+        end
+
+        it 'ignores the blacklist' do
+          expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(true)
+        end
+      end
+    end
+
+    context 'blacklist is set' do
+      before do
+        SiteSetting.blacklisted_crawler_user_agents = 'Googlebot|Twitterbot'
+      end
+
+      it 'returns true for crawlers that do not match' do
+        expect(CrawlerDetection.allow_crawler?('Mediapartners-Google')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('facebookexternalhit/1.1 (+http(s)://www.facebook.com/externalhit_uatext.php)')).to eq(true)
+        expect(CrawlerDetection.allow_crawler?('')).to eq(true)
+      end
+
+      it 'returns false for user agents that match' do
+        expect(CrawlerDetection.allow_crawler?('Googlebot/2.1 (+http://www.google.com/bot.html)')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('Googlebot-Image/1.0')).to eq(false)
+        expect(CrawlerDetection.allow_crawler?('Twitterbot')).to eq(false)
+      end
+    end
+  end
+
+  describe 'is_blocked_crawler?' do
+    it 'is false if user agent is a crawler and no whitelist or blacklist is defined' do
+      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(false)
+    end
+
+    it 'is false if user agent is not a crawler and no whitelist or blacklist is defined' do
+      expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+    end
+
+    it 'is true if user agent is a crawler and is not whitelisted' do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
+    end
+
+    it 'is false if user agent is not a crawler and there is a whitelist' do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+    end
+
+    it 'is true if user agent is a crawler and is blacklisted' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
+      expect(CrawlerDetection.is_blocked_crawler?('Twitterbot')).to eq(true)
+    end
+
+    it 'is true if user agent is a crawler and is not blacklisted' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Twitterbot'
+      expect(CrawlerDetection.is_blocked_crawler?('Googlebot')).to eq(false)
+    end
+
+    it 'is false if user agent is not a crawler and blacklist is defined' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Mozilla'
+      expect(CrawlerDetection.is_blocked_crawler?('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')).to eq(false)
+    end
+
+    it 'is true if user agent is missing and whitelist is defined' do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('')).to eq(true)
+      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(true)
+    end
+
+    it 'is false if user agent is missing and blacklist is defined' do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect(CrawlerDetection.is_blocked_crawler?('')).to eq(false)
+      expect(CrawlerDetection.is_blocked_crawler?(nil)).to eq(false)
+    end
+  end
 end
--- a/spec/components/middleware/request_tracker_spec.rb
+++ b/spec/components/middleware/request_tracker_spec.rb
@ -273,4 +273,50 @@ describe Middleware::RequestTracker do
      expect(timing[:redis][:calls]).to eq 2
    end
  end
+
+  context "crawler blocking" do
+    let :middleware do
+      app = lambda do |env|
+        [200, {}, ['OK']]
+      end
+
+      Middleware::RequestTracker.new(app)
+    end
+
+    def expect_success_response(status, _, response)
+      expect(status).to eq(200)
+      expect(response).to eq(['OK'])
+    end
+
+    def expect_blocked_response(status, _, response)
+      expect(status).to eq(403)
+      expect(response).to be_blank
+    end
+
+    it "applies whitelisted_crawler_user_agents correctly" do
+      SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'
+      expect_success_response(*middleware.call(env))
+      expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot')))
+      expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')))
+      expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0')))
+    end
+
+    it "applies blacklisted_crawler_user_agents correctly" do
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect_success_response(*middleware.call(env))
+      expect_blocked_response(*middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)')))
+      expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'Twitterbot')))
+      expect_success_response(*middleware.call(env('HTTP_USER_AGENT' => 'DiscourseAPI Ruby Gem 0.19.0')))
+    end
+
+    it "blocked crawlers shouldn't log page views" do
+      ApplicationRequest.clear_cache!
+      SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'
+      expect {
+        middleware.call(env('HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'))
+        ApplicationRequest.write_cache!
+      }.to_not change { ApplicationRequest.count }
+    end
+  end
+
 end
--- a/spec/fabricators/web_crawler_request_fabricator.rb
+++ b/spec/fabricators/web_crawler_request_fabricator.rb
@ -0,0 +1,5 @@
+Fabricator(:web_crawler_request) do
+  user_agent { sequence(:ua) { |i| "Googlebot #{i}.0" } }
+  date Time.zone.now.to_date
+  count 0
+end
--- a/spec/jobs/clean_up_crawler_stats_spec.rb
+++ b/spec/jobs/clean_up_crawler_stats_spec.rb
@ -0,0 +1,31 @@
+require 'rails_helper'
+
+describe Jobs::CleanUpCrawlerStats do
+  subject { Jobs::CleanUpCrawlerStats.new.execute({}) }
+
+  it "deletes records older than 30 days old" do
+    freeze_time
+
+    today = Fabricate(:web_crawler_request, date: Time.zone.now.to_date)
+    yesterday = Fabricate(:web_crawler_request, date: 1.day.ago.to_date)
+    too_old = Fabricate(:web_crawler_request, date: 31.days.ago.to_date)
+
+    expect { subject }.to change { WebCrawlerRequest.count }.by(-1)
+    expect(WebCrawlerRequest.where(id: too_old.id)).to_not exist
+  end
+
+  it "keeps only the top records from the previous day" do
+    freeze_time
+
+    WebCrawlerRequest.stubs(:max_records_per_day).returns(3)
+
+    req1 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 100)
+    req4 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 30)
+    req3 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 40)
+    req2 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 50)
+    req5 = Fabricate(:web_crawler_request, date: 1.day.ago.to_date, count: 1)
+
+    expect { subject }.to change { WebCrawlerRequest.count }.by(-2)
+    expect(WebCrawlerRequest.all).to contain_exactly(req1, req2, req3)
+  end
+end
--- a/spec/models/web_crawler_request_spec.rb
+++ b/spec/models/web_crawler_request_spec.rb
@ -0,0 +1,114 @@
+require 'rails_helper'
+
+describe WebCrawlerRequest do
+  before do
+    WebCrawlerRequest.last_flush = Time.now.utc
+    WebCrawlerRequest.clear_cache!
+  end
+
+  after do
+    WebCrawlerRequest.clear_cache!
+  end
+
+  def inc(user_agent, opts = nil)
+    WebCrawlerRequest.increment!(user_agent, opts)
+  end
+
+  def disable_date_flush!
+    freeze_time(Time.now)
+    WebCrawlerRequest.last_flush = Time.now.utc
+  end
+
+  def web_crawler_request(user_agent)
+    WebCrawlerRequest.where(user_agent: user_agent).first
+  end
+
+  it 'works even if redis is in readonly' do
+    disable_date_flush!
+
+    inc('Googlebot')
+    inc('Googlebot')
+
+    $redis.without_namespace.stubs(:incr).raises(Redis::CommandError.new("READONLY"))
+    $redis.without_namespace.stubs(:eval).raises(Redis::CommandError.new("READONLY"))
+
+    inc('Googlebot', autoflush: 3)
+    WebCrawlerRequest.write_cache!
+
+    $redis.without_namespace.unstub(:incr)
+    $redis.without_namespace.unstub(:eval)
+
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+  end
+
+  it 'logs nothing for an unflushed increment' do
+    WebCrawlerRequest.increment!('Googlebot')
+    expect(WebCrawlerRequest.count).to eq(0)
+  end
+
+  it 'can automatically flush' do
+    disable_date_flush!
+
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot')).to_not be_present
+    expect(WebCrawlerRequest.count).to eq(0)
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot')).to_not be_present
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+    expect(WebCrawlerRequest.count).to eq(1)
+
+    3.times { inc('Googlebot', autoflush: 3) }
+    expect(web_crawler_request('Googlebot').count).to eq(6)
+    expect(WebCrawlerRequest.count).to eq(1)
+  end
+
+  it 'can flush based on time' do
+    t1 = Time.now.utc.at_midnight
+    freeze_time(t1)
+    WebCrawlerRequest.write_cache!
+    inc('Googlebot')
+    expect(WebCrawlerRequest.count).to eq(0)
+
+    freeze_time(t1 + WebCrawlerRequest.autoflush_seconds + 1)
+    inc('Googlebot')
+
+    expect(WebCrawlerRequest.count).to eq(1)
+  end
+
+  it 'flushes yesterdays results' do
+    t1 = Time.now.utc.at_midnight
+    freeze_time(t1)
+    inc('Googlebot')
+    freeze_time(t1.tomorrow)
+    inc('Googlebot')
+
+    WebCrawlerRequest.write_cache!
+    expect(WebCrawlerRequest.count).to eq(2)
+  end
+
+  it 'clears cache correctly' do
+    inc('Googlebot')
+    inc('Twitterbot')
+    WebCrawlerRequest.clear_cache!
+    WebCrawlerRequest.write_cache!
+
+    expect(WebCrawlerRequest.count).to eq(0)
+  end
+
+  it 'logs a few counts once flushed' do
+    time = Time.now.at_midnight
+    freeze_time(time)
+
+    3.times { inc('Googlebot') }
+    2.times { inc('Twitterbot') }
+    4.times { inc('Bingbot') }
+
+    WebCrawlerRequest.write_cache!
+
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+    expect(web_crawler_request('Twitterbot').count).to eq(2)
+    expect(web_crawler_request('Bingbot').count).to eq(4)
+  end
+end
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@ -2,11 +2,64 @@ require 'rails_helper'

 RSpec.describe RobotsTxtController do
  describe '#index' do
-    it "returns index when indexing is allowed" do
-      SiteSetting.allow_index_in_robots_txt = true
-      get '/robots.txt'
+    context 'allow_index_in_robots_txt is true' do

-      expect(response.body).to include("Disallow: /u/")
+      def expect_allowed_and_disallowed_sections(allow_index, disallow_index)
+        expect(allow_index).to be_present
+        expect(disallow_index).to be_present
+
+        allow_section = allow_index < disallow_index ?
+          response.body[allow_index...disallow_index] : response.body[allow_index..-1]
+
+        expect(allow_section).to include('Disallow: /u/')
+        expect(allow_section).to_not include("Disallow: /\n")
+
+        disallowed_section = allow_index < disallow_index ?
+          response.body[disallow_index..-1] : response.body[disallow_index...allow_index]
+        expect(disallowed_section).to include("Disallow: /\n")
+      end
+
+      it "returns index when indexing is allowed" do
+        SiteSetting.allow_index_in_robots_txt = true
+        get '/robots.txt'
+
+        i = response.body.index('User-agent: *')
+        expect(i).to be_present
+        expect(response.body[i..-1]).to include("Disallow: /u/")
+      end
+
+      it "can whitelist user agents" do
+        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
+        get '/robots.txt'
+        expect(response.body).to include('User-agent: Googlebot')
+        expect(response.body).to include('User-agent: Twitterbot')
+
+        allowed_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min
+        disallow_all_index = response.body.index('User-agent: *')
+
+        expect_allowed_and_disallowed_sections(allowed_index, disallow_all_index)
+      end
+
+      it "can blacklist user agents" do
+        SiteSetting.blacklisted_crawler_user_agents = "Googlebot|Twitterbot"
+        get '/robots.txt'
+        expect(response.body).to include('User-agent: Googlebot')
+        expect(response.body).to include('User-agent: Twitterbot')
+
+        disallow_index = [response.body.index('User-agent: Googlebot'), response.body.index('User-agent: Twitterbot')].min
+        allow_index = response.body.index('User-agent: *')
+
+        expect_allowed_and_disallowed_sections(allow_index, disallow_index)
+      end
+
+      it "ignores blacklist if whitelist is set" do
+        SiteSetting.whitelisted_crawler_user_agents = "Googlebot|Twitterbot"
+        SiteSetting.blacklisted_crawler_user_agents = "Bananabot"
+        get '/robots.txt'
+        expect(response.body).to_not include('Bananabot')
+        expect(response.body).to include('User-agent: Googlebot')
+        expect(response.body).to include('User-agent: Twitterbot')
+      end
    end

    it "returns noindex when indexing is disallowed" do