mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 04:48:53 +08:00
30990006a9
This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging
93 lines
2.1 KiB
Ruby
93 lines
2.1 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# Per-day request counter for web crawlers, keyed by user agent.
#
# Counts are buffered in Redis: one counter key per (day, user agent) plus one
# set per day listing the user agents seen that day. `write_cache!` drains the
# buffered counts into the `web_crawler_requests` table.
#
# NOTE(review): `perform_increment!`, `get_and_reset`, `request_id`,
# `autoflush_seconds=` and `last_flush=` are not defined in this file; they
# presumably come from CachedCounting -- confirm against that module.
class WebCrawlerRequest < ActiveRecord::Base
  include CachedCounting

  # auto flush if older than this
  self.autoflush_seconds = 1.hour

  cattr_accessor :max_record_age, :max_records_per_day

  # only keep the top records based on request count
  self.max_records_per_day = 200

  # delete records older than this
  self.max_record_age = 30.days

  # Registers one request from +user_agent+ for the current UTC day.
  #
  # @param user_agent [String] the crawler's user agent string
  # @param opts [Hash, nil] passed through untouched to `perform_increment!`
  # @return whatever `perform_increment!` returns
  def self.increment!(user_agent, opts = nil)
    # Read the clock once so the per-day list key and the counter key always
    # refer to the same day. Two separate reads could straddle midnight and
    # leave a counter whose user agent is missing from that day's list.
    now = Time.now.utc

    ua_list_key = user_agent_list_key(now)
    $redis.sadd(ua_list_key, user_agent)
    $redis.expire(ua_list_key, 259_200) # 3.days

    perform_increment!(redis_key(user_agent, now), opts)
  end

  # Drains buffered counts for +date+ from Redis into the database.
  #
  # With no argument, flushes both the current UTC day and the day before it.
  #
  # @param date [Time, Date, nil] day to flush; converted with `to_date`
  # @return [nil]
  # @raise [Redis::CommandError] unless the error message contains "READONLY"
  def self.write_cache!(date = nil)
    if date.nil?
      now = Time.now.utc
      write_cache!(now)
      write_cache!(now.yesterday)
      return
    end

    self.last_flush = Time.now.utc

    date = date.to_date
    ua_list_key = user_agent_list_key(date)

    # SPOP removes each user agent from the day's set as it is processed.
    while user_agent = $redis.spop(ua_list_key)
      val = get_and_reset(redis_key(user_agent, date))

      next if val == 0

      self.where(id: req_id(date, user_agent)).update_all(["count = count + ?", val])
    end
  rescue Redis::CommandError => e
    # A READONLY error from Redis is swallowed; anything else is re-raised.
    raise unless e.message.match?(/READONLY/)
    nil
  end

  # Discards buffered counts for +date+ from Redis without writing them to the
  # database.
  #
  # With no argument, clears both the current UTC day and the day before it.
  #
  # @param date [Time, Date, nil] day to clear; must respond to `strftime`
  # @return the result of the final `$redis.del`, or nil when +date+ is nil
  def self.clear_cache!(date = nil)
    if date.nil?
      now = Time.now.utc
      clear_cache!(now)
      clear_cache!(now.yesterday)
      return
    end

    ua_list_key = user_agent_list_key(date)

    while user_agent = $redis.spop(ua_list_key)
      $redis.del redis_key(user_agent, date)
    end

    $redis.del(ua_list_key)
  end

  # NOTE: `protected` does not change the visibility of methods defined with
  # `def self.`; the class methods below remain publicly callable. The keyword
  # is kept so the class's public surface is unchanged.
  protected

  # Redis key of the set holding the user agents seen on the given day.
  #
  # @param time [Time, Date] any object responding to `strftime`
  # @return [String]
  def self.user_agent_list_key(time = Time.now.utc)
    "crawl_ua_list:#{time.strftime('%Y%m%d')}"
  end

  # Redis key of the counter for +user_agent+ on the given day.
  #
  # @param user_agent [String]
  # @param time [Time, Date] any object responding to `strftime`
  # @return [String]
  def self.redis_key(user_agent, time = Time.now.utc)
    "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}"
  end

  # Positional-argument wrapper around `request_id`; its result is used as the
  # row id in `write_cache!`.
  def self.req_id(date, user_agent)
    request_id(date: date, user_agent: user_agent)
  end
end
|
|
|
|
# == Schema Information
#
# Table name: web_crawler_requests
#
#  id         :bigint           not null, primary key
#  date       :date             not null
#  user_agent :string           not null
#  count      :integer          default(0), not null
#
# Indexes
#
#  index_web_crawler_requests_on_date_and_user_agent  (date,user_agent) UNIQUE
#
|