Mirror of https://github.com/discourse/discourse.git, synced 2024-11-29 08:13:51 +08:00
30990006a9
This reduces the chance of errors where consumers of strings mutate their inputs, and reduces the app's memory usage. The test suite passes now, but some issues may remain, so we will run a few sites on a branch prior to merging.
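For context, a minimal sketch (not part of the commit) of what the frozen_string_literal magic comment changes: string literals can no longer be mutated in place, so accidental modification of a shared input raises instead of silently corrupting it, and identical literals can be deduplicated by the interpreter.

    # frozen_string_literal: true

    ua = "Googlebot"
    ua << "/2.1"        # raises FrozenError: can't modify frozen String
    ua.dup << "/2.1"    # => "Googlebot/2.1" (take an explicit copy when mutation is needed)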
60 lines
2.0 KiB
Ruby
# frozen_string_literal: true

module CrawlerDetection

  # Turns a pipe-delimited list of user agent fragments (the format used by the
  # crawler-related site settings) into a single case-insensitive regexp.
  def self.to_matcher(string, type: nil)
    escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')

    if type == :real && Rails.env == "test"
      # we need this bypass so we properly render views
      # (Array#join returns a new, unfrozen string, so appending here is safe
      # even with frozen string literals enabled)
      escaped << "|Rails Testing"
    end

    Regexp.new(escaped, Regexp::IGNORECASE)
  end
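  # Example (hypothetical setting value, not part of the original source):
  #
  #   CrawlerDetection.to_matcher("Googlebot|bingbot")
  #   # => /Googlebot|bingbot/i
  #
  # Matching is case-insensitive, and each fragment is escaped so it is matched
  # literally rather than interpreted as a regular expression.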

  def self.crawler?(user_agent)
    return true if user_agent.nil?

    # this is done to avoid regenerating regexes
    @non_crawler_matchers ||= {}
    @matchers ||= {}

    possibly_real = (@non_crawler_matchers[SiteSetting.non_crawler_user_agents] ||= to_matcher(SiteSetting.non_crawler_user_agents, type: :real))

    if user_agent.match?(possibly_real)
      # browser-like agent: only treat it as a crawler if it also matches a
      # known bot and is not covered by the bypass list
      known_bots = (@matchers[SiteSetting.crawler_user_agents] ||= to_matcher(SiteSetting.crawler_user_agents))
      if user_agent.match?(known_bots)
        bypass = (@matchers[SiteSetting.crawler_check_bypass_agents] ||= to_matcher(SiteSetting.crawler_check_bypass_agents))
        !user_agent.match?(bypass)
      else
        false
      end
    else
      # doesn't even look like a regular browser, so treat it as a crawler
      true
    end
  end
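  # Rough sketch of the decision above (the user agent strings are hypothetical
  # and the exact results depend on the configured site settings):
  #
  #   CrawlerDetection.crawler?(nil)                                   # => true
  #   CrawlerDetection.crawler?("curl/8.0")                            # => true, doesn't look like a browser
  #   CrawlerDetection.crawler?("Mozilla/5.0 ... Chrome/120 ...")      # => false, browser-like and not a known bot
  #   CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Googlebot)") # => true, browser-like but a known bot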

  # Given a user_agent that returns true from crawler?, should its request be allowed?
  def self.allow_crawler?(user_agent)
    return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
      SiteSetting.blacklisted_crawler_user_agents.blank?

    @whitelisted_matchers ||= {}
    @blacklisted_matchers ||= {}

    if SiteSetting.whitelisted_crawler_user_agents.present?
      whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
      !user_agent.nil? && user_agent.match?(whitelisted)
    else
      blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
      user_agent.nil? || !user_agent.match?(blacklisted)
    end
  end
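  # Sketch of the precedence above (setting values are hypothetical): when
  # whitelisted_crawler_user_agents = "Googlebot", only crawlers matching
  # "Googlebot" are allowed and the blacklist is ignored; when only
  # blacklisted_crawler_user_agents = "AhrefsBot" is set, every crawler except
  # those matching "AhrefsBot" is allowed; when both settings are blank, all
  # crawlers are allowed.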

  def self.is_blocked_crawler?(user_agent)
    crawler?(user_agent) && !allow_crawler?(user_agent)
  end
end
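How a caller might combine these predicates (an illustrative sketch, not taken from this file; the filter name and response handling are assumptions):

    # Hypothetical Rails controller filter:
    def block_crawlers
      if CrawlerDetection.is_blocked_crawler?(request.user_agent)
        head :forbidden   # crawler matched, but site settings do not allow it
      end
    end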