2019-05-03 06:17:27 +08:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2017-06-29 04:56:44 +08:00
|
|
|
class WordWatcher
|
2021-06-03 09:39:12 +08:00
|
|
|
# Single character that stands in for each censored character (see .censor).
# Frozen so the shared constant cannot be mutated in place by accident.
REPLACEMENT_LETTER ||= CGI.unescape_html("■").freeze
|
2017-06-29 04:56:44 +08:00
|
|
|
|
|
|
|
# Builds a watcher around one piece of text.
#
# raw - the text that the instance-level matchers (word_matches?,
#       word_matches_for_action?) are run against.
def initialize(raw)
  @raw = raw
end
|
|
|
|
|
|
|
|
# Loads the watched words stored for +action+ (at most 1000 rows).
#
# action - action name as a String or Symbol.
#
# Returns a Hash of word => replacement when WatchedWord.has_replacement?
# is true for the action, otherwise an Array of words.
def self.words_for_action(action)
  action_name = action.to_sym
  words = WatchedWord.where(action: WatchedWord.actions[action_name]).limit(1000)

  if WatchedWord.has_replacement?(action_name)
    words.pluck(:word, :replacement).to_h
  else
    words.pluck(:word)
  end
end
|
|
|
|
|
|
|
|
# Tells whether at least one watched word is stored for +action+.
#
# action - action name as a String or Symbol.
#
# Returns the result of the relation's exists? check.
def self.words_for_action_exists?(action)
  action_id = WatchedWord.actions[action.to_sym]
  WatchedWord.where(action: action_id).exists?
end
|
|
|
|
|
2019-07-22 19:59:56 +08:00
|
|
|
# Returns the watched words for +action+ through Discourse.cache, under the
# key built by word_matcher_regexp_key, with a one day expiry.
#
# The cached value is words_for_action(action).presence, so an empty word
# list is stored (and returned) as nil.
def self.get_cached_words(action)
  cache_key = word_matcher_regexp_key(action)
  Discourse.cache.fetch(cache_key, expires_in: 1.day) { words_for_action(action).presence }
end
|
|
|
|
|
2019-08-01 01:33:49 +08:00
|
|
|
# Builds one case-insensitive Regexp that matches any watched word of +action+.
#
# This regexp is run in miniracer, and the client JS app
# Make sure it is compatible with major browsers when changing
# hint: non-chrome browsers do not support 'lookbehind'
#
# action       - action name as a String or Symbol.
# raise_errors - when true, an invalid pattern re-raises RegexpError instead
#                of being swallowed.
#
# Returns a Regexp, or nil when there are no cached words or (with
# raise_errors false) the pattern is invalid.
def self.word_matcher_regexp(action, raise_errors: false)
  words = get_cached_words(action)
  return unless words

  # Actions with replacements cache a word => replacement Hash; only the
  # words themselves take part in matching.
  words = words.keys if WatchedWord.has_replacement?(action.to_sym)

  # Read the setting once instead of once per word.
  regex_mode = SiteSetting.watched_words_regular_expressions?

  patterns = words.map do |w|
    pattern = word_to_regexp(w)
    regex_mode ? "(#{pattern})" : pattern
  end
  source = patterns.join('|')

  # Plain words share one capture group and must sit between non-word
  # characters (or the start/end of a line).
  source = "(?:\\W|^)(#{source})(?=\\W|$)" unless regex_mode

  Regexp.new(source, Regexp::IGNORECASE)
rescue RegexpError
  raise if raise_errors
  nil # Admin will be alerted via admin_dashboard_data.rb
end
|
|
|
|
|
2021-05-18 17:09:47 +08:00
|
|
|
# Maps every cached word of +action+ to its own whole-word pattern.
#
# Returns a Hash of pattern source (from word_to_regexp with whole: true)
# => replacement, or nil when no words are cached. When the cached words are
# a plain list the replacement values are nil.
def self.word_matcher_regexps(action)
  words = get_cached_words(action)
  return unless words

  words.map { |word, replacement| [word_to_regexp(word, whole: true), replacement] }.to_h
end
|
|
|
|
|
2021-06-18 23:54:06 +08:00
|
|
|
# Converts one watched word into regexp source text.
#
# word  - the watched word (or, in regular-expression mode, a pattern).
# whole - when true the result is made self-contained: wrapped in a capture
#         group, and in plain mode also surrounded by word-boundary checks.
#
# Returns a String holding regexp source.
def self.word_to_regexp(word, whole: false)
  if SiteSetting.watched_words_regular_expressions?
    # Strip ruby regexp format if present, we're going to make the whole thing
    # case insensitive anyway
    regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
    return whole ? "(#{regexp})" : regexp
  end

  # Plain mode: escape everything, then let "*" stand for any run of
  # non-whitespace characters.
  regexp = Regexp.escape(word).gsub("\\*", '\S*')
  whole ? "(?:\\W|^)(#{regexp})(?=\\W|$)" : regexp
end
|
|
|
|
|
2017-06-29 04:56:44 +08:00
|
|
|
# Cache key under which the watched words of +action+ are stored.
def self.word_matcher_regexp_key(action)
  "watched-words-list:#{action}"
end
|
|
|
|
|
2021-06-03 09:39:12 +08:00
|
|
|
# Censors watched words inside the text nodes of an HTML fragment.
#
# html - HTML string to censor.
#
# Returns +html+ untouched when there is no usable :censor regexp, otherwise
# the serialized fragment with each censored word replaced by
# REPLACEMENT_LETTER repeated once per character.
def self.censor(html)
  regexp = WordWatcher.word_matcher_regexp(:censor)
  return html if regexp.blank?

  doc = Nokogiri::HTML5.fragment(html)
  doc.traverse do |node|
    node.content = censor_text_with_regexp(node.content, regexp) if node.text?
  end
  doc.to_s
end

# Helper for .censor: masks every watched word that +regexp+ finds in +text+.
#
# The matcher may consume one leading non-word character (its "(?:\W|^)"
# prefix), so only the captured word is masked. Whatever surrounds it inside
# the match, whitespace or punctuation, is kept as is. A regexp without
# capture groups has its whole match masked.
def self.censor_text_with_regexp(text, regexp)
  text.gsub(regexp) do |match|
    data = Regexp.last_match
    # First group that took part in the match; with "(a)|(b)" style patterns
    # the groups of the other alternatives are nil.
    group = (1...data.size).find { |index| data[index] }
    next REPLACEMENT_LETTER * match.size unless group

    word = data[group]
    offset = data.begin(group) - data.begin(0)
    match[0, offset] + REPLACEMENT_LETTER * word.size + match[(offset + word.size)..-1]
  end
end
|
|
|
|
|
2017-06-29 04:56:44 +08:00
|
|
|
# Removes the cached word list of every action in WatchedWord.actions.
def self.clear_cache!
  WatchedWord.actions.each do |action, _id|
    Discourse.cache.delete(word_matcher_regexp_key(action))
  end
end
|
|
|
|
|
|
|
|
# Checks the text against the :require_approval watched words.
# Returns whatever word_matches_for_action? returns for that action.
def requires_approval?
  word_matches_for_action?(:require_approval)
end
|
|
|
|
|
|
|
|
# Checks the text against the :flag watched words.
# Returns whatever word_matches_for_action? returns for that action.
def should_flag?
  word_matches_for_action?(:flag)
end
|
|
|
|
|
|
|
|
# Checks the text against the :block watched words, asking for every
# matching word (all_matches: true) rather than only the first match.
# Returns whatever word_matches_for_action? returns for that request.
def should_block?
  word_matches_for_action?(:block, all_matches: true)
end
|
|
|
|
|
2021-05-28 00:19:58 +08:00
|
|
|
# Checks the text against the :silence watched words.
# Returns whatever word_matches_for_action? returns for that action.
def should_silence?
  word_matches_for_action?(:silence)
end
|
|
|
|
|
2019-07-22 19:59:56 +08:00
|
|
|
# Matches the text against the watched words of +action+.
#
# action      - action name handed to word_matcher_regexp.
# all_matches - when true, collect every distinct matching word instead of
#               stopping at the first match.
#
# Returns false when no regexp is available for the action; the result of
# Regexp#match (MatchData or nil) when all_matches is false or nothing
# matched; otherwise a sorted Array of the distinct matched words.
def word_matches_for_action?(action, all_matches: false)
  regexp = self.class.word_matcher_regexp(action)
  return false unless regexp

  match = regexp.match(@raw)
  return match if !all_matches || !match

  matches =
    if SiteSetting.watched_words_regular_expressions?
      # With capture groups scan yields one Array per match; keep the first
      # non-blank capture. Without groups it yields the matched String.
      @raw.scan(regexp).map { |found| found.is_a?(Array) ? found.find(&:present?) : found }
    else
      @raw.scan(regexp).flatten
    end

  matches.compact.uniq.sort
end
|
2021-03-03 16:53:38 +08:00
|
|
|
|
2021-05-14 21:52:10 +08:00
|
|
|
# Tells whether the single watched +word+ occurs in the text.
#
# The word is converted with WordWatcher.word_to_regexp (whole: true) and
# matched case-insensitively. Returns true or false.
def word_matches?(word)
  pattern = WordWatcher.word_to_regexp(word, whole: true)
  Regexp.new(pattern, Regexp::IGNORECASE).match?(@raw)
end
|
2017-06-29 04:56:44 +08:00
|
|
|
end
|