discourse/app/services/word_watcher.rb
Bianca Nenciu 277496b6e0
FIX: Replace watched words with wildcards (#24279)
These have been broken since fd07c943ad
because watched words were not correctly transformed to regexps.
This partially reverts the changes.
2023-11-08 18:51:11 +02:00

286 lines
7.5 KiB
Ruby

# frozen_string_literal: true
class WordWatcher
REPLACEMENT_LETTER ||= CGI.unescape_html("■")
CACHE_VERSION ||= 3
def initialize(raw)
@raw = raw
end
@cache_enabled = true
def self.disable_cache
@cache_enabled = false
end
def self.cache_enabled?
@cache_enabled
end
def self.cache_key(action)
"watched-words-list:v#{CACHE_VERSION}:#{action}"
end
def self.clear_cache!
WatchedWord.actions.each { |action, _| Discourse.cache.delete(cache_key(action)) }
end
def self.words_for_action(action)
WatchedWord
.where(action: WatchedWord.actions[action.to_sym])
.limit(WatchedWord::MAX_WORDS_PER_ACTION)
.order(:id)
.pluck(:word, :replacement, :case_sensitive)
.to_h do |w, r, c|
[
word_to_regexp(w, match_word: false),
{ word: w, replacement: r, case_sensitive: c }.compact,
]
end
end
def self.words_for_action_exist?(action)
WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
end
def self.cached_words_for_action(action)
if cache_enabled?
Discourse
.cache
.fetch(cache_key(action), expires_in: 1.day) { words_for_action(action).presence }
else
words_for_action(action).presence
end
end
def self.regexps_for_action(action, engine: :ruby)
cached_words_for_action(action)&.to_h do |_, attrs|
[word_to_regexp(attrs[:word], engine: engine), attrs]
end
end
# This regexp is run in miniracer, and the client JS app
# Make sure it is compatible with major browsers when changing
# hint: non-chrome browsers do not support 'lookbehind'
def self.compiled_regexps_for_action(action, engine: :ruby, raise_errors: false)
words = cached_words_for_action(action)
return [] if words.blank?
words
.values
.group_by { |attrs| attrs[:case_sensitive] ? :case_sensitive : :case_insensitive }
.map do |group_key, attrs_list|
words = attrs_list.map { |attrs| attrs[:word] }
# Compile all watched words into a single regular expression
regexp =
words
.map do |word|
r = word_to_regexp(word, match_word: SiteSetting.watched_words_regular_expressions?)
begin
r if Regexp.new(r)
rescue RegexpError
raise if raise_errors
end
end
.select { |r| r.present? }
.join("|")
# Add word boundaries to the regexp for regular watched words
regexp =
match_word_regexp(
regexp,
engine: engine,
) if !SiteSetting.watched_words_regular_expressions?
# Add case insensitive flag if needed
Regexp.new(regexp, group_key == :case_sensitive ? nil : Regexp::IGNORECASE)
end
end
def self.serialized_regexps_for_action(action, engine: :ruby)
compiled_regexps_for_action(action, engine: engine).map do |r|
{ r.source => { case_sensitive: !r.casefold? } }
end
end
def self.word_to_regexp(word, engine: :ruby, match_word: true)
if SiteSetting.watched_words_regular_expressions?
regexp = word
regexp = "(#{regexp})" if match_word
regexp
else
# Convert word to regex by escaping special characters in a regexp.
# Avoid using Regexp.escape because it escapes more characters than
# it should (for example, whitespaces, dashes, etc)
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
# Convert wildcards to regexp
regexp = regexp.gsub("\\*", '\S*')
regexp = match_word_regexp(regexp, engine: engine) if match_word
regexp
end
end
def self.censor(html)
regexps = compiled_regexps_for_action(:censor)
return html if regexps.blank?
doc = Nokogiri::HTML5.fragment(html)
doc.traverse do |node|
regexps.each do |regexp|
node.content = censor_text_with_regexp(node.content, regexp) if node.text?
end
end
doc.to_s
end
def self.censor_text(text)
return text if text.blank?
regexps = compiled_regexps_for_action(:censor)
return text if regexps.blank?
regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) }
end
def self.replace_text(text)
return text if text.blank?
replace(text, :replace)
end
def self.replace_link(text)
return text if text.blank?
replace(text, :link)
end
def self.apply_to_text(text)
text = censor_text(text)
text = replace_text(text)
text = replace_link(text)
text
end
def requires_approval?
word_matches_for_action?(:require_approval)
end
def should_flag?
word_matches_for_action?(:flag)
end
def should_block?
word_matches_for_action?(:block, all_matches: true)
end
def should_silence?
word_matches_for_action?(:silence)
end
def word_matches_for_action?(action, all_matches: false)
regexps = self.class.compiled_regexps_for_action(action)
return if regexps.blank?
match_list = []
regexps.each do |regexp|
match = regexp.match(@raw)
if !all_matches
return match if match
next
end
next if !match
if SiteSetting.watched_words_regular_expressions?
set = Set.new
@raw
.scan(regexp)
.each do |m|
if Array === m
set.add(m.find(&:present?))
elsif String === m
set.add(m)
end
end
matches = set.to_a
else
matches = @raw.scan(regexp)
matches.flatten!
end
match_list.concat(matches)
end
return if match_list.blank?
match_list.compact!
match_list.uniq!
match_list.sort!
match_list
end
def word_matches?(word, case_sensitive: false)
options = case_sensitive ? nil : Regexp::IGNORECASE
Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
end
def self.replace_text_with_regexp(text, regexp, replacement)
text.gsub(regexp) do |match|
prefix = ""
# match may be prefixed with a non-word character from the non-capturing group
# Ensure this isn't replaced if watched words regular expression is disabled.
if !SiteSetting.watched_words_regular_expressions? && (match[0] =~ /\W/) != nil
prefix = "#{match[0]}"
end
"#{prefix}#{replacement}"
end
end
private_class_method :replace_text_with_regexp
def self.censor_text_with_regexp(text, regexp)
text.gsub(regexp) do |match|
# the regex captures leading whitespaces
padding = match.size - match.lstrip.size
if padding > 0
match[0..padding - 1] + REPLACEMENT_LETTER * (match.size - padding)
else
REPLACEMENT_LETTER * match.size
end
end
end
private_class_method :censor_text_with_regexp
# Returns a regexp that transforms a regular expression into a regular
# expression that matches a whole word.
def self.match_word_regexp(regexp, engine: :ruby)
if engine == :js
"(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
elsif engine == :ruby
"(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
else
raise "unknown regexp engine: #{engine}"
end
end
private_class_method :match_word_regexp
def self.replace(text, watch_word_type)
regexps_for_action(watch_word_type)
.to_a
.reduce(text) do |t, (word_regexp, attrs)|
case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
replace_text_with_regexp(t, Regexp.new(word_regexp, case_flag), attrs[:replacement])
end
end
private_class_method :replace
end