2019-05-03 06:17:27 +08:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2017-06-29 04:56:44 +08:00
|
|
|
class WordWatcher
|
2024-10-16 10:09:07 +08:00
|
|
|
# Glyph used to mask censored text; censor_text_with_regexp repeats it once
# per censored character.
REPLACEMENT_LETTER = CGI.unescape_html("■")
|
|
|
|
# Version tag embedded in every cache key built by cache_key.
# NOTE(review): presumably bumped whenever the cached payload shape changes - confirm.
CACHE_VERSION = 3
|
2017-06-29 04:56:44 +08:00
|
|
|
|
|
|
|
# Creates a watcher bound to one piece of text.
#
# raw - the text that the instance-level checks (word_matches_for_action?,
#       word_matches?) are run against.
def initialize(raw)
  @raw = raw
end
|
|
|
|
|
2022-05-24 10:23:54 +08:00
|
|
|
# Class-level switch read by cache_enabled?; starts out enabled.
@cache_enabled = true
|
2022-05-12 14:45:05 +08:00
|
|
|
|
2022-05-24 10:23:54 +08:00
|
|
|
# Turns the class-level cache switch off, so cached_words_for_action stops
# going through Discourse.cache and loads the words directly on every call.
def self.disable_cache
  @cache_enabled = false
end
|
|
|
|
|
2022-05-24 10:23:54 +08:00
|
|
|
# Reports the current value of the class-level cache switch.
def self.cache_enabled?
  @cache_enabled
end
|
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Builds the cache key under which the word list of +action+ is stored.
# The key carries CACHE_VERSION, so entries written under another version
# are never read back.
def self.cache_key(action)
  format("watched-words-list:v%s:%s", CACHE_VERSION, action)
end
|
|
|
|
|
|
|
|
# Drops the cached word list of every known watched-word action.
def self.clear_cache!
  WatchedWord.actions.each do |action, _value|
    Discourse.cache.delete(cache_key(action))
  end
end
|
|
|
|
|
2017-06-29 04:56:44 +08:00
|
|
|
# Loads the watched words of +action+ (up to WatchedWord::MAX_WORDS_PER_ACTION,
# ordered by id) and returns them as a Hash.
#
# Keys are the words converted with word_to_regexp(match_word: false); values
# are attribute hashes holding :word plus :replacement / :case_sensitive when
# they are not nil, and html: true when the record's html column is truthy.
def self.words_for_action(action)
  rows =
    WatchedWord
      .where(action: WatchedWord.actions[action.to_sym])
      .limit(WatchedWord::MAX_WORDS_PER_ACTION)
      .order(:id)
      .pluck(:word, :replacement, :case_sensitive, :html)

  rows.to_h do |word, replacement, case_sensitive, html|
    attrs = { word: word, replacement: replacement, case_sensitive: case_sensitive }.compact
    attrs[:html] = true if html
    [word_to_regexp(word, match_word: false), attrs]
  end
end
|
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Tells whether at least one watched word is stored for +action+.
def self.words_for_action_exist?(action)
  action_id = WatchedWord.actions[action.to_sym]
  WatchedWord.where(action: action_id).exists?
end
|
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Returns the words of +action+ (see words_for_action), or nil when there are
# none. While the cache switch is on, the result is read through
# Discourse.cache with a one-day expiry; otherwise it is loaded directly.
def self.cached_words_for_action(action)
  return words_for_action(action).presence unless cache_enabled?

  Discourse.cache.fetch(cache_key(action), expires_in: 1.day) { words_for_action(action).presence }
end
|
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Maps every cached word of +action+ to its whole-word regexp source for the
# given +engine+.
#
# Returns a Hash of regexp source => word attributes, or nil when the action
# has no words.
def self.regexps_for_action(action, engine: :ruby)
  words = cached_words_for_action(action)
  return if words.nil?

  words.values.to_h { |attrs| [word_to_regexp(attrs[:word], engine: engine), attrs] }
end
|
|
|
|
|
2019-08-01 01:33:49 +08:00
|
|
|
# This regexp is run in miniracer, and the client JS app.
# Make sure it is compatible with major browsers when changing.
# hint: non-chrome browsers do not support 'lookbehind'
#
# Compiles the watched words of +action+ into at most two Regexp objects:
# one for the case-sensitive words and one (with IGNORECASE) for the rest.
#
# action       - watched-word action whose words are compiled.
# engine       - regexp flavour handed to match_word_regexp (:ruby or :js).
# raise_errors - when true, a word that is not a valid regexp re-raises the
#                RegexpError; when false, such words are silently skipped.
#
# Returns an Array of Regexp ([] when the action has no words). A group in
# which no usable pattern remains is omitted altogether: joining zero patterns
# would produce an empty regexp, and an empty regexp matches every string.
def self.compiled_regexps_for_action(action, engine: :ruby, raise_errors: false)
  words = cached_words_for_action(action)
  return [] if words.blank?

  regexp_mode = SiteSetting.watched_words_regular_expressions?

  words
    .values
    .group_by { |attrs| attrs[:case_sensitive] ? :case_sensitive : :case_insensitive }
    .map do |group_key, attrs_list|
      # Keep only the words that yield a valid, non-blank pattern
      patterns =
        attrs_list
          .map do |attrs|
            pattern = word_to_regexp(attrs[:word], match_word: regexp_mode)
            begin
              pattern if Regexp.new(pattern)
            rescue RegexpError
              raise if raise_errors
            end
          end
          .select { |pattern| pattern.present? }

      # Nothing usable in this group: skip it instead of compiling ""
      next if patterns.empty?

      # Compile all watched words into a single regular expression
      source = patterns.join("|")

      # Add word boundaries to the regexp for regular watched words
      source = match_word_regexp(source, engine: engine) unless regexp_mode

      # Add case insensitive flag if needed
      Regexp.new(source, group_key == :case_sensitive ? nil : Regexp::IGNORECASE)
    end
    .compact
end
|
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Serializes the compiled regexps of +action+ into plain data: one
# single-entry Hash per regexp, mapping its source to a :case_sensitive flag
# (true when the regexp does not ignore case).
def self.serialized_regexps_for_action(action, engine: :ruby)
  compiled = compiled_regexps_for_action(action, engine: engine)
  compiled.map { |regexp| { regexp.source => { case_sensitive: !regexp.casefold? } } }
end
|
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Converts a watched word into a regexp source string.
#
# With the watched_words_regular_expressions site setting on, the word is
# already a regexp and is returned as is (wrapped in a capture group when
# +match_word+ is true). Otherwise the word is escaped, "*" wildcards become
# \S*, and with +match_word+ the result is wrapped by match_word_regexp for
# the given +engine+.
def self.word_to_regexp(word, engine: :ruby, match_word: true)
  if SiteSetting.watched_words_regular_expressions?
    return match_word ? "(#{word})" : word
  end

  # Escape the regexp metacharacters by hand: Regexp.escape would escape more
  # than wanted here (whitespace, dashes, ...).
  escaped = word.gsub(/([.*+?^${}()|\[\]\\])/) { |char| "\\#{char}" }

  # An escaped "*" is the wildcard: let it match any run of non-whitespace.
  pattern = escaped.gsub("\\*") { '\S*' }

  match_word ? match_word_regexp(pattern, engine: engine) : pattern
end
|
|
|
|
|
2021-06-03 09:39:12 +08:00
|
|
|
# Censors watched words inside an HTML string. Only text nodes are rewritten,
# so tag names and attributes are left alone. Returns +html+ untouched when
# there is nothing to censor, otherwise the re-serialized fragment.
def self.censor(html)
  regexps = compiled_regexps_for_action(:censor)
  return html if regexps.blank?

  fragment = Nokogiri::HTML5.fragment(html)
  fragment.traverse do |node|
    next unless node.text?

    regexps.each { |regexp| node.content = censor_text_with_regexp(node.content, regexp) }
  end

  fragment.to_s
end
|
|
|
|
|
2022-05-25 19:51:47 +08:00
|
|
|
# Censors watched words in plain text by running every compiled :censor
# regexp over it in turn. Blank text, or text with no censor words configured,
# is returned unchanged.
def self.censor_text(text)
  return text if text.blank?

  regexps = compiled_regexps_for_action(:censor)
  return text if regexps.blank?

  censored = text
  regexps.each { |regexp| censored = censor_text_with_regexp(censored, regexp) }
  censored
end
|
|
|
|
|
2022-08-09 03:34:51 +08:00
|
|
|
# Applies the :replace watched words to +text+; blank text is returned as is.
def self.replace_text(text)
  text.blank? ? text : replace(text, :replace)
end
|
2022-07-26 23:15:42 +08:00
|
|
|
|
2023-03-01 10:43:34 +08:00
|
|
|
# Applies the :link watched words to +text+; blank text is returned as is.
def self.replace_link(text)
  text.blank? ? text : replace(text, :link)
end
|
|
|
|
|
2022-08-09 03:34:51 +08:00
|
|
|
# Runs the text-rewriting watched-word actions over +text+ in a fixed order:
# censor first, then replace, then link.
def self.apply_to_text(text)
  replace_link(replace_text(censor_text(text)))
end
|
|
|
|
|
2017-06-29 04:56:44 +08:00
|
|
|
# Truthy when the text matches a watched word of the :require_approval action
# (the first match is returned, see word_matches_for_action?).
def requires_approval?
  word_matches_for_action?(:require_approval)
end
|
|
|
|
|
|
|
|
# Truthy when the text matches a watched word of the :flag action
# (the first match is returned, see word_matches_for_action?).
def should_flag?
  word_matches_for_action?(:flag)
end
|
|
|
|
|
|
|
|
# Checks the text against the :block watched words. Unlike the other
# predicates it asks for every match, so the result (when truthy) is the
# list of matched words rather than a single match.
def should_block?
  word_matches_for_action?(:block, all_matches: true)
end
|
|
|
|
|
2021-05-28 00:19:58 +08:00
|
|
|
# Truthy when the text matches a watched word of the :silence action
# (the first match is returned, see word_matches_for_action?).
def should_silence?
  word_matches_for_action?(:silence)
end
|
|
|
|
|
2019-07-22 19:59:56 +08:00
|
|
|
# Matches the text against the compiled regexps of +action+.
#
# all_matches: false - returns the MatchData of the first regexp that
#                      matches, or nil.
# all_matches: true  - returns the sorted, de-duplicated list of every matched
#                      word, or nil when nothing was collected.
#
# Returns nil right away when the action has no regexps.
def word_matches_for_action?(action, all_matches: false)
  regexps = self.class.compiled_regexps_for_action(action)
  return if regexps.blank?

  unless all_matches
    regexps.each do |regexp|
      first_match = regexp.match(@raw)
      return first_match if first_match
    end
    return
  end

  found = []
  regexps.each do |regexp|
    next unless regexp.match(@raw)

    hits = @raw.scan(regexp)
    found.concat(
      if SiteSetting.watched_words_regular_expressions?
        # One capture group per word: keep the group that actually matched
        hits.map { |hit| hit.is_a?(Array) ? hit.find(&:present?) : hit }.uniq
      else
        hits.flatten
      end,
    )
  end
  return if found.blank?

  found.compact.uniq.sort
end
|
|
|
|
|
|
|
|
# Tells whether a single watched +word+ occurs in the text as a whole word.
# The comparison ignores case unless +case_sensitive+ is true.
def word_matches?(word, case_sensitive: false)
  flags = case_sensitive ? nil : Regexp::IGNORECASE
  pattern = WordWatcher.word_to_regexp(word)
  Regexp.new(pattern, flags).match?(@raw)
end
|
2021-03-03 16:53:38 +08:00
|
|
|
|
2022-08-02 16:06:03 +08:00
|
|
|
# Replaces every match of +regexp+ in +text+ with +replacement+.
#
# When watched words are plain words (regular-expression mode off), the
# regexp built by match_word_regexp consumes the non-word character in front
# of the word; that character must survive the replacement. Its length is
# taken from the offset of capture group 1 (the word itself) rather than
# guessed from the first character of the match: a watched word can itself
# start with a non-word character (":smile:", or a non-ASCII letter, which
# \W also matches), and guessing would then keep the word's own first
# character. Regexps without a usable group 1 fall back to the first-character
# check.
#
# text        - String to rewrite.
# regexp      - Regexp to match.
# replacement - replacement text (nil is treated as an empty string).
#
# Returns the rewritten String.
def self.replace_text_with_regexp(text, regexp, replacement)
  text.gsub(regexp) do |match|
    match_data = Regexp.last_match
    prefix = ""

    unless SiteSetting.watched_words_regular_expressions?
      word_start = match_data.size > 1 ? match_data.begin(1) : nil

      if word_start
        # Everything matched before the captured word is the boundary prefix
        prefix = match[0, word_start - match_data.begin(0)]
      elsif match[0]&.match?(/\W/)
        prefix = match[0]
      end
    end

    "#{prefix}#{replacement}"
  end
end
|
2022-05-25 19:51:47 +08:00
|
|
|
|
2022-08-02 16:06:03 +08:00
|
|
|
# Internal helper: keep it out of the public class-level API.
private_class_method :replace_text_with_regexp
|
2022-05-25 19:51:47 +08:00
|
|
|
|
|
|
|
# Masks every match of +regexp+ in +text+ with REPLACEMENT_LETTER, one letter
# per censored character.
#
# The match may start with characters that are not part of the word and must
# stay visible:
# * regular-expression mode off: the regexp built by match_word_regexp
#   consumes the non-word character in front of the word (whitespace, but
#   also "(" or ","). Its length is the offset of capture group 1 inside
#   the match.
# * otherwise (or when the regexp has no capture group): only leading
#   whitespace is preserved.
#
# Returns the censored String.
def self.censor_text_with_regexp(text, regexp)
  text.gsub(regexp) do |match|
    match_data = Regexp.last_match

    word_start =
      if !SiteSetting.watched_words_regular_expressions? && match_data.size > 1
        match_data.begin(1)
      end

    padding =
      if word_start
        word_start - match_data.begin(0)
      else
        match.size - match.lstrip.size
      end

    match[0, padding] + REPLACEMENT_LETTER * (match.size - padding)
  end
end
|
2022-08-02 16:06:03 +08:00
|
|
|
|
|
|
|
# Internal helper: keep it out of the public class-level API.
private_class_method :censor_text_with_regexp
|
2023-03-01 10:43:34 +08:00
|
|
|
|
2023-11-01 22:41:10 +08:00
|
|
|
# Wraps a regexp source so that it only matches whole words: the pattern is
# put in a capture group that must be preceded by a non-word character (or
# the start of a line) and followed by one (or the end of a line).
#
# engine - :ruby or :js, selecting the character-class syntax to emit.
#
# Raises a RuntimeError for any other engine.
def self.match_word_regexp(regexp, engine: :ruby)
  case engine
  when :js
    "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
  when :ruby
    "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
  else
    raise "unknown regexp engine: #{engine}"
  end
end
|
|
|
|
|
|
|
|
# Internal helper: keep it out of the public class-level API.
private_class_method :match_word_regexp
|
2023-03-01 10:43:34 +08:00
|
|
|
|
|
|
|
# Applies every watched word of the +watch_word_type+ action to +text+, one
# after the other, substituting each word's :replacement. Words are matched
# case-insensitively unless their :case_sensitive attribute is set.
# Returns +text+ unchanged when the action has no words.
def self.replace(text, watch_word_type)
  result = text

  regexps_for_action(watch_word_type).to_a.each do |word_regexp, attrs|
    flags = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
    result = replace_text_with_regexp(result, Regexp.new(word_regexp, flags), attrs[:replacement])
  end

  result
end
|
2023-11-01 22:41:10 +08:00
|
|
|
|
|
|
|
# Internal helper: callers go through replace_text / replace_link instead.
private_class_method :replace
|
2017-06-29 04:56:44 +08:00
|
|
|
end
|