mirror of
https://github.com/discourse/discourse.git
synced 2024-11-27 08:03:40 +08:00
862007fb18
* FEATURE: Add case-sensitivity flag to watched_words Currently, all watched words are matched case-insensitively. This flag allows a watched word to be flagged for case-sensitive matching. To allow allow for backwards compatibility the flag is set to false by default. * FEATURE: Support case-sensitive creation of Watched Words via API Extend admin creation and upload of Watched Words to support case sensitive flag. This lays the ground work for supporting case-insensitive matching of Watched Words. Support for an extra column has also been introduced for the Watched Words upload CSV file. The new column structure is as follows: word,replacement,case_sentive * FEATURE: Enable case-sensitive matching of Watched Words WordWatcher's word_matcher_regexp now returns a list of regular expressions instead of one case-insensitive regular expression. With the ability to flag a Watched Word as case-sensitive, an action can have words of both sensitivities.This makes the use of the global Regexp::IGNORECASE flag added to all words problematic. To get around platform limitations around the use of subexpression level switches/flags, a list of regular expressions is returned instead, one for each case sensitivity. Word matching has also been updated to use this list of regular expressions instead of one. * FEATURE: Use case-sensitive regular expressions for Watched Words Update Watched Words regular expressions matching and processing to handle the extra metadata which comes along with the introduction of case-sensitive Watched Words. This allows case-sensitive Watched Words to matched as such. * DEV: Simplify type casting of case-sensitive flag from uploads Use builtin semantics instead of a custom method for converting string case flags in uploaded Watched Words to boolean. * UX: Add case-sensitivity details to Admin Watched Words UI Update Watched Word form to include a toggle for case-sensitivity. This also adds support for, case-sensitive testing and matching of Watched Word in the admin UI. * DEV: Code improvements from review feedback - Extract watched word regex creation out to a utility function - Make JS array presence check more explicit and readable * DEV: Extract Watched Word regex creation to utility function Clean-up work from review feedback. Reduce code duplication. * DEV: Rename word_matcher_regexp to word_matcher_regexp_list Since a list is returned now instead of a single regular expression, change `word_matcher_regexp` to `word_matcher_regexp_list` to better communicate this change. * DEV: Incorporate WordWatcher updates from upstream Resolve conflicts and ensure apply_to_text does not remove non-word characters in matches that aren't at the beginning of the line.
242 lines
6.2 KiB
Ruby
242 lines
6.2 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
class WordWatcher
|
|
REPLACEMENT_LETTER ||= CGI.unescape_html("■")
|
|
|
|
def initialize(raw)
|
|
@raw = raw
|
|
end
|
|
|
|
@cache_enabled = true
|
|
|
|
def self.disable_cache
|
|
@cache_enabled = false
|
|
end
|
|
|
|
def self.cache_enabled?
|
|
@cache_enabled
|
|
end
|
|
|
|
def self.words_for_action(action)
|
|
WatchedWord
|
|
.where(action: WatchedWord.actions[action.to_sym])
|
|
.limit(WatchedWord::MAX_WORDS_PER_ACTION)
|
|
.order(:id)
|
|
.pluck(:word, :replacement, :case_sensitive)
|
|
.map { |w, r, c| [w, { replacement: r, case_sensitive: c }.compact] }
|
|
.to_h
|
|
end
|
|
|
|
def self.words_for_action_exists?(action)
|
|
WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
|
|
end
|
|
|
|
def self.get_cached_words(action)
|
|
if cache_enabled?
|
|
Discourse.cache.fetch(word_matcher_regexp_key(action), expires_in: 1.day) do
|
|
words_for_action(action).presence
|
|
end
|
|
else
|
|
words_for_action(action).presence
|
|
end
|
|
end
|
|
|
|
def self.serializable_word_matcher_regexp(action)
|
|
word_matcher_regexp_list(action)
|
|
.map { |r| { r.source => { case_sensitive: !r.casefold? } } }
|
|
end
|
|
|
|
# This regexp is run in miniracer, and the client JS app
|
|
# Make sure it is compatible with major browsers when changing
|
|
# hint: non-chrome browsers do not support 'lookbehind'
|
|
def self.word_matcher_regexp_list(action, raise_errors: false)
|
|
words = get_cached_words(action)
|
|
return [] if words.blank?
|
|
|
|
grouped_words = { case_sensitive: [], case_insensitive: [] }
|
|
|
|
words.each do |w, attrs|
|
|
word = word_to_regexp(w)
|
|
word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
|
|
|
|
group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
|
|
grouped_words[group_key] << word
|
|
end
|
|
|
|
regexps = grouped_words
|
|
.select { |_, w| w.present? }
|
|
.transform_values { |w| w.join("|") }
|
|
|
|
if !SiteSetting.watched_words_regular_expressions?
|
|
regexps.transform_values! do |regexp|
|
|
regexp = "(#{regexp})"
|
|
"(?:\\W|^)#{regexp}(?=\\W|$)"
|
|
end
|
|
end
|
|
|
|
regexps
|
|
.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
|
|
rescue RegexpError
|
|
raise if raise_errors
|
|
[] # Admin will be alerted via admin_dashboard_data.rb
|
|
end
|
|
|
|
def self.word_matcher_regexps(action)
|
|
if words = get_cached_words(action)
|
|
words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
|
|
end
|
|
end
|
|
|
|
def self.word_to_regexp(word, whole: false)
|
|
if SiteSetting.watched_words_regular_expressions?
|
|
# Strip ruby regexp format if present
|
|
regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
|
|
regexp = "(#{regexp})" if whole
|
|
return regexp
|
|
end
|
|
|
|
regexp = Regexp.escape(word).gsub("\\*", '\S*')
|
|
|
|
if whole && !SiteSetting.watched_words_regular_expressions?
|
|
regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
|
|
end
|
|
|
|
regexp
|
|
end
|
|
|
|
def self.word_matcher_regexp_key(action)
|
|
"watched-words-list:#{action}"
|
|
end
|
|
|
|
def self.censor(html)
|
|
regexps = word_matcher_regexp_list(:censor)
|
|
return html if regexps.blank?
|
|
|
|
doc = Nokogiri::HTML5::fragment(html)
|
|
doc.traverse do |node|
|
|
regexps.each do |regexp|
|
|
node.content = censor_text_with_regexp(node.content, regexp) if node.text?
|
|
end
|
|
end
|
|
|
|
doc.to_s
|
|
end
|
|
|
|
def self.censor_text(text)
|
|
regexps = word_matcher_regexp_list(:censor)
|
|
return text if regexps.blank?
|
|
|
|
regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) }
|
|
end
|
|
|
|
def self.apply_to_text(text)
|
|
text = censor_text(text)
|
|
|
|
%i[replace link]
|
|
.flat_map { |type| word_matcher_regexps(type).to_a }
|
|
.reduce(text) do |t, (word_regexp, attrs)|
|
|
case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
|
|
replace_text_with_regexp(t, Regexp.new(word_regexp, case_flag), attrs[:replacement])
|
|
end
|
|
end
|
|
|
|
def self.clear_cache!
|
|
WatchedWord.actions.each do |a, i|
|
|
Discourse.cache.delete word_matcher_regexp_key(a)
|
|
end
|
|
end
|
|
|
|
def requires_approval?
|
|
word_matches_for_action?(:require_approval)
|
|
end
|
|
|
|
def should_flag?
|
|
word_matches_for_action?(:flag)
|
|
end
|
|
|
|
def should_block?
|
|
word_matches_for_action?(:block, all_matches: true)
|
|
end
|
|
|
|
def should_silence?
|
|
word_matches_for_action?(:silence)
|
|
end
|
|
|
|
def word_matches_for_action?(action, all_matches: false)
|
|
regexps = self.class.word_matcher_regexp_list(action)
|
|
return if regexps.blank?
|
|
|
|
match_list = []
|
|
regexps.each do |regexp|
|
|
match = regexp.match(@raw)
|
|
|
|
if !all_matches
|
|
return match if match
|
|
next
|
|
end
|
|
|
|
next if !match
|
|
|
|
if SiteSetting.watched_words_regular_expressions?
|
|
set = Set.new
|
|
@raw.scan(regexp).each do |m|
|
|
if Array === m
|
|
set.add(m.find(&:present?))
|
|
elsif String === m
|
|
set.add(m)
|
|
end
|
|
end
|
|
|
|
matches = set.to_a
|
|
else
|
|
matches = @raw.scan(regexp)
|
|
matches.flatten!
|
|
end
|
|
|
|
match_list.concat(matches)
|
|
end
|
|
|
|
return if match_list.blank?
|
|
|
|
match_list.compact!
|
|
match_list.uniq!
|
|
match_list.sort!
|
|
match_list
|
|
end
|
|
|
|
def word_matches?(word, case_sensitive: false)
|
|
Regexp
|
|
.new(WordWatcher.word_to_regexp(word, whole: true), case_sensitive ? nil : Regexp::IGNORECASE)
|
|
.match?(@raw)
|
|
end
|
|
|
|
def self.replace_text_with_regexp(text, regexp, replacement)
|
|
text.gsub(regexp) do |match|
|
|
prefix = ""
|
|
# match may be prefixed with a non-word character from the non-capturing group
|
|
# Ensure this isn't replaced if watched words regular expression is disabled.
|
|
if !SiteSetting.watched_words_regular_expressions? && (match[0] =~ /\W/) != nil
|
|
prefix = "#{match[0]}"
|
|
end
|
|
|
|
"#{prefix}#{replacement}"
|
|
end
|
|
end
|
|
|
|
private_class_method :replace_text_with_regexp
|
|
|
|
def self.censor_text_with_regexp(text, regexp)
|
|
text.gsub(regexp) do |match|
|
|
# the regex captures leading whitespaces
|
|
padding = match.size - match.lstrip.size
|
|
if padding > 0
|
|
match[0..padding - 1] + REPLACEMENT_LETTER * (match.size - padding)
|
|
else
|
|
REPLACEMENT_LETTER * match.size
|
|
end
|
|
end
|
|
end
|
|
|
|
private_class_method :censor_text_with_regexp
|
|
end
|