mirror of
https://github.com/discourse/discourse.git
synced 2024-11-24 11:33:06 +08:00
277496b6e0
These have been broken since fd07c943ad
because watched words were not correctly transformed to regexps.
This partially reverts the changes.
286 lines
7.5 KiB
Ruby
286 lines
7.5 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
class WordWatcher
|
|
REPLACEMENT_LETTER ||= CGI.unescape_html("■")
|
|
CACHE_VERSION ||= 3
|
|
|
|
def initialize(raw)
|
|
@raw = raw
|
|
end
|
|
|
|
@cache_enabled = true
|
|
|
|
def self.disable_cache
|
|
@cache_enabled = false
|
|
end
|
|
|
|
def self.cache_enabled?
|
|
@cache_enabled
|
|
end
|
|
|
|
def self.cache_key(action)
|
|
"watched-words-list:v#{CACHE_VERSION}:#{action}"
|
|
end
|
|
|
|
def self.clear_cache!
|
|
WatchedWord.actions.each { |action, _| Discourse.cache.delete(cache_key(action)) }
|
|
end
|
|
|
|
def self.words_for_action(action)
|
|
WatchedWord
|
|
.where(action: WatchedWord.actions[action.to_sym])
|
|
.limit(WatchedWord::MAX_WORDS_PER_ACTION)
|
|
.order(:id)
|
|
.pluck(:word, :replacement, :case_sensitive)
|
|
.to_h do |w, r, c|
|
|
[
|
|
word_to_regexp(w, match_word: false),
|
|
{ word: w, replacement: r, case_sensitive: c }.compact,
|
|
]
|
|
end
|
|
end
|
|
|
|
def self.words_for_action_exist?(action)
|
|
WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
|
|
end
|
|
|
|
def self.cached_words_for_action(action)
|
|
if cache_enabled?
|
|
Discourse
|
|
.cache
|
|
.fetch(cache_key(action), expires_in: 1.day) { words_for_action(action).presence }
|
|
else
|
|
words_for_action(action).presence
|
|
end
|
|
end
|
|
|
|
def self.regexps_for_action(action, engine: :ruby)
|
|
cached_words_for_action(action)&.to_h do |_, attrs|
|
|
[word_to_regexp(attrs[:word], engine: engine), attrs]
|
|
end
|
|
end
|
|
|
|
# This regexp is run in miniracer, and the client JS app
|
|
# Make sure it is compatible with major browsers when changing
|
|
# hint: non-chrome browsers do not support 'lookbehind'
|
|
def self.compiled_regexps_for_action(action, engine: :ruby, raise_errors: false)
|
|
words = cached_words_for_action(action)
|
|
return [] if words.blank?
|
|
|
|
words
|
|
.values
|
|
.group_by { |attrs| attrs[:case_sensitive] ? :case_sensitive : :case_insensitive }
|
|
.map do |group_key, attrs_list|
|
|
words = attrs_list.map { |attrs| attrs[:word] }
|
|
|
|
# Compile all watched words into a single regular expression
|
|
regexp =
|
|
words
|
|
.map do |word|
|
|
r = word_to_regexp(word, match_word: SiteSetting.watched_words_regular_expressions?)
|
|
begin
|
|
r if Regexp.new(r)
|
|
rescue RegexpError
|
|
raise if raise_errors
|
|
end
|
|
end
|
|
.select { |r| r.present? }
|
|
.join("|")
|
|
|
|
# Add word boundaries to the regexp for regular watched words
|
|
regexp =
|
|
match_word_regexp(
|
|
regexp,
|
|
engine: engine,
|
|
) if !SiteSetting.watched_words_regular_expressions?
|
|
|
|
# Add case insensitive flag if needed
|
|
Regexp.new(regexp, group_key == :case_sensitive ? nil : Regexp::IGNORECASE)
|
|
end
|
|
end
|
|
|
|
def self.serialized_regexps_for_action(action, engine: :ruby)
|
|
compiled_regexps_for_action(action, engine: engine).map do |r|
|
|
{ r.source => { case_sensitive: !r.casefold? } }
|
|
end
|
|
end
|
|
|
|
def self.word_to_regexp(word, engine: :ruby, match_word: true)
|
|
if SiteSetting.watched_words_regular_expressions?
|
|
regexp = word
|
|
regexp = "(#{regexp})" if match_word
|
|
regexp
|
|
else
|
|
# Convert word to regex by escaping special characters in a regexp.
|
|
# Avoid using Regexp.escape because it escapes more characters than
|
|
# it should (for example, whitespaces, dashes, etc)
|
|
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
|
|
|
|
# Convert wildcards to regexp
|
|
regexp = regexp.gsub("\\*", '\S*')
|
|
|
|
regexp = match_word_regexp(regexp, engine: engine) if match_word
|
|
regexp
|
|
end
|
|
end
|
|
|
|
def self.censor(html)
|
|
regexps = compiled_regexps_for_action(:censor)
|
|
return html if regexps.blank?
|
|
|
|
doc = Nokogiri::HTML5.fragment(html)
|
|
doc.traverse do |node|
|
|
regexps.each do |regexp|
|
|
node.content = censor_text_with_regexp(node.content, regexp) if node.text?
|
|
end
|
|
end
|
|
|
|
doc.to_s
|
|
end
|
|
|
|
def self.censor_text(text)
|
|
return text if text.blank?
|
|
|
|
regexps = compiled_regexps_for_action(:censor)
|
|
return text if regexps.blank?
|
|
|
|
regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) }
|
|
end
|
|
|
|
def self.replace_text(text)
|
|
return text if text.blank?
|
|
replace(text, :replace)
|
|
end
|
|
|
|
def self.replace_link(text)
|
|
return text if text.blank?
|
|
replace(text, :link)
|
|
end
|
|
|
|
def self.apply_to_text(text)
|
|
text = censor_text(text)
|
|
text = replace_text(text)
|
|
text = replace_link(text)
|
|
text
|
|
end
|
|
|
|
def requires_approval?
|
|
word_matches_for_action?(:require_approval)
|
|
end
|
|
|
|
def should_flag?
|
|
word_matches_for_action?(:flag)
|
|
end
|
|
|
|
def should_block?
|
|
word_matches_for_action?(:block, all_matches: true)
|
|
end
|
|
|
|
def should_silence?
|
|
word_matches_for_action?(:silence)
|
|
end
|
|
|
|
def word_matches_for_action?(action, all_matches: false)
|
|
regexps = self.class.compiled_regexps_for_action(action)
|
|
return if regexps.blank?
|
|
|
|
match_list = []
|
|
regexps.each do |regexp|
|
|
match = regexp.match(@raw)
|
|
|
|
if !all_matches
|
|
return match if match
|
|
next
|
|
end
|
|
|
|
next if !match
|
|
|
|
if SiteSetting.watched_words_regular_expressions?
|
|
set = Set.new
|
|
@raw
|
|
.scan(regexp)
|
|
.each do |m|
|
|
if Array === m
|
|
set.add(m.find(&:present?))
|
|
elsif String === m
|
|
set.add(m)
|
|
end
|
|
end
|
|
|
|
matches = set.to_a
|
|
else
|
|
matches = @raw.scan(regexp)
|
|
matches.flatten!
|
|
end
|
|
|
|
match_list.concat(matches)
|
|
end
|
|
|
|
return if match_list.blank?
|
|
|
|
match_list.compact!
|
|
match_list.uniq!
|
|
match_list.sort!
|
|
match_list
|
|
end
|
|
|
|
def word_matches?(word, case_sensitive: false)
|
|
options = case_sensitive ? nil : Regexp::IGNORECASE
|
|
Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
|
|
end
|
|
|
|
def self.replace_text_with_regexp(text, regexp, replacement)
|
|
text.gsub(regexp) do |match|
|
|
prefix = ""
|
|
# match may be prefixed with a non-word character from the non-capturing group
|
|
# Ensure this isn't replaced if watched words regular expression is disabled.
|
|
if !SiteSetting.watched_words_regular_expressions? && (match[0] =~ /\W/) != nil
|
|
prefix = "#{match[0]}"
|
|
end
|
|
|
|
"#{prefix}#{replacement}"
|
|
end
|
|
end
|
|
|
|
private_class_method :replace_text_with_regexp
|
|
|
|
def self.censor_text_with_regexp(text, regexp)
|
|
text.gsub(regexp) do |match|
|
|
# the regex captures leading whitespaces
|
|
padding = match.size - match.lstrip.size
|
|
if padding > 0
|
|
match[0..padding - 1] + REPLACEMENT_LETTER * (match.size - padding)
|
|
else
|
|
REPLACEMENT_LETTER * match.size
|
|
end
|
|
end
|
|
end
|
|
|
|
private_class_method :censor_text_with_regexp
|
|
|
|
# Returns a regexp that transforms a regular expression into a regular
|
|
# expression that matches a whole word.
|
|
def self.match_word_regexp(regexp, engine: :ruby)
|
|
if engine == :js
|
|
"(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
|
|
elsif engine == :ruby
|
|
"(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
|
|
else
|
|
raise "unknown regexp engine: #{engine}"
|
|
end
|
|
end
|
|
|
|
private_class_method :match_word_regexp
|
|
|
|
def self.replace(text, watch_word_type)
|
|
regexps_for_action(watch_word_type)
|
|
.to_a
|
|
.reduce(text) do |t, (word_regexp, attrs)|
|
|
case_flag = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
|
|
replace_text_with_regexp(t, Regexp.new(word_regexp, case_flag), attrs[:replacement])
|
|
end
|
|
end
|
|
|
|
private_class_method :replace
|
|
end
|