From 9a2780397faac09daa1012cd20667e1a30661ae2 Mon Sep 17 00:00:00 2001 From: Bianca Nenciu Date: Mon, 15 May 2023 11:45:04 +0200 Subject: [PATCH] FIX: Handle all UTF-8 characters (#21344) Watched words were converted to regular expressions containing \W, which handled only ASCII characters. Using [^[:word]] instead ensures that UTF-8 characters are also handled correctly. --- .../addon/utils/watched-words.js | 2 +- app/serializers/site_serializer.rb | 6 +-- app/serializers/watched_word_serializer.rb | 2 +- app/services/word_watcher.rb | 54 +++++++++++-------- lib/pretty_text.rb | 6 +-- spec/integration/watched_words_spec.rb | 8 +++ spec/services/word_watcher_spec.rb | 4 +- 7 files changed, 49 insertions(+), 33 deletions(-) diff --git a/app/assets/javascripts/discourse-common/addon/utils/watched-words.js b/app/assets/javascripts/discourse-common/addon/utils/watched-words.js index d864b807352..8ff622ce550 100644 --- a/app/assets/javascripts/discourse-common/addon/utils/watched-words.js +++ b/app/assets/javascripts/discourse-common/addon/utils/watched-words.js @@ -1,6 +1,6 @@ export function createWatchedWordRegExp(word) { const caseFlag = word.case_sensitive ? "" : "i"; - return new RegExp(word.regexp, `${caseFlag}g`); + return new RegExp(word.regexp, `${caseFlag}gu`); } export function toWatchedWord(regexp) { diff --git a/app/serializers/site_serializer.rb b/app/serializers/site_serializer.rb index 7bafdb66976..140d7657c71 100644 --- a/app/serializers/site_serializer.rb +++ b/app/serializers/site_serializer.rb @@ -205,7 +205,7 @@ class SiteSerializer < ApplicationSerializer end def censored_regexp - WordWatcher.serializable_word_matcher_regexp(:censor) + WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js) end def custom_emoji_translation @@ -221,11 +221,11 @@ class SiteSerializer < ApplicationSerializer end def watched_words_replace - WordWatcher.word_matcher_regexps(:replace) + WordWatcher.word_matcher_regexps(:replace, engine: :js) end def watched_words_link - WordWatcher.word_matcher_regexps(:link) + WordWatcher.word_matcher_regexps(:link, engine: :js) end def categories diff --git a/app/serializers/watched_word_serializer.rb b/app/serializers/watched_word_serializer.rb index a798091f282..135aca292b3 100644 --- a/app/serializers/watched_word_serializer.rb +++ b/app/serializers/watched_word_serializer.rb @@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer attributes :id, :word, :regexp, :replacement, :action, :case_sensitive def regexp - WordWatcher.word_to_regexp(word, whole: true) + WordWatcher.word_to_regexp(word) end def action diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb index 80cf414d7b8..1899e963d27 100644 --- a/app/services/word_watcher.rb +++ b/app/services/word_watcher.rb @@ -44,23 +44,23 @@ class WordWatcher end end - def self.serializable_word_matcher_regexp(action) - word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } } + def self.serializable_word_matcher_regexp(action, engine: :ruby) + word_matcher_regexp_list(action, engine: engine).map do |r| + { r.source => { case_sensitive: !r.casefold? } } + end end # This regexp is run in miniracer, and the client JS app # Make sure it is compatible with major browsers when changing # hint: non-chrome browsers do not support 'lookbehind' - def self.word_matcher_regexp_list(action, raise_errors: false) + def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false) words = get_cached_words(action) return [] if words.blank? grouped_words = { case_sensitive: [], case_insensitive: [] } - words.each do |w, attrs| - word = word_to_regexp(w) - word = "(#{word})" if SiteSetting.watched_words_regular_expressions? - + words.each do |word, attrs| + word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?) group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive grouped_words[group_key] << word end @@ -68,10 +68,7 @@ class WordWatcher regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") } if !SiteSetting.watched_words_regular_expressions? - regexps.transform_values! do |regexp| - regexp = "(#{regexp})" - "(?:\\W|^)#{regexp}(?=\\W|$)" - end + regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) } end regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) } @@ -80,29 +77,42 @@ class WordWatcher [] # Admin will be alerted via admin_dashboard_data.rb end - def self.word_matcher_regexps(action) + def self.word_matcher_regexps(action, engine: :ruby) if words = get_cached_words(action) - words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h + words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h end end - def self.word_to_regexp(word, whole: false) + def self.word_to_regexp(word, engine: :ruby, whole: true) if SiteSetting.watched_words_regular_expressions? - # Strip ruby regexp format if present + # Strip Ruby regexp format if present regexp = word.start_with?("(?-mix:") ? word[7..-2] : word regexp = "(#{regexp})" if whole return regexp end - regexp = Regexp.escape(word).gsub("\\*", '\S*') + # Escape regular expression. Avoid using Regexp.escape because it escapes + # more characters than it should (for example, whitespaces) + regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1') - if whole && !SiteSetting.watched_words_regular_expressions? - regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)" - end + # Handle wildcards + regexp = regexp.gsub("\\*", '\S*') + + regexp = wrap_regexp(regexp, engine: engine) if whole regexp end + def self.wrap_regexp(regexp, engine: :ruby) + if engine == :js + "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)" + elsif engine == :ruby + "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)" + else + "(?:\\W|^)(#{regexp})(?=\\W|$)" + end + end + def self.word_matcher_regexp_key(action) "watched-words-list:v#{CACHE_VERSION}:#{action}" end @@ -212,10 +222,8 @@ class WordWatcher end def word_matches?(word, case_sensitive: false) - Regexp.new( - WordWatcher.word_to_regexp(word, whole: true), - case_sensitive ? nil : Regexp::IGNORECASE, - ).match?(@raw) + options = case_sensitive ? nil : Regexp::IGNORECASE + Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw) end def self.replace_text_with_regexp(text, regexp, replacement) diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb index 986dbccecd0..871a9f431cc 100644 --- a/lib/pretty_text.rb +++ b/lib/pretty_text.rb @@ -204,9 +204,9 @@ module PrettyText __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer; __optInput.emojiDenyList = #{Emoji.denied.to_json}; __optInput.lookupUploadUrls = __lookupUploadUrls; - __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json}; - __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json}; - __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json}; + __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json}; + __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json}; + __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json}; __optInput.additionalOptions = #{Site.markdown_additional_options.to_json}; JS diff --git a/spec/integration/watched_words_spec.rb b/spec/integration/watched_words_spec.rb index 2304fb6bce0..a7d6902fbdd 100644 --- a/spec/integration/watched_words_spec.rb +++ b/spec/integration/watched_words_spec.rb @@ -64,6 +64,14 @@ RSpec.describe WatchedWord do should_block_post(manager) end + it "should handle UTF-8 characters" do + block_word = Fabricate(:watched_word, action: WatchedWord.actions[:block], word: "abc") + manager = + NewPostManager.new(tl2_user, title: "Hello world", raw: "abcódef", topic_id: topic.id) + + expect(manager.perform).to be_success + end + it "should block the post from admin" do manager = NewPostManager.new( diff --git a/spec/services/word_watcher_spec.rb b/spec/services/word_watcher_spec.rb index f7b8881b2a5..4d55d6ab0eb 100644 --- a/spec/services/word_watcher_spec.rb +++ b/spec/services/word_watcher_spec.rb @@ -79,8 +79,8 @@ RSpec.describe WordWatcher do expect(regexps).to be_an(Array) expect(regexps.map(&:inspect)).to contain_exactly( - "/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i", - "/(?:\\W|^)(#{word3}|#{word4})(?=\\W|$)/", + "/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i", + "/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/", ) end