From 9a2780397faac09daa1012cd20667e1a30661ae2 Mon Sep 17 00:00:00 2001
From: Bianca Nenciu <nbianca@users.noreply.github.com>
Date: Mon, 15 May 2023 11:45:04 +0200
Subject: [PATCH] FIX: Handle all UTF-8 characters (#21344)

Watched words were converted to regular expressions containing \W, which
handled only ASCII characters. Using [^[:word]] instead ensures that
UTF-8 characters are also handled correctly.
---
 .../addon/utils/watched-words.js              |  2 +-
 app/serializers/site_serializer.rb            |  6 +--
 app/serializers/watched_word_serializer.rb    |  2 +-
 app/services/word_watcher.rb                  | 54 +++++++++++--------
 lib/pretty_text.rb                            |  6 +--
 spec/integration/watched_words_spec.rb        |  8 +++
 spec/services/word_watcher_spec.rb            |  4 +-
 7 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/app/assets/javascripts/discourse-common/addon/utils/watched-words.js b/app/assets/javascripts/discourse-common/addon/utils/watched-words.js
index d864b807352..8ff622ce550 100644
--- a/app/assets/javascripts/discourse-common/addon/utils/watched-words.js
+++ b/app/assets/javascripts/discourse-common/addon/utils/watched-words.js
@@ -1,6 +1,6 @@
 export function createWatchedWordRegExp(word) {
   const caseFlag = word.case_sensitive ? "" : "i";
-  return new RegExp(word.regexp, `${caseFlag}g`);
+  return new RegExp(word.regexp, `${caseFlag}gu`);
 }
 
 export function toWatchedWord(regexp) {
diff --git a/app/serializers/site_serializer.rb b/app/serializers/site_serializer.rb
index 7bafdb66976..140d7657c71 100644
--- a/app/serializers/site_serializer.rb
+++ b/app/serializers/site_serializer.rb
@@ -205,7 +205,7 @@ class SiteSerializer < ApplicationSerializer
   end
 
   def censored_regexp
-    WordWatcher.serializable_word_matcher_regexp(:censor)
+    WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js)
   end
 
   def custom_emoji_translation
@@ -221,11 +221,11 @@ class SiteSerializer < ApplicationSerializer
   end
 
   def watched_words_replace
-    WordWatcher.word_matcher_regexps(:replace)
+    WordWatcher.word_matcher_regexps(:replace, engine: :js)
   end
 
   def watched_words_link
-    WordWatcher.word_matcher_regexps(:link)
+    WordWatcher.word_matcher_regexps(:link, engine: :js)
   end
 
   def categories
diff --git a/app/serializers/watched_word_serializer.rb b/app/serializers/watched_word_serializer.rb
index a798091f282..135aca292b3 100644
--- a/app/serializers/watched_word_serializer.rb
+++ b/app/serializers/watched_word_serializer.rb
@@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
   attributes :id, :word, :regexp, :replacement, :action, :case_sensitive
 
   def regexp
-    WordWatcher.word_to_regexp(word, whole: true)
+    WordWatcher.word_to_regexp(word)
   end
 
   def action
diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb
index 80cf414d7b8..1899e963d27 100644
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@@ -44,23 +44,23 @@ class WordWatcher
     end
   end
 
-  def self.serializable_word_matcher_regexp(action)
-    word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
+  def self.serializable_word_matcher_regexp(action, engine: :ruby)
+    word_matcher_regexp_list(action, engine: engine).map do |r|
+      { r.source => { case_sensitive: !r.casefold? } }
+    end
   end
 
   # This regexp is run in miniracer, and the client JS app
   # Make sure it is compatible with major browsers when changing
   # hint: non-chrome browsers do not support 'lookbehind'
-  def self.word_matcher_regexp_list(action, raise_errors: false)
+  def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
     words = get_cached_words(action)
     return [] if words.blank?
 
     grouped_words = { case_sensitive: [], case_insensitive: [] }
 
-    words.each do |w, attrs|
-      word = word_to_regexp(w)
-      word = "(#{word})" if SiteSetting.watched_words_regular_expressions?
-
+    words.each do |word, attrs|
+      word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
       group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
       grouped_words[group_key] << word
     end
@@ -68,10 +68,7 @@ class WordWatcher
     regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }
 
     if !SiteSetting.watched_words_regular_expressions?
-      regexps.transform_values! do |regexp|
-        regexp = "(#{regexp})"
-        "(?:\\W|^)#{regexp}(?=\\W|$)"
-      end
+      regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
     end
 
     regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
@@ -80,29 +77,42 @@ class WordWatcher
     [] # Admin will be alerted via admin_dashboard_data.rb
   end
 
-  def self.word_matcher_regexps(action)
+  def self.word_matcher_regexps(action, engine: :ruby)
     if words = get_cached_words(action)
-      words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
+      words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
     end
   end
 
-  def self.word_to_regexp(word, whole: false)
+  def self.word_to_regexp(word, engine: :ruby, whole: true)
     if SiteSetting.watched_words_regular_expressions?
-      # Strip ruby regexp format if present
+      # Strip Ruby regexp format if present
       regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
       regexp = "(#{regexp})" if whole
       return regexp
     end
 
-    regexp = Regexp.escape(word).gsub("\\*", '\S*')
+    # Escape regular expression. Avoid using Regexp.escape because it escapes
+    # more characters than it should (for example, whitespaces)
+    regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
 
-    if whole && !SiteSetting.watched_words_regular_expressions?
-      regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
-    end
+    # Handle wildcards
+    regexp = regexp.gsub("\\*", '\S*')
+
+    regexp = wrap_regexp(regexp, engine: engine) if whole
 
     regexp
   end
 
+  def self.wrap_regexp(regexp, engine: :ruby)
+    if engine == :js
+      "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
+    elsif engine == :ruby
+      "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
+    else
+      "(?:\\W|^)(#{regexp})(?=\\W|$)"
+    end
+  end
+
   def self.word_matcher_regexp_key(action)
     "watched-words-list:v#{CACHE_VERSION}:#{action}"
   end
@@ -212,10 +222,8 @@ class WordWatcher
   end
 
   def word_matches?(word, case_sensitive: false)
-    Regexp.new(
-      WordWatcher.word_to_regexp(word, whole: true),
-      case_sensitive ? nil : Regexp::IGNORECASE,
-    ).match?(@raw)
+    options = case_sensitive ? nil : Regexp::IGNORECASE
+    Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
   end
 
   def self.replace_text_with_regexp(text, regexp, replacement)
diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb
index 986dbccecd0..871a9f431cc 100644
--- a/lib/pretty_text.rb
+++ b/lib/pretty_text.rb
@@ -204,9 +204,9 @@ module PrettyText
         __optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
         __optInput.emojiDenyList = #{Emoji.denied.to_json};
         __optInput.lookupUploadUrls = __lookupUploadUrls;
-        __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json};
-        __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json};
-        __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json};
+        __optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json};
+        __optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json};
+        __optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json};
         __optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
       JS
 
diff --git a/spec/integration/watched_words_spec.rb b/spec/integration/watched_words_spec.rb
index 2304fb6bce0..a7d6902fbdd 100644
--- a/spec/integration/watched_words_spec.rb
+++ b/spec/integration/watched_words_spec.rb
@@ -64,6 +64,14 @@ RSpec.describe WatchedWord do
       should_block_post(manager)
     end
 
+    it "should handle UTF-8 characters" do
+      block_word = Fabricate(:watched_word, action: WatchedWord.actions[:block], word: "abc")
+      manager =
+        NewPostManager.new(tl2_user, title: "Hello world", raw: "abcódef", topic_id: topic.id)
+
+      expect(manager.perform).to be_success
+    end
+
     it "should block the post from admin" do
       manager =
         NewPostManager.new(
diff --git a/spec/services/word_watcher_spec.rb b/spec/services/word_watcher_spec.rb
index f7b8881b2a5..4d55d6ab0eb 100644
--- a/spec/services/word_watcher_spec.rb
+++ b/spec/services/word_watcher_spec.rb
@@ -79,8 +79,8 @@ RSpec.describe WordWatcher do
 
         expect(regexps).to be_an(Array)
         expect(regexps.map(&:inspect)).to contain_exactly(
-          "/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i",
-          "/(?:\\W|^)(#{word3}|#{word4})(?=\\W|$)/",
+          "/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i",
+          "/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/",
         )
       end