discourse/app/services/word_watcher.rb

# frozen_string_literal: true

# Matches text against the site's watched words.
#
# Class methods load the watched words for an action (through
# Discourse.cache), turn them into regular expressions and censor HTML.
# Instances wrap one raw string and answer whether that string triggers a
# given watched word action.
class WordWatcher
  # Character that replaces every character of a censored word.
  REPLACEMENT_LETTER ||= CGI.unescape_html("&#9632;")

  # @param raw [String] the text that will be tested against watched words
  def initialize(raw)
    @raw = raw
  end

  # Loads the watched words stored for +action+ (at most 1000 rows).
  #
  # @param action [Symbol, String] watched word action name
  # @return [Hash{String => String}] word => replacement, when the action
  #   has replacements
  # @return [Array<String>] the words, for every other action
  def self.words_for_action(action)
    words = WatchedWord.where(action: WatchedWord.actions[action.to_sym]).limit(1000)
    if WatchedWord.has_replacement?(action.to_sym)
      words.pluck(:word, :replacement).to_h
    else
      words.pluck(:word)
    end
  end

  # @param action [Symbol, String] watched word action name
  # @return [Boolean] whether at least one watched word exists for +action+
  def self.words_for_action_exists?(action)
    WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
  end

  # Same as {words_for_action}, but served through Discourse.cache with a
  # one day expiry. An empty word list is turned into nil by +presence+.
  def self.get_cached_words(action)
    Discourse.cache.fetch(word_matcher_regexp_key(action), expires_in: 1.day) do
      words_for_action(action).presence
    end
  end

  # Builds one case-insensitive regexp matching any watched word of +action+.
  #
  # This regexp is run in miniracer, and the client JS app
  # Make sure it is compatible with major browsers when changing
  # hint: non-chrome browsers do not support 'lookbehind'
  #
  # @param action [Symbol, String] watched word action name
  # @param raise_errors [Boolean] re-raise RegexpError instead of returning nil
  # @return [Regexp, nil] nil when there are no words, or when the generated
  #   expression is invalid and +raise_errors+ is false
  def self.word_matcher_regexp(action, raise_errors: false)
    words = get_cached_words(action)
    return unless words

    # Actions with replacements are cached as word => replacement hashes.
    words = words.keys if WatchedWord.has_replacement?(action.to_sym)
    regex_mode = SiteSetting.watched_words_regular_expressions?

    alternatives = words.map do |w|
      word = word_to_regexp(w)
      # In regex mode every word gets its own capture group.
      regex_mode ? "(#{word})" : word
    end

    source = alternatives.join('|')
    # Plain words share one capture group wrapped in word boundaries. The
    # leading boundary character, if any, becomes part of the match.
    source = "(?:\\W|^)(#{source})(?=\\W|$)" unless regex_mode

    Regexp.new(source, Regexp::IGNORECASE)
  rescue RegexpError
    raise if raise_errors
    nil # Admin will be alerted via admin_dashboard_data.rb
  end

  # @param action [Symbol, String] watched word action name
  # @return [Hash, nil] regexp source (built with +whole: true+) mapped to the
  #   second element of each cached entry; nil when there are no words
  def self.word_matcher_regexps(action)
    if words = get_cached_words(action)
      words.map { |w, r| [word_to_regexp(w, whole: true), r] }.to_h
    end
  end

  # Converts one watched word into regexp source.
  #
  # @param word [String] the watched word
  # @param whole [Boolean] wrap the result in a capture group (regex mode)
  #   or in word boundaries plus a capture group (plain mode)
  # @return [String] regexp source
  def self.word_to_regexp(word, whole: false)
    if SiteSetting.watched_words_regular_expressions?
      # Strip ruby regexp format if present, we're going to make the whole thing
      # case insensitive anyway
      regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
      return whole ? "(#{regexp})" : regexp
    end

    # Plain words are escaped; "*" acts as a wildcard for non-whitespace.
    regexp = Regexp.escape(word).gsub("\\*", '\S*')
    whole ? "(?:\\W|^)(#{regexp})(?=\\W|$)" : regexp
  end

  # @return [String] cache key under which the words of +action+ are stored
  def self.word_matcher_regexp_key(action)
    "watched-words-list:#{action}"
  end

  # Censors watched words inside the text nodes of an HTML fragment.
  #
  # @param html [String] HTML fragment
  # @return [String] +html+ itself when there is no censor regexp, otherwise
  #   the serialized fragment with censored text nodes
  def self.censor(html)
    regexp = WordWatcher.word_matcher_regexp(:censor)
    return html if regexp.blank?

    doc = Nokogiri::HTML5::fragment(html)
    doc.traverse do |node|
      node.content = censor_text_with_regexp(node.content, regexp) if node.text?
    end
    doc.to_s
  end

  # Replaces every match of +regexp+ in +text+ with REPLACEMENT_LETTER,
  # keeping the length of the text unchanged.
  #
  # The matcher regexp consumes the boundary character in front of a word,
  # which can be punctuation as well as whitespace. That character is kept
  # (located through the first participating capture group), and so is any
  # leading whitespace contained in the match.
  #
  # @param text [String] plain text
  # @param regexp [Regexp] expression whose matches are censored
  # @return [String] censored copy of +text+
  def self.censor_text_with_regexp(text, regexp)
    text.gsub(regexp) do |match|
      data = Regexp.last_match
      group = (1...data.size).find { |index| data[index] }
      boundary = group ? data.begin(group) - data.begin(0) : 0
      whitespace = match.size - match.lstrip.size
      padding = [boundary, whitespace].max

      match[0, padding] + REPLACEMENT_LETTER * (match.size - padding)
    end
  end

  # Deletes the cached word list of every watched word action.
  def self.clear_cache!
    WatchedWord.actions.each do |action, _|
      Discourse.cache.delete word_matcher_regexp_key(action)
    end
  end

  # @return [MatchData, nil, false] see {#word_matches_for_action?}
  def requires_approval?
    word_matches_for_action?(:require_approval)
  end

  # @return [MatchData, nil, false] see {#word_matches_for_action?}
  def should_flag?
    word_matches_for_action?(:flag)
  end

  # @return [Array<String>, nil, false] every matched word; see
  #   {#word_matches_for_action?}
  def should_block?
    word_matches_for_action?(:block, all_matches: true)
  end

  # @return [MatchData, nil, false] see {#word_matches_for_action?}
  def should_silence?
    word_matches_for_action?(:silence)
  end

  # Tests the wrapped text against the watched words of +action+.
  #
  # @param action [Symbol, String] watched word action name
  # @param all_matches [Boolean] collect every matched word instead of
  #   stopping at the first match
  # @return [false] when no regexp could be built for +action+
  # @return [MatchData, nil] the first match (nil for none) when
  #   +all_matches+ is false, and nil when nothing matches at all
  # @return [Array<String>] sorted, unique matched words when +all_matches+
  #   is true and something matched
  def word_matches_for_action?(action, all_matches: false)
    regexp = self.class.word_matcher_regexp(action)
    return false unless regexp

    match = regexp.match(@raw)
    return match if !all_matches || !match

    if SiteSetting.watched_words_regular_expressions?
      # Every word has its own group, so each scan row holds one matched
      # group among nils; keep the first present one.
      set = Set.new
      @raw.scan(regexp).each do |m|
        if Array === m
          set.add(m.find(&:present?))
        elsif String === m
          set.add(m)
        end
      end
      matches = set.to_a
    else
      matches = @raw.scan(regexp).flatten.uniq
    end

    matches.compact!
    matches.sort!
    matches
  end

  # @param word [String] a single watched word
  # @return [Boolean] whether the wrapped text contains +word+
  #   (case-insensitive, using the +whole: true+ form of the word)
  def word_matches?(word)
    Regexp.new(WordWatcher.word_to_regexp(word, whole: true), Regexp::IGNORECASE).match?(@raw)
  end
end
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`# frozen_string_literal: true`

FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`class WordWatcher`
FEATURE: Censor Oneboxes (#12902) Previously onebox content was not passed by the censor regex, meaning you could sneak in censored words via onebox. 2021-06-03 09:39:12 +08:00			`REPLACEMENT_LETTER \|\|= CGI.unescape_html("&#9632;")`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00
			`def initialize(raw)`
			`@raw = raw`
			`end`

			`def self.words_for_action(action)`
Add watched words of type "replace" (#12020) This commit includes other various improvements to watched words. auto_silence_first_post_regex site setting was removed because it overlapped with 'require approval' watched words. 2021-02-25 20:00:58 +08:00			`words = WatchedWord.where(action: WatchedWord.actions[action.to_sym]).limit(1000)`
FIX: Split link watched words from replace (#13196) It was not clear that replace watched words can be used to replace text with URLs. This introduces a new watched word type that makes it easier to understand. 2021-06-02 13:36:49 +08:00			`if WatchedWord.has_replacement?(action.to_sym)`
Add watched words of type "replace" (#12020) This commit includes other various improvements to watched words. auto_silence_first_post_regex site setting was removed because it overlapped with 'require approval' watched words. 2021-02-25 20:00:58 +08:00			`words.pluck(:word, :replacement).to_h`
			`else`
			`words.pluck(:word)`
			`end`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`end`

			`def self.words_for_action_exists?(action)`
			`WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?`
			`end`

FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`def self.get_cached_words(action)`
			`Discourse.cache.fetch(word_matcher_regexp_key(action), expires_in: 1.day) do`
			`words_for_action(action).presence`
			`end`
			`end`

FIX: Various watched words improvements - Client-side censoring fixed for non-chrome browsers. (Regular expression rewritten to avoid lookback) - Regex generation is now done on the server, to reduce repeated logic, and make it easier to extend in plugins - Censor tests are moved to ruby, to ensure everything works end-to-end - If "watched words regular expressions" is enabled, warn the admin when the generated regex is invalid 2019-08-01 01:33:49 +08:00			`# This regexp is run in miniracer, and the client JS app`
			`# Make sure it is compatible with major browsers when changing`
			`# hint: non-chrome browsers do not support 'lookbehind'`
			`def self.word_matcher_regexp(action, raise_errors: false)`
FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`words = get_cached_words(action)`
			`if words`
FIX: Split link watched words from replace (#13196) It was not clear that replace watched words can be used to replace text with URLs. This introduces a new watched word type that makes it easier to understand. 2021-06-02 13:36:49 +08:00			`if WatchedWord.has_replacement?(action.to_sym)`
Add watched words of type "replace" (#12020) This commit includes other various improvements to watched words. auto_silence_first_post_regex site setting was removed because it overlapped with 'require approval' watched words. 2021-02-25 20:00:58 +08:00			`words = words.keys`
			`end`
FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`words = words.map do \|w\|`
			`word = word_to_regexp(w)`
			`word = "(#{word})" if SiteSetting.watched_words_regular_expressions?`
			`word`
			`end`
			`regexp = words.join('\|')`
			`if !SiteSetting.watched_words_regular_expressions?`
			`regexp = "(#{regexp})"`
FIX: Various watched words improvements - Client-side censoring fixed for non-chrome browsers. (Regular expression rewritten to avoid lookback) - Regex generation is now done on the server, to reduce repeated logic, and make it easier to extend in plugins - Censor tests are moved to ruby, to ensure everything works end-to-end - If "watched words regular expressions" is enabled, warn the admin when the generated regex is invalid 2019-08-01 01:33:49 +08:00			`regexp = "(?:\\W\|^)#{regexp}(?=\\W\|$)"`
FIX: Allow regular expressions to specify boundaries 2017-11-18 03:10:38 +08:00			`end`
FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`Regexp.new(regexp, Regexp::IGNORECASE)`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`end`
FEATURE: watch title for automatic tagging (#12782) Previously watched words ignored topic titles when applying auto tagging rules. Also copy has been improved to reflect how the system behaves. The text hints that we are only watching first post now 2021-04-21 23:16:25 +08:00			`rescue RegexpError`
FIX: Various watched words improvements - Client-side censoring fixed for non-chrome browsers. (Regular expression rewritten to avoid lookback) - Regex generation is now done on the server, to reduce repeated logic, and make it easier to extend in plugins - Censor tests are moved to ruby, to ensure everything works end-to-end - If "watched words regular expressions" is enabled, warn the admin when the generated regex is invalid 2019-08-01 01:33:49 +08:00			`raise if raise_errors`
			`nil # Admin will be alerted via admin_dashboard_data.rb`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`end`

FIX: Make replace watched words work with wildcard (#13084) Watched words are always regular expressions, regardless of whether watched_words_regular_expressions is enabled or not. Internally, wildcard characters are replaced with a regular expression that matches any non-whitespace character. 2021-05-18 17:09:47 +08:00			`def self.word_matcher_regexps(action)`
			`if words = get_cached_words(action)`
FIX: Add word boundaries to replace and tag watched words (#13405) The generated regular expressions did not contain \b which matched every text that contained the word, even if it was only a substring of a word. For example, if "art" was a watched word a post containing word "artist" matched. 2021-06-18 23:54:06 +08:00			`words.map { \|w, r\| [word_to_regexp(w, whole: true), r] }.to_h`
FIX: Make replace watched words work with wildcard (#13084) Watched words are always regular expressions, regardless of whether watched_words_regular_expressions is enabled or not. Internally, wildcard characters are replaced with a regular expression that matches any non-whitespace character. 2021-05-18 17:09:47 +08:00			`end`
			`end`

FIX: Add word boundaries to replace and tag watched words (#13405) The generated regular expressions did not contain \b which matched every text that contained the word, even if it was only a substring of a word. For example, if "art" was a watched word a post containing word "artist" matched. 2021-06-18 23:54:06 +08:00			`def self.word_to_regexp(word, whole: false)`
FIX: Allow regular expressions to specify boundaries 2017-11-18 03:10:38 +08:00			`if SiteSetting.watched_words_regular_expressions?`
			`# Strip ruby regexp format if present, we're going to make the whole thing`
			`# case insensitive anyway`
FIX: Add word boundaries to replace and tag watched words (#13405) The generated regular expressions did not contain \b which matched every text that contained the word, even if it was only a substring of a word. For example, if "art" was a watched word a post containing word "artist" matched. 2021-06-18 23:54:06 +08:00			`regexp = word.start_with?("(?-mix:") ? word[7..-2] : word`
			`regexp = "(#{regexp})" if whole`
			`return regexp`
FIX: Allow regular expressions to specify boundaries 2017-11-18 03:10:38 +08:00			`end`
FIX: Add word boundaries to replace and tag watched words (#13405) The generated regular expressions did not contain \b which matched every text that contained the word, even if it was only a substring of a word. For example, if "art" was a watched word a post containing word "artist" matched. 2021-06-18 23:54:06 +08:00
			`regexp = Regexp.escape(word).gsub("\\*", '\S*')`

			`if whole && !SiteSetting.watched_words_regular_expressions?`
			`regexp = "(?:\\W\|^)(#{regexp})(?=\\W\|$)"`
			`end`

			`regexp`
FEATURE: Support regular expressions for watched words 2017-09-28 03:48:57 +08:00			`end`

FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`def self.word_matcher_regexp_key(action)`
FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`"watched-words-list:#{action}"`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`end`

FEATURE: Censor Oneboxes (#12902) Previously onebox content was not passed by the censor regex, meaning you could sneak in censored words via onebox. 2021-06-03 09:39:12 +08:00			`def self.censor(html)`
			`regexp = WordWatcher.word_matcher_regexp(:censor)`
			`return html if regexp.blank?`

			`doc = Nokogiri::HTML5::fragment(html)`
			`doc.traverse do \|node\|`
			`if node.text?`
			`node.content = node.content.gsub(regexp) do \|match\|`
			`# the regex captures leading whitespaces`
			`padding = match.size - match.lstrip.size`
			`if padding > 0`
			`match[0..padding - 1] + REPLACEMENT_LETTER * (match.size - padding)`
			`else`
			`REPLACEMENT_LETTER * match.size`
			`end`
			`end`
			`end`
			`end`
			`doc.to_s`
			`end`

FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`def self.clear_cache!`
FIX: iterate when clearing watched words cache 2019-06-25 07:06:32 +08:00			`WatchedWord.actions.each do \|a, i\|`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`Discourse.cache.delete word_matcher_regexp_key(a)`
			`end`
			`end`

			`def requires_approval?`
			`word_matches_for_action?(:require_approval)`
			`end`

			`def should_flag?`
			`word_matches_for_action?(:flag)`
			`end`

			`def should_block?`
FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`word_matches_for_action?(:block, all_matches: true)`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`end`

FEATURE: Silence watched word (#13160) This is a new type of watched word to replace the auto_silence_first_post_regex site setting. 2021-05-28 00:19:58 +08:00			`def should_silence?`
			`word_matches_for_action?(:silence)`
			`end`

FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`def word_matches_for_action?(action, all_matches: false)`
			`regexp = self.class.word_matcher_regexp(action)`
			`if regexp`
			`match = regexp.match(@raw)`
			`return match if !all_matches \|\| !match`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00
FEATURE: Watched words improvements (#7899) This commit contains 3 features: - FEATURE: Allow downloading watched words This introduces a button that allows admins to download watched words per action in a `.txt` file. - FEATURE: Allow clearing watched words in bulk This adds a "Clear All" button that clears all deleted words per action (e.g. block, flag etc.) - FEATURE: List all blocked words contained in the post when it's blocked When a post is rejected because it contains one or more blocked words, the error message now lists all the blocked words contained in the post. ------- This also changes the format of the file for importing watched words from `.csv` to `.txt` so it becomes inconsistent with the extension of the file when watched words are exported. 2019-07-22 19:59:56 +08:00			`if SiteSetting.watched_words_regular_expressions?`
			`set = Set.new`
			`@raw.scan(regexp).each do \|m\|`
			`if Array === m`
			`set.add(m.find(&:present?))`
			`elsif String === m`
			`set.add(m)`
			`end`
			`end`
			`matches = set.to_a`
			`else`
			`matches = @raw.scan(regexp)`
			`matches.flatten!`
			`matches.uniq!`
			`end`
			`matches.compact!`
			`matches.sort!`
			`matches`
			`else`
			`false`
			`end`
			`end`
FEATURE: Autotag watched words (#12244) New topics will be matched against a set of watched words and be tagged accordingly. 2021-03-03 16:53:38 +08:00
FIX: Make autotag watched words case insensitive (#13043) * FIX: Hide tag watched words if tagging is disabled These 'autotag' words were shown even if tagging was disabled. * FIX: Make autotag watched words case insensitive This commit also fixes the bug when no tag was applied if no other tag was already present. 2021-05-14 21:52:10 +08:00			`def word_matches?(word)`
FIX: Add word boundaries to replace and tag watched words (#13405) The generated regular expressions did not contain \b which matched every text that contained the word, even if it was only a substring of a word. For example, if "art" was a watched word a post containing word "artist" matched. 2021-06-18 23:54:06 +08:00			`Regexp.new(WordWatcher.word_to_regexp(word, whole: true), Regexp::IGNORECASE).match?(@raw)`
FEATURE: Autotag watched words (#12244) New topics will be matched against a set of watched words and be tagged accordingly. 2021-03-03 16:53:38 +08:00			`end`
FEATURE: Watched Words: when posts contain words, do one of flag, require approval, censor, or block 2017-06-29 04:56:44 +08:00			`end`