mirror of
https://github.com/discourse/discourse.git
synced 2025-01-20 04:32:46 +08:00
4cb3412a56
When we introduced unicode support in the regular expressions used in watched words (9a27803
) we didn't realize the cost adding the `u` flag would be.
Turns out, it's pretty bad when you have lots of regular expressions to test. A customer had slightly less than 200 watched words, and it would freeze the browser for about 2s on the first check of those regular expressions (roughly 10ms per regular expression).
This commit introduces a new field (`word`) to the serialized watched words which is then converted to a very fast and cheap regular expression on the client-side. We use that regexp to quicly check whether a matcher is even worth trying so that we don't incure the cost of compiling the expensive unicode regexp.
This commit also busts the `WordWatcher` cache since we added a new field to be serialized.
One nice side effect of using `matchAll` instead of a `while / exec` loop is that the likeliness of having a bad regexp matching infinitely is vastly reduced 🙌
440 lines
15 KiB
Ruby
440 lines
15 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
RSpec.describe WordWatcher do
|
|
let(:raw) { <<~RAW.strip }
|
|
Do you like liquorice?
|
|
|
|
|
|
I really like them. One could even say that I am *addicted* to liquorice. And if
|
|
you can mix it up with some anise, then I'm in heaven ;)
|
|
RAW
|
|
|
|
after { Discourse.redis.flushdb }
|
|
|
|
describe ".words_for_action" do
|
|
it "returns words with metadata including case sensitivity flag" do
|
|
Fabricate(:watched_word, action: WatchedWord.actions[:censor])
|
|
word1 = Fabricate(:watched_word, action: WatchedWord.actions[:block]).word
|
|
word2 =
|
|
Fabricate(:watched_word, action: WatchedWord.actions[:block], case_sensitive: true).word
|
|
|
|
expect(described_class.words_for_action(:block)).to include(
|
|
word1 => {
|
|
case_sensitive: false,
|
|
word: word1,
|
|
},
|
|
word2 => {
|
|
case_sensitive: true,
|
|
word: word2,
|
|
},
|
|
)
|
|
end
|
|
|
|
it "returns word with metadata including replacement if word has replacement" do
|
|
word =
|
|
Fabricate(
|
|
:watched_word,
|
|
action: WatchedWord.actions[:link],
|
|
replacement: "http://test.localhost/",
|
|
).word
|
|
|
|
expect(described_class.words_for_action(:link)).to include(
|
|
word => {
|
|
case_sensitive: false,
|
|
replacement: "http://test.localhost/",
|
|
word: word,
|
|
},
|
|
)
|
|
end
|
|
|
|
it "returns an empty hash when no words are present" do
|
|
expect(described_class.words_for_action(:tag)).to eq({})
|
|
end
|
|
end
|
|
|
|
describe ".word_matcher_regexp_list" do
|
|
let!(:word1) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word }
|
|
let!(:word2) { Fabricate(:watched_word, action: WatchedWord.actions[:block]).word }
|
|
let!(:word3) do
|
|
Fabricate(:watched_word, action: WatchedWord.actions[:block], case_sensitive: true).word
|
|
end
|
|
let!(:word4) do
|
|
Fabricate(:watched_word, action: WatchedWord.actions[:block], case_sensitive: true).word
|
|
end
|
|
|
|
context "when watched_words_regular_expressions = true" do
|
|
it "returns the proper regexp" do
|
|
SiteSetting.watched_words_regular_expressions = true
|
|
regexps = described_class.word_matcher_regexp_list(:block)
|
|
|
|
expect(regexps).to be_an(Array)
|
|
expect(regexps.map(&:inspect)).to contain_exactly(
|
|
"/(#{word1})|(#{word2})/i",
|
|
"/(#{word3})|(#{word4})/",
|
|
)
|
|
end
|
|
end
|
|
|
|
context "when watched_words_regular_expressions = false" do
|
|
it "returns the proper regexp" do
|
|
SiteSetting.watched_words_regular_expressions = false
|
|
regexps = described_class.word_matcher_regexp_list(:block)
|
|
|
|
expect(regexps).to be_an(Array)
|
|
expect(regexps.map(&:inspect)).to contain_exactly(
|
|
"/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i",
|
|
"/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/",
|
|
)
|
|
end
|
|
|
|
it "is empty for an action without watched words" do
|
|
regexps = described_class.word_matcher_regexp_list(:censor)
|
|
|
|
expect(regexps).to be_an(Array)
|
|
expect(regexps).to be_empty
|
|
end
|
|
end
|
|
|
|
context "when regular expression is invalid" do
|
|
before do
|
|
SiteSetting.watched_words_regular_expressions = true
|
|
Fabricate(:watched_word, word: "Test[\S*", action: WatchedWord.actions[:block])
|
|
end
|
|
|
|
it "does not raise an exception by default" do
|
|
expect { described_class.word_matcher_regexp_list(:block) }.not_to raise_error
|
|
end
|
|
|
|
it "raises an exception with raise_errors set to true" do
|
|
expect {
|
|
described_class.word_matcher_regexp_list(:block, raise_errors: true)
|
|
}.to raise_error(RegexpError)
|
|
end
|
|
end
|
|
end
|
|
|
|
describe "#word_matches_for_action?" do
|
|
it "is falsey when there are no watched words" do
|
|
expect(described_class.new(raw).word_matches_for_action?(:require_approval)).to be_falsey
|
|
end
|
|
|
|
context "with watched words" do
|
|
fab!(:anise) do
|
|
Fabricate(:watched_word, word: "anise", action: WatchedWord.actions[:require_approval])
|
|
end
|
|
|
|
it "is falsey without a match" do
|
|
expect(
|
|
described_class.new("No liquorice for me, thanks...").word_matches_for_action?(
|
|
:require_approval,
|
|
),
|
|
).to be_falsey
|
|
end
|
|
|
|
it "is returns matched words if there's a match" do
|
|
matches = described_class.new(raw).word_matches_for_action?(:require_approval)
|
|
expect(matches).to be_truthy
|
|
expect(matches[1]).to eq(anise.word)
|
|
end
|
|
|
|
it "finds at start of string" do
|
|
matches =
|
|
described_class.new("#{anise.word} is garbage").word_matches_for_action?(
|
|
:require_approval,
|
|
)
|
|
expect(matches[1]).to eq(anise.word)
|
|
end
|
|
|
|
it "finds at end of string" do
|
|
matches =
|
|
described_class.new("who likes #{anise.word}").word_matches_for_action?(:require_approval)
|
|
expect(matches[1]).to eq(anise.word)
|
|
end
|
|
|
|
it "finds non-letters in place of letters" do
|
|
Fabricate(:watched_word, word: "co(onut", action: WatchedWord.actions[:require_approval])
|
|
|
|
matches =
|
|
described_class.new("This co(onut is delicious.").word_matches_for_action?(
|
|
:require_approval,
|
|
)
|
|
expect(matches[1]).to eq("co(onut")
|
|
end
|
|
|
|
it "handles * for wildcards" do
|
|
Fabricate(:watched_word, word: "a**le*", action: WatchedWord.actions[:require_approval])
|
|
|
|
matches =
|
|
described_class.new("I acknowledge you.").word_matches_for_action?(:require_approval)
|
|
expect(matches[1]).to eq("acknowledge")
|
|
end
|
|
|
|
it "handles word boundary" do
|
|
Fabricate(:watched_word, word: "love", action: WatchedWord.actions[:require_approval])
|
|
expect(
|
|
described_class.new("I Love, bananas.").word_matches_for_action?(:require_approval)[1],
|
|
).to eq("Love")
|
|
expect(
|
|
described_class.new("I LOVE; apples.").word_matches_for_action?(:require_approval)[1],
|
|
).to eq("LOVE")
|
|
expect(
|
|
described_class.new("love: is a thing.").word_matches_for_action?(:require_approval)[1],
|
|
).to eq("love")
|
|
expect(
|
|
described_class.new("I love. oranges").word_matches_for_action?(:require_approval)[1],
|
|
).to eq("love")
|
|
expect(
|
|
described_class.new("I :love. pineapples").word_matches_for_action?(:require_approval)[1],
|
|
).to eq("love")
|
|
expect(
|
|
described_class.new("peace ,love and understanding.").word_matches_for_action?(
|
|
:require_approval,
|
|
)[
|
|
1
|
|
],
|
|
).to eq("love")
|
|
end
|
|
|
|
context "when there are multiple matches" do
|
|
context "with non regexp words" do
|
|
it "lists all matching words" do
|
|
%w[bananas hate hates].each do |word|
|
|
Fabricate(:watched_word, word: word, action: WatchedWord.actions[:block])
|
|
end
|
|
|
|
matches =
|
|
described_class.new("I hate bananas").word_matches_for_action?(
|
|
:block,
|
|
all_matches: true,
|
|
)
|
|
expect(matches).to contain_exactly("hate", "bananas")
|
|
|
|
matches =
|
|
described_class.new("She hates bananas too").word_matches_for_action?(
|
|
:block,
|
|
all_matches: true,
|
|
)
|
|
expect(matches).to contain_exactly("hates", "bananas")
|
|
end
|
|
end
|
|
|
|
context "with regexp words" do
|
|
before { SiteSetting.watched_words_regular_expressions = true }
|
|
|
|
it "lists all matching patterns" do
|
|
Fabricate(:watched_word, word: "(pine)?apples", action: WatchedWord.actions[:block])
|
|
Fabricate(
|
|
:watched_word,
|
|
word: "((move|store)(d)?)|((watch|listen)(ed|ing)?)",
|
|
action: WatchedWord.actions[:block],
|
|
)
|
|
|
|
matches =
|
|
described_class.new("pine pineapples apples").word_matches_for_action?(
|
|
:block,
|
|
all_matches: true,
|
|
)
|
|
expect(matches).to contain_exactly("pineapples", "apples")
|
|
|
|
matches =
|
|
described_class.new(
|
|
"go watched watch ed ing move d moveed moved moving",
|
|
).word_matches_for_action?(:block, all_matches: true)
|
|
expect(matches).to contain_exactly(*%w[watched watch move moved])
|
|
end
|
|
end
|
|
end
|
|
|
|
context "when word is an emoji" do
|
|
it "handles emoji" do
|
|
Fabricate(:watched_word, word: ":joy:", action: WatchedWord.actions[:require_approval])
|
|
|
|
matches =
|
|
described_class.new("Lots of emojis here :joy:").word_matches_for_action?(
|
|
:require_approval,
|
|
)
|
|
expect(matches[1]).to eq(":joy:")
|
|
end
|
|
|
|
it "handles unicode emoji" do
|
|
Fabricate(:watched_word, word: "🎃", action: WatchedWord.actions[:require_approval])
|
|
|
|
matches =
|
|
described_class.new("Halloween party! 🎃").word_matches_for_action?(:require_approval)
|
|
expect(matches[1]).to eq("🎃")
|
|
end
|
|
|
|
it "handles emoji skin tone" do
|
|
Fabricate(
|
|
:watched_word,
|
|
word: ":woman:t5:",
|
|
action: WatchedWord.actions[:require_approval],
|
|
)
|
|
|
|
matches =
|
|
described_class.new("To Infinity and beyond! 🚀 :woman:t5:").word_matches_for_action?(
|
|
:require_approval,
|
|
)
|
|
expect(matches[1]).to eq(":woman:t5:")
|
|
end
|
|
end
|
|
|
|
context "when word is a regular expression" do
|
|
before { SiteSetting.watched_words_regular_expressions = true }
|
|
|
|
it "supports regular expressions on word boundaries" do
|
|
Fabricate(:watched_word, word: /\btest\b/, action: WatchedWord.actions[:block])
|
|
|
|
matches = described_class.new("this is not a test.").word_matches_for_action?(:block)
|
|
expect(matches[0]).to eq("test")
|
|
end
|
|
|
|
it "supports regular expressions as a site setting" do
|
|
Fabricate(
|
|
:watched_word,
|
|
word: /tro[uo]+t/,
|
|
action: WatchedWord.actions[:require_approval],
|
|
)
|
|
|
|
matches =
|
|
described_class.new("Evil Trout is cool").word_matches_for_action?(:require_approval)
|
|
expect(matches[0]).to eq("Trout")
|
|
|
|
matches =
|
|
described_class.new("Evil Troot is cool").word_matches_for_action?(:require_approval)
|
|
expect(matches[0]).to eq("Troot")
|
|
|
|
matches = described_class.new("trooooooooot").word_matches_for_action?(:require_approval)
|
|
expect(matches[0]).to eq("trooooooooot")
|
|
end
|
|
|
|
it "support uppercase" do
|
|
Fabricate(:watched_word, word: /a\S+ce/, action: WatchedWord.actions[:require_approval])
|
|
|
|
matches = described_class.new("Amazing place").word_matches_for_action?(:require_approval)
|
|
expect(matches).to be_nil
|
|
|
|
matches =
|
|
described_class.new("Amazing applesauce").word_matches_for_action?(:require_approval)
|
|
expect(matches[0]).to eq("applesauce")
|
|
|
|
matches =
|
|
described_class.new("Amazing AppleSauce").word_matches_for_action?(:require_approval)
|
|
expect(matches[0]).to eq("AppleSauce")
|
|
end
|
|
end
|
|
|
|
context "when case sensitive words are present" do
|
|
before do
|
|
Fabricate(
|
|
:watched_word,
|
|
word: "Discourse",
|
|
action: WatchedWord.actions[:block],
|
|
case_sensitive: true,
|
|
)
|
|
end
|
|
|
|
context "when watched_words_regular_expressions = true" do
|
|
it "respects case sensitivity flag in matching words" do
|
|
SiteSetting.watched_words_regular_expressions = true
|
|
Fabricate(:watched_word, word: "p(rivate|ublic)", action: WatchedWord.actions[:block])
|
|
|
|
matches =
|
|
described_class.new(
|
|
"PUBLIC: Discourse is great for public discourse",
|
|
).word_matches_for_action?(:block, all_matches: true)
|
|
expect(matches).to contain_exactly("PUBLIC", "Discourse", "public")
|
|
end
|
|
end
|
|
|
|
context "when watched_words_regular_expressions = false" do
|
|
it "repects case sensitivity flag in matching" do
|
|
SiteSetting.watched_words_regular_expressions = false
|
|
Fabricate(:watched_word, word: "private", action: WatchedWord.actions[:block])
|
|
|
|
matches =
|
|
described_class.new(
|
|
"PRIVATE: Discourse is also great private discourse",
|
|
).word_matches_for_action?(:block, all_matches: true)
|
|
|
|
expect(matches).to contain_exactly("PRIVATE", "Discourse", "private")
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
describe "word replacement" do
|
|
fab!(:censored_word) do
|
|
Fabricate(:watched_word, word: "censored", action: WatchedWord.actions[:censor])
|
|
end
|
|
fab!(:replaced_word) do
|
|
Fabricate(
|
|
:watched_word,
|
|
word: "to replace",
|
|
replacement: "replaced",
|
|
action: WatchedWord.actions[:replace],
|
|
)
|
|
end
|
|
fab!(:link_word) do
|
|
Fabricate(
|
|
:watched_word,
|
|
word: "https://notdiscourse.org",
|
|
replacement: "https://discourse.org",
|
|
action: WatchedWord.actions[:link],
|
|
)
|
|
end
|
|
|
|
it "censors text" do
|
|
expect(described_class.censor_text("a censored word")).to eq(
|
|
"a #{described_class::REPLACEMENT_LETTER * 8} word",
|
|
)
|
|
end
|
|
|
|
it "replaces text" do
|
|
expect(described_class.replace_text("a word to replace meow")).to eq("a word replaced meow")
|
|
end
|
|
|
|
it "replaces links" do
|
|
expect(described_class.replace_link("please visit https://notdiscourse.org meow")).to eq(
|
|
"please visit https://discourse.org meow",
|
|
)
|
|
end
|
|
|
|
describe ".apply_to_text" do
|
|
it "replaces all types of words" do
|
|
text = "hello censored world to replace https://notdiscourse.org"
|
|
expected =
|
|
"hello #{described_class::REPLACEMENT_LETTER * 8} world replaced https://discourse.org"
|
|
expect(described_class.apply_to_text(text)).to eq(expected)
|
|
end
|
|
|
|
context "when watched_words_regular_expressions = true" do
|
|
it "replaces captured non-word prefix" do
|
|
SiteSetting.watched_words_regular_expressions = true
|
|
Fabricate(
|
|
:watched_word,
|
|
word: "\\Wplaceholder",
|
|
replacement: "replacement",
|
|
action: WatchedWord.actions[:replace],
|
|
)
|
|
|
|
text = "is \tplaceholder in https://notdiscourse.org"
|
|
expected = "is replacement in https://discourse.org"
|
|
expect(described_class.apply_to_text(text)).to eq(expected)
|
|
end
|
|
end
|
|
|
|
context "when watched_words_regular_expressions = false" do
|
|
it "maintains non-word character prefix" do
|
|
SiteSetting.watched_words_regular_expressions = false
|
|
|
|
text = "to replace and\thttps://notdiscourse.org"
|
|
expected = "replaced and\thttps://discourse.org"
|
|
expect(described_class.apply_to_text(text)).to eq(expected)
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|