FIX: search index duplicate parser matching is too restrictive (#20129)

The previous regex did not allow for lexemes that contain a : (colon), which
can happen when parsing URLs. The new pattern handles this case.

The test was amended to call out the index problems more clearly.
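
For context, a minimal sketch of the failure (my illustration, not part of the commit), assuming a tsvector string serialized the way Postgres prints it, with lexemes single-quoted and interior quotes doubled:

tsvector = "'something.com/path:path''path':20 'sam':10,11"

old_pattern = /([^\:]+\:)(([0-9]+[A-D]?,?)+)/
new_pattern = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/

tsvector.scan(old_pattern).map(&:first)
# => ["path''path':", " 'sam':"]   (URL term truncated at its inner colon)

tsvector.scan(new_pattern).map(&:first)
# => ["'something.com/path:path''path':", "'sam':"]   (full quoted lexeme kept)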
Sam 2023-02-02 12:17:19 +11:00 committed by GitHub
parent 41f265ae46
commit 4570118a63
2 changed files with 14 additions and 7 deletions


@@ -54,7 +54,7 @@ class SearchIndexer
 loop do
   count += 1
   break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
-  term, _, remaining = lexeme.partition(".")
+  _term, _, remaining = lexeme.partition(".")
   break if remaining.blank?
   array << "'#{remaining}':#{positions}"
   lexeme = remaining
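
As a side note on the hunk above: the head returned by partition is never used, hence the _term rename. A hedged sketch of what the loop does (standalone Ruby; the real code uses Rails' blank? where this uses empty?): each pass drops everything up to the first dot, so a dotted term is also indexed under each of its suffixes.

lexeme = "discourse.something.com"
suffixes = []
count = 0
loop do
  count += 1
  break if count >= 10 # same safeguard against terms with many dots
  _term, _, remaining = lexeme.partition(".")
  break if remaining.empty?
  suffixes << remaining
  lexeme = remaining
end
suffixes # => ["something.com", "com"]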
@@ -69,8 +69,8 @@ class SearchIndexer
 if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
   reduced = []
   tsvector
-    .scan(/([^\:]+\:)(([0-9]+[A-D]?,?)+)/)
-    .each do |term, indexes|
+    .scan(/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/)
+    .each do |term, _, indexes|
       family_counts = Hash.new(0)
       new_index_array = []
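
The new pattern adds a capture group for the quoted lexeme's interior, which is why the block now takes a throwaway second argument. The body of the loop is elided here; as a rough sketch of the dedup idea (my own code under assumptions, not the commit's), each weight family (A-D, or unweighted) would be capped at max_dupes positions per term:

max_dupes = 2
indexes = "1A,2A,3A,7,8,9"
family_counts = Hash.new(0)
new_index_array =
  indexes.split(",").select do |index|
    family = index.end_with?("A", "B", "C", "D") ? index[-1] : "*"
    (family_counts[family] += 1) <= max_dupes
  end
new_index_array.join(",") # => "1A,2A,7,8"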


@@ -307,15 +307,22 @@ RSpec.describe SearchIndexer
   it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
     SiteSetting.max_duplicate_search_index_terms = 5
-    contents = "I am #{"sam " * 10}"
+    contents = <<~TEXT
+      #{"sam " * 10}
+      <a href="https://something.com/path:path'path?term='hello'">url</a>
+    TEXT
     post.update!(raw: contents)
     post_search_data = post.post_search_data
     post_search_data.reload
-    expect(post_search_data.search_data).to eq(
-      "'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
-    )
+    terms =
+      "'/path:path''path':22 'com':21 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23".split(
+        " ",
+      ).sort
+
+    expect(post_search_data.search_data.split(" ").sort).to contain_exactly(*terms)
   end
 end
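
A gloss of the expected terms (my annotation, not part of the commit):

# 'sam':10,11,12,13,14               - ten "sam"s capped at five positions by
#                                      max_duplicate_search_index_terms = 5
# 'something.com/path:path''path':20 - the full URL lexeme: the inner quote is
#                                      doubled and the inner colon now survives
#                                      the fixed scan regex
# 'something.com':21 'com':21        - the host token plus the dotted suffix
#                                      added by the partition loop
# '/path:path''path':22 'url':23     - the path token and the link text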