FIX: Limit max word length in search index (#16380)

Long words bloat the index for little benefit.
This commit is contained in:
Daniel Waterworth 2022-04-06 12:23:30 -05:00 committed by GitHub
parent 69dbc207ab
commit 6e9a068e44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 12 deletions

View File

@ -18,12 +18,25 @@ class SearchIndexer
end
def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
raw_data = [a_weight, b_weight, c_weight, d_weight]
raw_data = {
a: a_weight,
b: b_weight,
c: c_weight,
d: d_weight,
}
search_data = raw_data.map do |data|
# The version used in excerpts
search_data = raw_data.transform_values do |data|
Search.prepare_data(data || "", :index)
end
# The version used to build the index
indexed_data = search_data.transform_values do |data|
data.gsub(/\S+/) { |word|
word[0...SiteSetting.search_max_indexed_word_length]
}
end
table_name = "#{table}_search_data"
foreign_key = "#{table}_id"
@ -37,14 +50,7 @@ class SearchIndexer
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
SQL
ranked_params = {
a: search_data[0],
b: search_data[1],
c: search_data[2],
d: search_data[3],
}
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
additional_lexemes = []
tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)|
@ -68,9 +74,9 @@ class SearchIndexer
indexed_data =
if table.to_s == "post"
clean_post_raw_data!(ranked_params[:d])
clean_post_raw_data!(search_data[:d])
else
search_data.select { |d| d.length > 0 }.join(' ')
search_data.values.select { |d| d.length > 0 }.join(' ')
end
params = {

View File

@ -2028,6 +2028,9 @@ search:
default: false
hidden: true
client: true
# Maximum length, in characters, of a word in the search index; SearchIndexer
# truncates longer words to this length before building the index.
search_max_indexed_word_length:
default: 100
hidden: true
search_ranking_normalization:
default: "0"
hidden: true

View File

@ -269,6 +269,22 @@ describe SearchIndexer do
expect(post.post_search_data.search_data).to include('œuvr')
expect(post.post_search_data.search_data).to include('oeuvr')
end
# Verifies that words longer than SiteSetting.search_max_indexed_word_length
# are cut down to that length in the stored search index, while the raw text
# kept alongside the index is left untouched.
it 'truncates long words in the index' do
# Use a very small limit so that ordinary words in the fixture get truncated.
SiteSetting.search_max_indexed_word_length = 4
title = 'A title that is long enough'
# The URL contains no whitespace, so it counts as one long "word" here.
contents = 'I am the best beige object http://example.com/long/url'
topic.update!(title: title)
post.update!(raw: contents)
post_search_data = post.post_search_data
post_search_data.reload
# The raw data must still hold the full, untruncated post text.
expect(post_search_data.raw_data).to eq(contents)
# Pull every single-quoted token out of the stored search_data.
words = post_search_data.search_data.scan(/'([^']*)'/).map { |match| match[0] }
# No expected entry is longer than 4 characters; the whole URL collapses to 'http'.
# NOTE(review): shorter forms such as 'obj' and the absence of words like 'the'
# presumably come from stemming / stop-word removal, and 'unca' presumably from
# the fixture topic's category name - confirm against the indexer and fixtures.
expect(words).to contain_exactly('best', 'beig', 'obj', 'http', 'titl', 'long', 'enou', 'unca')
end
end
describe '.queue_post_reindex' do