Mirror of https://github.com/discourse/discourse.git, synced 2024-11-23 02:19:27 +08:00
FIX: Limit max word length in search index (#16380)
Long words bloat the index for little benefit.
parent 69dbc207ab
commit 6e9a068e44
@@ -18,12 +18,25 @@ class SearchIndexer
   end
 
   def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
-    raw_data = [a_weight, b_weight, c_weight, d_weight]
+    raw_data = {
+      a: a_weight,
+      b: b_weight,
+      c: c_weight,
+      d: d_weight,
+    }
 
-    search_data = raw_data.map do |data|
+    # The version used in excerpts
+    search_data = raw_data.transform_values do |data|
       Search.prepare_data(data || "", :index)
     end
 
+    # The version used to build the index
+    indexed_data = search_data.transform_values do |data|
+      data.gsub(/\S+/) { |word|
+        word[0...SiteSetting.search_max_indexed_word_length]
+      }
+    end
+
     table_name = "#{table}_search_data"
     foreign_key = "#{table}_id"
 
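To make the new indexed_data step concrete, here is a standalone sketch of the per-word truncation on sample input. MAX_LEN and the sample hash are illustrative stand-ins for the search_max_indexed_word_length setting and the weighted fields, and the Search.prepare_data step is omitted:

# Illustration only; MAX_LEN stands in for
# SiteSetting.search_max_indexed_word_length (new default: 100).
MAX_LEN = 10

search_data = {
  a: "A reasonably short title",
  d: "body with averyveryverylongtokenthatbloatstheindex and a URL http://example.com/some/very/long/path",
}

# Same shape as the diff above: cap every whitespace-delimited token.
indexed_data = search_data.transform_values do |data|
  data.gsub(/\S+/) { |word| word[0...MAX_LEN] }
end

indexed_data[:d]
# => "body with averyveryv and a URL http://exa"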
@@ -37,14 +50,7 @@ class SearchIndexer
       setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
     SQL
 
-    ranked_params = {
-      a: search_data[0],
-      b: search_data[1],
-      c: search_data[2],
-      d: search_data[3],
-    }
-
-    tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
+    tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
     additional_lexemes = []
 
     tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)|
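For context, the scan on the last line above collects dotted lexemes (hostnames, file names) so they can be indexed as extra terms. A rough sketch of what that regex captures, run against a hand-written tsvector string rather than a real query result:

# Hand-written sample; in the code above the tsvector comes back from
# the DB.query_single call.
tsvector = "'discours':2 'meta.discourse.org':4 'search':7"

tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/)
# Each match is [full dotted lexeme, last "segment." group, positions]:
# => [["meta.discourse.org", "discourse.", "4"]]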
@@ -68,9 +74,9 @@ class SearchIndexer
 
     indexed_data =
       if table.to_s == "post"
-        clean_post_raw_data!(ranked_params[:d])
+        clean_post_raw_data!(search_data[:d])
       else
-        search_data.select { |d| d.length > 0 }.join(' ')
+        search_data.values.select { |d| d.length > 0 }.join(' ')
       end
 
     params = {
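The added .values in the else branch follows from search_data changing from an Array to a Hash earlier in the method; a quick sketch of the difference, with illustrative values:

search_data = { a: "title here", b: "", c: "", d: "body here" }

# Hash#select keeps key/value pairs and returns a Hash, which has no #join,
# so the pre-change expression no longer works once search_data is a Hash:
# search_data.select { |d| d.length > 0 }.join(' ')   # NoMethodError

# Selecting over the values restores the old Array behaviour:
search_data.values.select { |d| d.length > 0 }.join(' ')
# => "title here body here"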
@@ -2028,6 +2028,9 @@ search:
     default: false
     hidden: true
     client: true
+  search_max_indexed_word_length:
+    default: 100
+    hidden: true
+  search_ranking_normalization:
     default: "0"
     hidden: true
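The new search_max_indexed_word_length entry is a hidden site setting, so it is normally inspected or overridden from a Rails console rather than the admin settings UI. A small sketch (100 is the default added above; 50 is an arbitrary example override):

# Rails console sketch.
SiteSetting.search_max_indexed_word_length        # => 100
SiteSetting.search_max_indexed_word_length = 50   # tighten the cap site-wide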
@@ -269,6 +269,22 @@ describe SearchIndexer do
       expect(post.post_search_data.search_data).to include('œuvr')
       expect(post.post_search_data.search_data).to include('oeuvr')
     end
+
+    it 'truncates long words in the index' do
+      SiteSetting.search_max_indexed_word_length = 4
+      title = 'A title that is long enough'
+      contents = 'I am the best beige object http://example.com/long/url'
+
+      topic.update!(title: title)
+      post.update!(raw: contents)
+      post_search_data = post.post_search_data
+      post_search_data.reload
+
+      expect(post_search_data.raw_data).to eq(contents)
+
+      words = post_search_data.search_data.scan(/'([^']*)'/).map { |match| match[0] }
+      expect(words).to contain_exactly('best', 'beig', 'obj', 'http', 'titl', 'long', 'enou', 'unca')
+    end
   end
 
   describe '.queue_post_reindex' do
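The word-extraction line in the new spec simply pulls every quoted lexeme out of the stored tsvector string; a standalone sketch with a hand-written value (the real string is produced by PostgreSQL):

# Hand-written stand-in for post_search_data.search_data.
search_data = "'best':6 'beig':7 'obj':8 'http':9"

search_data.scan(/'([^']*)'/).map { |match| match[0] }
# => ["best", "beig", "obj", "http"]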