FIX: Limit max word length in search index (#16380)

Long words bloat the index for little benefit.
2024-11-23 02:19:27 +08:00 · 2022-04-06 12:23:30 -05:00 · 2022-04-06 12:23:30 -05:00 · 6e9a068e44
commit 6e9a068e44
parent 69dbc207ab
3 changed files with 37 additions and 12 deletions
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -18,12 +18,25 @@ class SearchIndexer
  end

  def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
-    raw_data = [a_weight, b_weight, c_weight, d_weight]
+    raw_data = {
+      a: a_weight,
+      b: b_weight,
+      c: c_weight,
+      d: d_weight,
+    }

-    search_data = raw_data.map do |data|
+    # The version used in excerpts
+    search_data = raw_data.transform_values do |data|
      Search.prepare_data(data || "", :index)
    end

+    # The version used to build the index
+    indexed_data = search_data.transform_values do |data|
+      data.gsub(/\S+/) { |word|
+        word[0...SiteSetting.search_max_indexed_word_length]
+      }
+    end
+
    table_name = "#{table}_search_data"
    foreign_key = "#{table}_id"

@@ -37,14 +50,7 @@ class SearchIndexer
      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
    SQL

-    ranked_params = {
-      a: search_data[0],
-      b: search_data[1],
-      c: search_data[2],
-      d: search_data[3],
-    }
-
-    tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
+    tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
    additional_lexemes = []

    tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)|
@@ -68,9 +74,9 @@ class SearchIndexer

    indexed_data =
      if table.to_s == "post"
-        clean_post_raw_data!(ranked_params[:d])
+        clean_post_raw_data!(search_data[:d])
      else
-        search_data.select { |d| d.length > 0 }.join(' ')
+        search_data.values.select { |d| d.length > 0 }.join(' ')
      end

    params = {
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -2028,6 +2028,9 @@ search:
    default: false
    hidden: true
    client: true
+  search_max_indexed_word_length:
+    default: 100
+    hidden: true
  search_ranking_normalization:
    default: "0"
    hidden: true
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -269,6 +269,22 @@ describe SearchIndexer do
      expect(post.post_search_data.search_data).to include('œuvr')
      expect(post.post_search_data.search_data).to include('oeuvr')
    end
+
+    it 'truncates long words in the index' do
+      SiteSetting.search_max_indexed_word_length = 4
+      title = 'A title that is long enough'
+      contents = 'I am the best beige object http://example.com/long/url'
+
+      topic.update!(title: title)
+      post.update!(raw: contents)
+      post_search_data = post.post_search_data
+      post_search_data.reload
+
+      expect(post_search_data.raw_data).to eq(contents)
+
+      words = post_search_data.search_data.scan(/'([^']*)'/).map { |match| match[0] }
+      expect(words).to contain_exactly('best', 'beig', 'obj', 'http', 'titl', 'long', 'enou', 'unca')
+    end
  end

  describe '.queue_post_reindex' do