From 07679888c85414d99c0c69040d20f014619e0b85 Mon Sep 17 00:00:00 2001 From: Sam Date: Tue, 31 Jan 2023 12:41:31 +1100 Subject: [PATCH] FEATURE: allow restricting duplication in search index (#20062) * FEATURE: allow restricting duplication in search index This introduces the site setting `max_duplicate_search_index_terms`. Using this number we limit the amount of duplication in our search index. This allows us to more correctly weight title searches, so bloated posts don't unfairly bump to the top of search results. This feature is completely disabled by default and behind a site setting. We will experiment with it first. Note: the entire search index must be rebuilt for it to take effect. --------- Co-authored-by: Alan Guo Xiang Tan --- app/services/search_indexer.rb | 22 ++++++++++++++++++++ config/site_settings.yml | 3 +++ spec/lib/search_spec.rb | 30 ++++++++++++++++++++++++++++ spec/services/search_indexer_spec.rb | 14 +++++++++++++ 4 files changed, 69 insertions(+) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index ea1c1b8e89c..226b8a2843f 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -66,6 +66,28 @@ class SearchIndexer tsvector = "#{tsvector} #{additional_lexemes.join(" ")}" + if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0 + reduced = [] + tsvector + .scan(/([^\:]+\:)(([0-9]+[A-D]?,?)+)/) + .each do |term, indexes| + family_counts = Hash.new(0) + new_index_array = [] + + indexes + .split(",") + .each do |index| + family = nil + family = index[-1] if index[-1].match?(/[A-D]/) + if (family_counts[family] += 1) <= max_dupes + new_index_array << index + end + end + reduced << "#{term.strip}#{new_index_array.join(",")}" + end + tsvector = reduced.join(" ") + end + indexed_data = if table.to_s == "post" clean_post_raw_data!(search_data[:d]) diff --git a/config/site_settings.yml b/config/site_settings.yml index 4d40276b7af..5b5acafaf29 100644 --- a/config/site_settings.yml +++ 
b/config/site_settings.yml @@ -2192,6 +2192,9 @@ backups: client: true search: + max_duplicate_search_index_terms: + default: -1 + hidden: true use_pg_headlines_for_excerpt: default: false hidden: true diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb index fdc91daad44..05c1f9b9870 100644 --- a/spec/lib/search_spec.rb +++ b/spec/lib/search_spec.rb @@ -2602,4 +2602,34 @@ RSpec.describe Search do expect(result.categories.length).to eq(0) end end + + context "when max_duplicate_search_index_terms limits duplication" do + before { SearchIndexer.enable } + + after { SearchIndexer.disable } + + it "correctly ranks topics" do + SiteSetting.max_duplicate_search_index_terms = 5 + + topic1 = Fabricate(:topic, title: "this is a topic about sam") + post1 = Fabricate(:post, topic: topic1, raw: "this topic is a story about some person") + + topic2 = Fabricate(:topic, title: "this is a topic about bob") + post2 = + Fabricate( + :post, + topic: topic2, + raw: "this topic is a story about some person #{"sam " * 100}", + ) + + SearchIndexer.index(post1, force: true) + SearchIndexer.index(post2, force: true) + + result = Search.execute("sam") + expect(result.posts.length).to eq(2) + + # title match should win cause we limited duplication + expect(result.posts.pluck(:id)).to eq([post1.id, post2.id]) + end + end end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index de52cbb7228..1d892607f14 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -303,6 +303,20 @@ RSpec.describe SearchIndexer do "unca", ) end + + it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do + SiteSetting.max_duplicate_search_index_terms = 5 + + contents = "I am #{"sam " * 10}" + post.update!(raw: contents) + + post_search_data = post.post_search_data + post_search_data.reload + + expect(post_search_data.search_data).to eq( + "'sam':12,13,14,15,16 'test':8A 
'titl':4A 'uncategor':9B", + ) + end end describe ".queue_post_reindex" do