diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index ea1c1b8e89c..226b8a2843f 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -66,6 +66,33 @@ class SearchIndexer
 
     tsvector = "#{tsvector} #{additional_lexemes.join(" ")}"
 
+    # Cap how many positions each lexeme keeps, per weight family, so a term
+    # repeated many times in a post body cannot dominate ranking.
+    if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
+      reduced = []
+      # Match a whole quoted lexeme (quotes inside it are doubled) before its
+      # position list, so lexemes that themselves contain ":" stay intact.
+      tsvector
+        .scan(/('(?:[^']|'')*'):((?:[0-9]+[A-D]?,?)+)/)
+        .each do |term, indexes|
+          family_counts = Hash.new(0)
+          new_index_array = []
+
+          indexes
+            .split(",")
+            .each do |index|
+              # unsuffixed positions (default weight) are counted under nil
+              family = nil
+              family = index[-1] if index[-1].match?(/[A-D]/)
+              if (family_counts[family] += 1) <= max_dupes
+                new_index_array << index
+              end
+            end
+          reduced << "#{term}:#{new_index_array.join(",")}"
+        end
+      tsvector = reduced.join(" ")
+    end
+
     indexed_data =
       if table.to_s == "post"
         clean_post_raw_data!(search_data[:d])
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 4d40276b7af..5b5acafaf29 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -2192,6 +2192,9 @@ backups:
     client: true
 
 search:
+  max_duplicate_search_index_terms:
+    default: -1
+    hidden: true
   use_pg_headlines_for_excerpt:
     default: false
     hidden: true
diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb
index fdc91daad44..05c1f9b9870 100644
--- a/spec/lib/search_spec.rb
+++ b/spec/lib/search_spec.rb
@@ -2602,4 +2602,34 @@ RSpec.describe Search do
       expect(result.categories.length).to eq(0)
     end
   end
+
+  context "when max_duplicate_search_index_terms limits duplication" do
+    before { SearchIndexer.enable }
+
+    after { SearchIndexer.disable }
+
+    it "correctly ranks topics" do
+      SiteSetting.max_duplicate_search_index_terms = 5
+
+      topic1 = Fabricate(:topic, title: "this is a topic about sam")
+      post1 = Fabricate(:post, topic: topic1, raw: "this topic is a story about some person")
+
+      topic2 = Fabricate(:topic, title: "this is a topic about bob")
+      post2 =
+        Fabricate(
+          :post,
+          topic: topic2,
+          raw: "this topic is a story about some person #{"sam " * 100}",
+        )
+
+      SearchIndexer.index(post1, force: true)
+      SearchIndexer.index(post2, force: true)
+
+      result = Search.execute("sam")
+      expect(result.posts.length).to eq(2)
+
+      # title match should win because we limited duplication
+      expect(result.posts.pluck(:id)).to eq([post1.id, post2.id])
+    end
+  end
 end
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index de52cbb7228..1d892607f14 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -303,6 +303,20 @@ RSpec.describe SearchIndexer do
         "unca",
       )
     end
+
+    it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
+      SiteSetting.max_duplicate_search_index_terms = 5
+
+      contents = "I am #{"sam " * 10}"
+      post.update!(raw: contents)
+
+      post_search_data = post.post_search_data
+      post_search_data.reload
+
+      expect(post_search_data.search_data).to eq(
+        "'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
+      )
+    end
   end
 
   describe ".queue_post_reindex" do