mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 08:49:06 +08:00
FEATURE: Roll out new search optimisations (#20364)
- Reduce duplication of terms in the post index from unlimited to 6. This will result in a smaller index and reduced weighting for posts containing a huge number of duplicate terms. (E.g. a post containing "sam sam sam sam sam sam sam sam" will be indexed as "sam sam sam sam sam sam", including the word at most 6 times.) This corrects a flaw where title weighting could be ignored.
- Prioritize exact matches of words in titles. Our search always performs a prefix match. However, we want to give special weight to exact title matches, meaning that a search for "sum" will favor topics such as "the sum of us" over "summer in spring".
- Pick up fixes to our search algorithm which are missing from old indexes. Specifically, pick up the fix that indexes URLs properly. (`https://happy.com` was stemmed to `happi` in keywords and was then not searchable.)

See also: https://meta.discourse.org/t/refinements-to-search-being-tested-on-meta/254158

Reindexing will take a while and will run in batches, in the background.
This commit is contained in:
parent
3c57db5c63
commit
cd247d5322
|
@ -1,12 +1,15 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class SearchIndexer
|
||||
POST_INDEX_VERSION = 4
|
||||
MIN_POST_REINDEX_VERSION = 3
|
||||
TOPIC_INDEX_VERSION = 3
|
||||
MIN_POST_BLURB_INDEX_VERSION = 4
|
||||
|
||||
POST_INDEX_VERSION = 5
|
||||
TOPIC_INDEX_VERSION = 4
|
||||
CATEGORY_INDEX_VERSION = 3
|
||||
USER_INDEX_VERSION = 3
|
||||
TAG_INDEX_VERSION = 3
|
||||
|
||||
# version to apply when issuing a background reindex
|
||||
REINDEX_VERSION = 0
|
||||
TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/
|
||||
|
||||
|
|
|
@ -2191,10 +2191,10 @@ search:
|
|||
default: false
|
||||
hidden: true
|
||||
prioritize_exact_search_title_match:
|
||||
default: false
|
||||
default: true
|
||||
hidden: true
|
||||
max_duplicate_search_index_terms:
|
||||
default: -1
|
||||
default: 6
|
||||
hidden: true
|
||||
use_pg_headlines_for_excerpt:
|
||||
default: false
|
||||
|
|
|
@ -71,7 +71,7 @@ class Search
|
|||
def blurb(post)
|
||||
opts = { term: @blurb_term, blurb_length: @blurb_length }
|
||||
|
||||
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION &&
|
||||
if post.post_search_data.version >= SearchIndexer::MIN_POST_BLURB_INDEX_VERSION &&
|
||||
!Search.segment_chinese? && !Search.segment_japanese?
|
||||
if SiteSetting.use_pg_headlines_for_excerpt
|
||||
scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
|
||||
|
|
|
@ -70,7 +70,7 @@ RSpec.describe Jobs::ReindexSearch do
|
|||
end
|
||||
|
||||
it "should not reindex posts with a developmental version" do
|
||||
post = Fabricate(:post, version: SearchIndexer::MIN_POST_REINDEX_VERSION + 1)
|
||||
Fabricate(:post, version: SearchIndexer::POST_INDEX_VERSION + 1)
|
||||
|
||||
subject.rebuild_posts(indexer: FakeIndexer)
|
||||
|
||||
|
|
|
@ -122,6 +122,8 @@ RSpec.describe Search do
|
|||
|
||||
before do
|
||||
SearchIndexer.enable
|
||||
SiteSetting.max_duplicate_search_index_terms = -1
|
||||
SiteSetting.prioritize_exact_search_title_match = false
|
||||
[post1, post2].each { |post| SearchIndexer.index(post, force: true) }
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user