PERF: Use PG headlines for blurb generation and highlighting for search.

This commit is contained in:
Guo Xiang Tan 2020-08-06 14:15:31 +08:00
parent ba482c251c
commit 2193d02433
No known key found for this signature in database
GPG Key ID: FBD110179AAC1F20
5 changed files with 48 additions and 15 deletions

View File

@ -1792,6 +1792,9 @@ backups:
hidden: true
search:
use_pg_headlines_for_excerpt:
default: false
hidden: true
search_ranking_normalization:
default: "0"
hidden: true

View File

@ -2,6 +2,7 @@
class Search
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
HIGHLIGHT_CSS_CLASS = 'search-highlight'
cattr_accessor :preloaded_topic_custom_fields
self.preloaded_topic_custom_fields = Set.new
@ -726,12 +727,18 @@ class Search
def single_topic(id)
if @opts[:restrict_to_archetype].present?
archetype = @opts[:restrict_to_archetype] == Archetype.default ? Archetype.default : Archetype.private_message
post = Post.joins(:topic)
.where("topics.id = :id AND topics.archetype = :archetype AND posts.post_number = 1", id: id, archetype: archetype)
.first
post = posts_scope
.joins(:topic)
.find_by(
"topics.id = :id AND topics.archetype = :archetype AND posts.post_number = 1",
id: id,
archetype: archetype
)
else
post = Post.find_by(topic_id: id, post_number: 1)
post = posts_scope.find_by(topic_id: id, post_number: 1)
end
return nil unless @guardian.can_see?(post)
@results.add(post)
@ -1096,7 +1103,7 @@ class Search
# Loads the posts selected by the `post_sql` subquery.
#
# post_sql - SQL string embedded as the subquery `x`; its rows are joined to
#            posts on `x.id = posts.topic_id` and `x.post_number`.
#
# Returns [] when post_sql is nil/false, otherwise a posts relation ordered by
# `row_number`.
def aggregate_posts(post_sql)
return [] unless post_sql
# NOTE(review): the next two lines look like the before/after sides of a single
# changed line in a diff (the second wraps the first in posts_scope) — confirm
# which one is meant to remain.
posts_eager_loads(Post)
posts_scope(posts_eager_loads(Post))
.joins("JOIN (#{post_sql}) x ON x.id = posts.topic_id AND x.post_number = posts.post_number")
.order('row_number')
end
@ -1128,7 +1135,7 @@ class Search
def topic_search
if @search_context.is_a?(Topic)
posts = posts_eager_loads(posts_query(limit))
posts = posts_scope(posts_eager_loads(posts_query(limit)))
.where('posts.topic_id = ?', @search_context.id)
posts.each do |post|
@ -1150,4 +1157,17 @@ class Search
query.includes(topic: topic_eager_loads)
end
# Extends a posts relation so that every row also carries a `headline` column:
# a PostgreSQL TS_HEADLINE excerpt of the post's indexed search data
# (post_search_data.raw_data) with the search term wrapped in
# <span class="search-highlight"> tags.
#
# default_scope - the posts relation to extend (defaults to Post.all).
#
# Returns default_scope untouched when
# SiteSetting.use_pg_headlines_for_excerpt is disabled; otherwise the relation
# joined to post_search_data and selecting the headline in addition to the
# relation's existing projections.
def posts_scope(default_scope = Post.all)
  return default_scope unless SiteSetting.use_pg_headlines_for_excerpt

  # Use the same text search configuration for parsing the query and for
  # building the headline; without an explicit config PLAINTO_TSQUERY falls
  # back to the database's default_text_search_config, which may stem words
  # differently and leave matches un-highlighted.
  ts_config = default_ts_config

  # The term is interpolated into the SQL string, so it must be escaped.
  # A blank term interpolates as an empty string literal.
  search_term = @term.present? ? PG::Connection.escape_string(@term) : nil

  default_scope
    .joins("INNER JOIN post_search_data pd ON pd.post_id = posts.id")
    .select(
      "TS_HEADLINE(#{ts_config}, pd.raw_data, PLAINTO_TSQUERY(#{ts_config}, '#{search_term}'), 'ShortWord=0, MaxFragments=1, MinWords=50, MaxWords=51, StartSel=''<span class=\"#{HIGHLIGHT_CSS_CLASS}\">'', StopSel=''</span>''') AS headline",
      default_scope.arel.projections
    )
end
end

View File

@ -85,8 +85,12 @@ class Search
}
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false
if SiteSetting.use_pg_headlines_for_excerpt
return post.headline
else
opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false
end
else
opts[:cooked] = post.cooked
end

View File

@ -410,27 +410,31 @@ describe Search do
end
let(:expected_blurb) do
"...quire content longer than the typical test post raw content. It really is some long content, folks. elephant"
"hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks. <span class=\"search-highlight\">elephant</span>"
end
# With PG headlines enabled, a topic-type search for 'elephant' (blurbs
# included) should return only the reply, and its blurb should equal
# expected_blurb.
it 'returns the post' do
SiteSetting.use_pg_headlines_for_excerpt = true
result = Search.execute('elephant',
type_filter: 'topic',
include_blurbs: true
)
# NOTE(review): the two pairs of expectations below look like the before/after
# sides of a diff (record comparison vs. id comparison and blurb of the
# returned post) — confirm which pair is meant to remain.
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
expect(result.posts.map(&:id)).to contain_exactly(reply.id)
expect(result.blurb(result.posts.first)).to eq(expected_blurb)
end
# Same check as a plain term search, but with the term passed as a quoted
# phrase ('"elephant"'): only the reply is returned and its blurb equals
# expected_blurb.
it 'returns the right post and blurb for searches with phrase' do
SiteSetting.use_pg_headlines_for_excerpt = true
result = Search.execute('"elephant"',
type_filter: 'topic',
include_blurbs: true
)
# NOTE(review): the two pairs of expectations below look like the before/after
# sides of a diff — confirm which pair is meant to remain.
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
expect(result.posts.map(&:id)).to contain_exactly(reply.id)
expect(result.blurb(result.posts.first)).to eq(expected_blurb)
end
it 'applies a small penalty to closed topic when ranking' do

View File

@ -99,6 +99,8 @@ describe SearchController do
end
it "can search correctly" do
SiteSetting.use_pg_headlines_for_excerpt = true
get "/search/query.json", params: {
term: 'awesome'
}
@ -109,11 +111,11 @@ describe SearchController do
expect(data['posts'].length).to eq(2)
expect(data['posts'][0]['id']).to eq(awesome_post_2.id)
expect(data['posts'][0]['blurb']).to eq(awesome_post_2.raw)
expect(data['posts'][0]['blurb']).to eq("this is my really <span class=\"#{Search::HIGHLIGHT_CSS_CLASS}\">awesome</span> post")
expect(data['topics'][0]['id']).to eq(awesome_post_2.topic_id)
expect(data['posts'][1]['id']).to eq(awesome_post.id)
expect(data['posts'][1]['blurb']).to eq(awesome_post.raw)
expect(data['posts'][1]['blurb']).to eq("this is my really <span class=\"#{Search::HIGHLIGHT_CSS_CLASS}\">awesome</span> post")
expect(data['topics'][1]['id']).to eq(awesome_post.topic_id)
end