FIX: Relevance search will now consider document length in ranking.

The default ranking option ranks by the number of matches, which is
highly problematic when posts are stuffed with a keyword. The rank
will now be divided by the document length, which is a much fairer way to
rank.
This commit is contained in:
Guo Xiang Tan 2019-04-01 13:40:11 +08:00
parent cadd1d670f
commit e87ca59401
2 changed files with 56 additions and 11 deletions

View File

@ -838,13 +838,14 @@ class Search
posts = posts.order("posts.like_count DESC")
end
else
# 0|32 default normalization scaled into the range zero to one
# 2|32 divides the rank by the document length and scales the range from
# zero to one
data_ranking = <<~SQL
(
TS_RANK_CD(
post_search_data.search_data,
#{ts_query(weight_filter: weights)},
0|32
2|32
) *
(
CASE categories.search_priority

View File

@ -334,6 +334,27 @@ describe Search do
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end
# Guards against keyword stuffing: repeating the search term many times in a
# post body must not be enough to push that post to the top of the results.
it 'does not allow a post with repeated words to dominate the ranking' do
category = Fabricate(:category, name: "winter is coming")
# "winter" appears once in the body and once in the topic title.
post = Fabricate(:post,
raw: "I think winter will end soon",
topic: Fabricate(:topic,
title: "dragon john snow winter",
category: category
)
)
# "winter" appears five times in the body but not in the topic title.
post2 = Fabricate(:post,
raw: "I think winter winter winter winter winter will end soon",
topic: Fabricate(:topic, title: "dragon john snow summer", category: category)
)
result = Search.execute('winter')
# `eq` against an array pins the exact order: the keyword-stuffed post2 must
# rank below post. The third entry is the first post of the category's own
# topic; presumably it matches because the category is named
# "winter is coming" -- confirm against the category fabricator.
expect(result.posts).to eq([post, post2, category.topic.first_post])
end
end
context 'searching for quoted title' do
@ -940,22 +961,45 @@ describe Search do
today = Date.today
yesterday = 1.day.ago
two_days_ago = 2.days.ago
category = Fabricate(:category)
old_topic = Fabricate(:topic,
title: 'First Topic, testing the created_at sort',
created_at: two_days_ago,
category: category
)
old_topic = Fabricate(:topic,
title: 'First Topic, testing the created_at sort',
created_at: two_days_ago)
latest_topic = Fabricate(:topic,
title: 'Second Topic, testing the created_at sort',
created_at: yesterday)
title: 'Second Topic, testing the created_at sort',
created_at: yesterday,
category: category
)
old_relevant_topic_post = Fabricate(:post, topic: old_topic, created_at: yesterday, raw: 'Relevant Topic')
latest_irelevant_topic_post = Fabricate(:post, topic: latest_topic, created_at: today, raw: 'Not Relevant')
old_relevant_topic_post = Fabricate(:post,
topic: old_topic,
created_at: yesterday,
raw: 'Relevant Relevant Topic'
)
latest_irelevant_topic_post = Fabricate(:post,
topic: latest_topic,
created_at: today,
raw: 'Not Relevant'
)
# Expecting the default results
expect(Search.execute('Topic').posts.map(&:id)).to eq([old_relevant_topic_post.id, latest_irelevant_topic_post.id])
expect(Search.execute('Topic').posts).to contain_exactly(
old_relevant_topic_post,
latest_irelevant_topic_post,
category.topic.first_post
)
# Expecting the ordered by topic creation results
expect(Search.execute('Topic order:latest_topic').posts.map(&:id)).to eq([latest_irelevant_topic_post.id, old_relevant_topic_post.id])
expect(Search.execute('Topic order:latest_topic').posts).to contain_exactly(
latest_irelevant_topic_post,
old_relevant_topic_post,
category.topic.first_post
)
end
it 'can tokenize dots' do