discourse/spec/services/search_indexer_spec.rb
Sam 07679888c8
FEATURE: allow restricting duplication in search index (#20062)
* FEATURE: allow restricting duplication in search index

This introduces the site setting `max_duplicate_search_index_terms`.
Using this number we limit the amount of duplication in our search index.

This allows us to more correctly weight title searches, so bloated posts
don't unfairly bump to the top of search results.

This feature is completely disabled by default and behind a site setting

We will experiment with it first. Note entire search index must be rebuilt
for it to take effect.


---------

Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>
2023-01-31 12:41:31 +11:00

352 lines
13 KiB
Ruby

# frozen_string_literal: true
RSpec.describe SearchIndexer do
let(:post_id) { 99 }
before { SearchIndexer.enable }
after { SearchIndexer.disable }
it "correctly indexes chinese" do
SiteSetting.default_locale = "zh_CN"
data = "你好世界"
SearchIndexer.update_posts_index(
post_id: post_id,
topic_title: "",
category_name: "",
topic_tags: "",
cooked: data,
private_message: false,
)
post_search_data = PostSearchData.find_by(post_id: post_id)
expect(post_search_data.raw_data).to eq("你好 世界")
expect(post_search_data.search_data).to eq("'世界':2 '你好':1")
end
it "extract youtube title" do
html =
"<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&amp;wmode=opaque\"></div>"
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq(
"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]",
)
end
it "extract a link" do
html = "<a href='http://meta.discourse.org/'>link</a>"
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("http://meta.discourse.org/ link")
end
it "ignores autogenerated link anchors" do
html = "<a class='anchor' href='#something-special'>something special</a>"
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("something special")
end
it "extracts @username from mentions" do
html =
'<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("@狮子 @foo")
end
it "extracts @groupname from group mentions" do
html =
'<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("@автомобилист")
end
it "extracts emoji name from emoji image" do
emoji = Emoji["wink"]
html =
%Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq(":wink:")
end
it "doesn't index local files" do
html = <<~HTML
<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
<p><img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png" alt="51%20PM" width="289" height="398"></p>
<div class="lightbox-wrapper">
<a class="lightbox" href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg" data-download-href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b" title="Untitled design (21).jpg" rel="nofollow noopener">
<img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg" alt="Untitled%20design%20(21)" width="563" height="500">
<div class="meta">
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use href="#far-image"></use></svg>
<span class="filename">Untitled design (21).jpg</span>
<span class="informations">1280x1136 472 KB</span>
<svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use href="#discourse-expand"></use></svg>
</div>
</a>
</div>
HTML
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
end
it "correctly indexes a post according to version" do
# Preparing so that they can be indexed to right version
SearchIndexer.update_posts_index(
post_id: post_id,
topic_title: "dummy",
category_name: "",
topic_tags: nil,
cooked: nil,
private_message: false,
)
PostSearchData.find_by(post_id: post_id).update!(version: -1)
data = "<a>This</a> is a test"
SearchIndexer.update_posts_index(
post_id: post_id,
topic_title: "",
category_name: "",
topic_tags: nil,
cooked: data,
private_message: false,
)
raw_data, locale, version =
PostSearchData.where(post_id: post_id).pluck(:raw_data, :locale, :version)[0]
expect(raw_data).to eq("This is a test")
expect(locale).to eq(SiteSetting.default_locale)
expect(version).to eq(SearchIndexer::POST_INDEX_VERSION)
end
describe ".index" do
let(:topic) { Fabricate(:topic, title: "this is a title that I am testing") }
let(:post) { Fabricate(:post, topic: topic) }
it "should index posts correctly" do
expect { post }.to change { PostSearchData.count }.by(1)
expect { post.update!(raw: "this is new content") }.to change {
post.reload.post_search_data.search_data
}
expect { post.update!(topic_id: Fabricate(:topic).id) }.to change {
post.reload.post_search_data.search_data
}
end
it "should work with invalid HTML" do
post.update!(cooked: "<FD>" * Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
SearchIndexer.update_posts_index(
post_id: post.id,
topic_title: post.topic.title,
category_name: post.topic.category&.name,
topic_tags: post.topic.tags.map(&:name).join(" "),
cooked: post.cooked,
private_message: post.topic.private_message?,
)
end
it "should not index posts with empty raw" do
expect do
post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])
post.save!(validate: false)
end.to_not change { PostSearchData.count }
end
it "should not tokenize urls and duplicate title and href in <a>" do
post.update!(raw: <<~RAW)
https://meta.discourse.org/some.png
RAW
post.rebake!
post.reload
expect(post.post_search_data.raw_data).to eq("https://meta.discourse.org/some.png")
expect(post.post_search_data.search_data).to eq(
"'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B",
)
end
it "should not tokenize versions" do
post.update!(raw: "123.223")
expect(post.post_search_data.search_data).to eq(
"'123.223':10 'test':8A 'titl':4A 'uncategor':9B",
)
post.update!(raw: "15.2.231.423")
post.reload
expect(post.post_search_data.search_data).to eq(
"'15.2.231.423':10 'test':8A 'titl':4A 'uncategor':9B",
)
end
it "should tokenize host of a URL and removes query string" do
category = Fabricate(:category, name: "awesome category")
topic = Fabricate(:topic, category: category, title: "this is a test topic")
post = Fabricate(:post, topic: topic, raw: <<~RAW)
a https://abc.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1
www.klm.net/?IGNORE=1 <a href="http://abc.de.nop.co.uk?IGNORE=1&ignore2=2">test</a>
RAW
post.rebake!
post.reload
topic = post.topic
# Note, a random non URL string should be tokenized properly,
# hence www.klm.net?IGNORE=1 it was inserted in autolinking.
# We could consider amending the auto linker to add
# more context to say "hey, this part of <a href>...</a> was a guess by autolinker.
# A blanket treating of non-urls without this logic is risky.
expect(post.post_search_data.raw_data).to eq(
"a https://abc.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test",
)
expect(post.post_search_data.search_data).to eq(
"'/?ignore=1':21 '/xyz=1':14,17 'abc.com':9 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19",
)
end
it "should not include lightbox in search" do
Jobs.run_immediately!
SiteSetting.max_image_width = 1
stub_request(:get, "https://meta.discourse.org/some.png").to_return(
status: 200,
body: file_from_fixtures("logo.png").read,
)
src = "https://meta.discourse.org/some.png"
post = Fabricate(:post, raw: <<~RAW)
Let me see how I can fix this image
<img src="#{src}" title="GOT" alt="white walkers" width="2" height="2">
RAW
post.rebake!
post.reload
expect(post.cooked).not_to include(CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS)
expect(post.post_search_data.raw_data).to eq(
"Let me see how I can fix this image white walkers GOT",
)
end
it "should strips audio and videos URLs from raw data" do
SiteSetting.authorized_extensions = "mp4"
Fabricate(:video_upload)
post.update!(raw: <<~RAW)
link to an external page: https://google.com/?u=bar
link to an audio file: https://somesite.com/audio.m4a
link to a video file: https://somesite.com/content/somethingelse.MOV
link to an invalid URL: http:error]
RAW
expect(post.post_search_data.raw_data).to eq(
"link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]",
)
expect(post.post_search_data.search_data).to eq(
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27",
)
end
it "should unaccent indexed content" do
SiteSetting.search_ignore_accents = true
post.update!(raw: "Cette oeuvre d'art n'est pas une œuvre")
post.post_search_data.reload
expect(post.post_search_data.search_data).not_to include("œuvr")
expect(post.post_search_data.search_data).to include("oeuvr")
SiteSetting.search_ignore_accents = false
SearchIndexer.index(post, force: true)
post.post_search_data.reload
expect(post.post_search_data.search_data).to include("œuvr")
expect(post.post_search_data.search_data).to include("oeuvr")
end
it "truncates long words in the index" do
SiteSetting.search_max_indexed_word_length = 4
title = "A title that is long enough"
contents = "I am the best beige object http://example.com/long/url"
topic.update!(title: title)
post.update!(raw: contents)
post_search_data = post.post_search_data
post_search_data.reload
expect(post_search_data.raw_data).to eq(contents)
words = post_search_data.search_data.scan(/'([^']*)'/).map { |match| match[0] }
expect(words).to contain_exactly(
"best",
"beig",
"obj",
"http",
"titl",
"long",
"enou",
"unca",
)
end
it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
SiteSetting.max_duplicate_search_index_terms = 5
contents = "I am #{"sam " * 10}"
post.update!(raw: contents)
post_search_data = post.post_search_data
post_search_data.reload
expect(post_search_data.search_data).to eq(
"'sam':12,13,14,15,16 'test':8A 'titl':4A 'uncategor':9B",
)
end
end
describe ".queue_post_reindex" do
let(:post) { Fabricate(:post) }
let(:topic) { post.topic }
it "should reset the version of search data for all posts in the topic" do
post2 = Fabricate(:post)
SearchIndexer.queue_post_reindex(topic.id)
expect(post.reload.post_search_data.version).to eq(SearchIndexer::REINDEX_VERSION)
expect(post2.reload.post_search_data.version).to eq(SearchIndexer::POST_INDEX_VERSION)
end
end
describe ".queue_users_reindex" do
let!(:user) { Fabricate(:user) }
let!(:user2) { Fabricate(:user) }
it "should reset the version of search data for all users" do
SearchIndexer.index(user, force: true)
SearchIndexer.index(user2, force: true)
SearchIndexer.queue_users_reindex([user.id])
expect(user.reload.user_search_data.version).to eq(SearchIndexer::REINDEX_VERSION)
expect(user2.reload.user_search_data.version).to eq(SearchIndexer::USER_INDEX_VERSION)
end
end
end