# frozen_string_literal: true

# Spec for SearchIndexer: covers HtmlScrubber extraction, post index updates,
# versioning, and reindex queueing.
#
# NOTE(review): this file's line structure had been collapsed (everything after
# the magic comment sat on the comment line, making the file inert) and the
# HTML markup inside several string fixtures was stripped. Formatting is
# restored below; the visible string contents are preserved byte-for-byte.
# Fixtures whose markup was stripped are flagged inline — restore them from
# version control.

RSpec.describe SearchIndexer do
  let(:post_id) { 99 }

  before { SearchIndexer.enable }
  after { SearchIndexer.disable }

  it "correctly indexes chinese" do
    SiteSetting.default_locale = "zh_CN"
    data = "你好世界"

    SearchIndexer.update_posts_index(
      post_id: post_id,
      topic_title: "",
      category_name: "",
      topic_tags: "",
      cooked: data,
      private_message: false,
    )

    post_search_data = PostSearchData.find_by(post_id: post_id)
    expect(post_search_data.raw_data).to eq("你好 世界")
    expect(post_search_data.search_data).to eq("'世界':2 '你好':1")
  end

  it "extract youtube title" do
    # NOTE(review): fixture markup stripped — this should contain the lazy-YT
    # markup carrying the data-youtube-title asserted below; restore from VCS.
    html = "
"
    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq(
      "Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]",
    )
  end

  it "extract a link" do
    # NOTE(review): fixture markup stripped — presumably an anchor with
    # href "http://meta.discourse.org/" around the text "link"; restore from VCS.
    html = "link"
    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq("http://meta.discourse.org/ link")
  end

  it "ignores autogenerated link anchors" do
    # NOTE(review): fixture markup stripped — should contain an auto-generated
    # anchor the scrubber must ignore; restore from VCS.
    html = "something special"
    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq("something special")
  end

  it "extracts @username from mentions" do
    # NOTE(review): fixture markup stripped — the mentions were presumably
    # wrapped in mention anchors; only the text content survives. Restore from VCS.
    # (The string's embedded newlines/content are preserved exactly, hence the
    # unindented continuation lines.)
    html = '

@狮子 @foo

'
    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq("@狮子 @foo")
  end

  it "extracts @groupname from group mentions" do
    # NOTE(review): fixture markup stripped — presumably a group-mention anchor;
    # restore from VCS. String content preserved exactly.
    html = '

@автомобилист

'
    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq("@автомобилист")
  end

  it "extracts emoji name from emoji image" do
    emoji = Emoji["wink"]
    # NOTE(review): fixture markup stripped — originally an emoji <img> built
    # from `emoji` (which is otherwise unused here); restore from VCS.
    html = %Q|\":wink:\"|
    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq(":wink:")
  end

  it "doesn't index local files" do
    # NOTE(review): heredoc fixture markup stripped — the expectation below
    # still references "Untitled%20design%20(21)", which must come from a
    # stripped attribute; restore the original markup from VCS.
    html = <<~HTML

      Discourse

      51%20PM

    HTML

    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
  end

  it "correctly indexes a post according to version" do
    # Preparing so that they can be indexed to right version
    SearchIndexer.update_posts_index(
      post_id: post_id,
      topic_title: "dummy",
      category_name: "",
      topic_tags: nil,
      cooked: nil,
      private_message: false,
    )

    PostSearchData.find_by(post_id: post_id).update!(version: -1)

    data = "This is a test"

    SearchIndexer.update_posts_index(
      post_id: post_id,
      topic_title: "",
      category_name: "",
      topic_tags: nil,
      cooked: data,
      private_message: false,
    )

    raw_data, locale, version =
      PostSearchData.where(post_id: post_id).pluck(:raw_data, :locale, :version)[0]

    expect(raw_data).to eq("This is a test")
    expect(locale).to eq(SiteSetting.default_locale)
    expect(version).to eq(SearchIndexer::POST_INDEX_VERSION)
  end

  describe ".index" do
    let(:topic) { Fabricate(:topic, title: "this is a title that I am testing") }
    let(:post) { Fabricate(:post, topic: topic) }

    it "should index posts correctly" do
      expect { post }.to change { PostSearchData.count }.by(1)

      expect { post.update!(raw: "this is new content") }.to change {
        post.reload.post_search_data.search_data
      }

      expect { post.update!(topic_id: Fabricate(:topic).id) }.to change {
        post.reload.post_search_data.search_data
      }
    end

    it "should work with edge case domain names" do
      # 00E5A4 stems to 00e5 and a4, which is odd, but by-design
      # this may cause internal indexing to fail due to indexes not aligning
      # when stuffing terms for domains
      # NOTE(review): fixture markup stripped — cooked presumably wrapped this
      # hostname-like text in markup; restore from VCS.
      post.update!(cooked: <<~HTML)
        Test.00E5A4.1
      HTML

      SearchIndexer.update_posts_index(
        post_id: post.id,
        topic_title: post.topic.title,
        category_name: post.topic.category&.name,
        topic_tags: post.topic.tags.map(&:name).join(" "),
        cooked: post.cooked,
        private_message: post.topic.private_message?,
      )
    end

    it "should work with invalid HTML" do
      # NOTE(review): the repeated tag literal was stripped to "" — as written
      # this cooks an empty string and no longer exercises deep tree nesting;
      # restore the original tag string from VCS.
      post.update!(cooked: "" * Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)

      SearchIndexer.update_posts_index(
        post_id: post.id,
        topic_title: post.topic.title,
        category_name: post.topic.category&.name,
        topic_tags: post.topic.tags.map(&:name).join(" "),
        cooked: post.cooked,
        private_message: post.topic.private_message?,
      )
    end

    it "should not index posts with empty raw" do
      expect do
        post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])
        post.save!(validate: false)
      end.to_not change { PostSearchData.count }
    end

    it "should not tokenize urls and duplicate title and href in " do
      # NOTE(review): both the test description and this raw fixture appear to
      # have had anchor markup stripped; restore from VCS.
      post.update!(raw: <<~RAW)
        https://meta.discourse.org/some.png
      RAW

      post.rebake!
      post.reload

      expect(post.post_search_data.raw_data).to eq("https://meta.discourse.org/some.png")

      expect(post.post_search_data.search_data).to eq_ts_vector(
        "'/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B 'meta':11 'discours':11",
      )
    end

    it "should not tokenize versions" do
      post.update!(raw: "123.223")

      expect(post.post_search_data.search_data).to eq(
        "'123.223':10 'test':8A 'titl':4A 'uncategor':9B",
      )

      post.update!(raw: "15.2.231.423")
      post.reload

      expect(post.post_search_data.search_data).to eq(
        "'15.2.231.423':10 'test':8A 'titl':4A 'uncategor':9B",
      )
    end

    it "should tokenize host of a URL and removes query string" do
      category = Fabricate(:category, name: "awesome category")
      topic = Fabricate(:topic, category: category, title: "this is a test topic")

      # NOTE(review): fixture markup stripped — the expected raw_data below
      # contains "http://abc.de.nop.co.uk", which is absent from this visible
      # raw, so an anchor was presumably removed; restore from VCS.
      post = Fabricate(:post, topic: topic, raw: <<~RAW)
        a https://car.com?bob=1, http://efg.com.au?bill=1 b hij.net/xyz=1 www.klm.net/?IGNORE=1 test https://cars.com
      RAW

      post.rebake!
      post.reload
      topic = post.topic

      # Note, a random non URL string should be tokenized properly,
      # hence www.klm.net?IGNORE=1 it was inserted in autolinking.
      # We could consider amending the auto linker to add
      # more context to say "hey, this part of ... was a guess by autolinker.
      # A blanket treating of non-urls without this logic is risky.
      expect(post.post_search_data.raw_data).to eq(
        "a https://car.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test https://cars.com",
      )

      expect(post.post_search_data.search_data).to eq_ts_vector(
        "'/?ignore=1':21 '/xyz=1':14,17 'car.com':9 'cars.com':24 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9,10,24 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19 'car':9,24 'co':22 'de':22 'efg':10 'hij':13,16 'klm':18,20 'nop':22 'www':18,20 'abc':22",
      )
    end

    it "should not include lightbox in search" do
      Jobs.run_immediately!
      SiteSetting.max_image_width = 1

      stub_request(:get, "https://1.2.3.4/some.png").to_return(
        status: 200,
        body: file_from_fixtures("logo.png").read,
      )

      src = "https://meta.discourse.org/some.png"

      # NOTE(review): fixture markup stripped — `src` is otherwise unused and
      # the expectation below references "GOT", so the raw presumably embedded
      # an image built from `src` with alt "GOT"; restore from VCS.
      post = Fabricate(:post, raw: <<~RAW)
        Let me see how I can fix this image white walkers
      RAW

      post.rebake!
      post.reload

      expect(post.cooked).not_to include(CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS)

      expect(post.post_search_data.raw_data).to eq(
        "Let me see how I can fix this image white walkers GOT",
      )
    end

    it "should strips audio and videos URLs from raw data" do
      SiteSetting.authorized_extensions = "mp4"
      Fabricate(:video_upload)

      post.update!(raw: <<~RAW)
        link to an external page: https://google.com/?u=bar link to an audio file: https://somesite.com/audio.m4a link to a video file: https://somesite.com/content/somethingelse.MOV link to an invalid URL: http:error]
      RAW

      expect(post.post_search_data.raw_data).to eq(
        "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]",
      )

      expect(post.post_search_data.search_data).to eq_ts_vector(
        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27 'googl':15 'somesit':22,30",
      )
    end

    it "should unaccent indexed content" do
      SiteSetting.search_ignore_accents = true
      post.update!(raw: "Cette oeuvre d'art n'est pas une œuvre")
      post.post_search_data.reload

      expect(post.post_search_data.search_data).not_to include("œuvr")
      expect(post.post_search_data.search_data).to include("oeuvr")

      SiteSetting.search_ignore_accents = false
      SearchIndexer.index(post, force: true)
      post.post_search_data.reload

      expect(post.post_search_data.search_data).to include("œuvr")
      expect(post.post_search_data.search_data).to include("oeuvr")
    end

    it "truncates long words in the index" do
      SiteSetting.search_max_indexed_word_length = 4
      title = "A title that is long enough"
      contents = "I am the best beige object http://example.com/long/url"

      topic.update!(title: title)
      post.update!(raw: contents)

      post_search_data = post.post_search_data
      post_search_data.reload

      expect(post_search_data.raw_data).to eq(contents)

      words = post_search_data.search_data.scan(/'([^']*)'/).map { |match| match[0] }

      expect(words).to contain_exactly(
        "best",
        "beig",
        "obj",
        "http",
        "titl",
        "long",
        "enou",
        "unca",
      )
    end

    it "limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured" do
      SiteSetting.max_duplicate_search_index_terms = 5

      # NOTE(review): fixture markup stripped — the expected ts_vector below
      # references something.com/somethings.com paths absent from this visible
      # text, so anchors were presumably removed; restore from VCS.
      contents = <<~TEXT
        #{"sam " * 10} url url
      TEXT

      post.update!(raw: contents)

      post_search_data = post.post_search_data
      post_search_data.reload

      terms =
        "'/path:path''path':22,26 'com':21,25 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23,27 'someth':21,25 'somethings.com':25 'somethings.com/path:path''path':24"

      expect(post_search_data.search_data).to eq_ts_vector(terms)
    end
  end

  describe ".queue_post_reindex" do
    let(:post) { Fabricate(:post) }
    let(:topic) { post.topic }

    it "should reset the version of search data for all posts in the topic" do
      post2 = Fabricate(:post)

      SearchIndexer.queue_post_reindex(topic.id)

      expect(post.reload.post_search_data.version).to eq(SearchIndexer::REINDEX_VERSION)
      expect(post2.reload.post_search_data.version).to eq(SearchIndexer::POST_INDEX_VERSION)
    end
  end

  describe ".queue_users_reindex" do
    let!(:user) { Fabricate(:user) }
    let!(:user2) { Fabricate(:user) }

    it "should reset the version of search data for all users" do
      SearchIndexer.index(user, force: true)
      SearchIndexer.index(user2, force: true)

      SearchIndexer.queue_users_reindex([user.id])

      expect(user.reload.user_search_data.version).to eq(SearchIndexer::REINDEX_VERSION)
      expect(user2.reload.user_search_data.version).to eq(SearchIndexer::USER_INDEX_VERSION)
    end
  end
end