2019-04-30 08:27:42 +08:00
# frozen_string_literal: true
2016-12-22 10:13:14 +08:00
describe SearchIndexer do
2017-08-16 19:38:34 +08:00
# Fixed post id handed to update_posts_index by the examples that index
# without fabricating a real post.
let(:post_id) { 99 }

# Indexing is switched on for every example and switched off again afterwards.
before { SearchIndexer.enable }
after { SearchIndexer.disable }
2014-06-24 15:10:56 +08:00
# With the default locale set to zh_CN, indexes a Chinese string as the cooked
# body (all other fields blank) and checks what ends up in PostSearchData.
it 'correctly indexes chinese' do
SiteSetting . default_locale = 'zh_CN'
data = " 你好世界 "
2020-08-18 14:51:17 +08:00
SearchIndexer . update_posts_index (
post_id : post_id ,
topic_title : " " ,
category_name : " " ,
topic_tags : " " ,
cooked : data ,
private_message : false
)
2014-06-24 15:10:56 +08:00
2020-07-17 16:27:30 +08:00
# The indexer's effect is observed by reading the stored row back.
post_search_data = PostSearchData . find_by ( post_id : post_id )
# Both expected values show the input split into two terms, each with its own
# position in search_data.
expect ( post_search_data . raw_data ) . to eq ( " 你好 世界 " )
expect ( post_search_data . search_data ) . to eq ( " '世界':2 '你好':1 " )
2014-06-24 15:10:56 +08:00
end
2018-04-26 13:46:52 +08:00
# A lazyYT placeholder <div> carries the video title in its data-youtube-title
# attribute; the scrubbed text is expected to be exactly that title.
it 'extract youtube title' do
html = " <div class= \" lazyYT \" data-youtube-id= \" lmFgeFh2nlw \" data-youtube-title= \" Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] \" data-width= \" 480 \" data-height= \" 270 \" data-parameters= \" feature=oembed&wmode=opaque \" ></div> "
2022-03-08 05:03:10 +08:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2018-09-17 16:31:15 +08:00
expect ( scrubbed ) . to eq ( " Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] " )
2018-04-26 13:46:52 +08:00
end
2018-08-20 08:39:19 +08:00
# For a plain anchor the scrubbed text is expected to contain the href value
# followed by the link text.
it 'extract a link' do
html = " <a href='http://meta.discourse.org/'>link</a> "
2022-03-08 05:03:10 +08:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2018-09-17 16:31:15 +08:00
expect ( scrubbed ) . to eq ( " http://meta.discourse.org/ link " )
2018-08-20 08:39:19 +08:00
end
2022-04-07 04:06:45 +08:00
# An anchor with class 'anchor' is expected to contribute only its text to the
# scrubbed output; its href must not appear.
it 'ignores autogenerated link anchors' do
html = " <a class='anchor' href=' # something-special'>something special</a> "
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
expect ( scrubbed ) . to eq ( " something special " )
end
2019-04-29 23:15:55 +08:00
# User-mention anchors should be reduced to their visible "@name" text; the
# percent-encoded profile hrefs must not show up in the scrubbed output.
it 'extracts @username from mentions' do
  mentions_markup = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'

  expect(SearchIndexer::HtmlScrubber.scrub(mentions_markup)).to eq('@狮子 @foo')
end
# Group-mention anchors should be reduced to their visible "@group" text; the
# percent-encoded group href must not show up in the scrubbed output.
it 'extracts @groupname from group mentions' do
  group_mention_markup = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'

  expect(SearchIndexer::HtmlScrubber.scrub(group_mention_markup)).to eq('@автомобилист')
end
2019-04-29 23:26:29 +08:00
# Builds an emoji <img> tag whose src is the emoji URL joined onto
# Discourse.base_url_no_prefix and whose title and alt are both ":wink:".
it 'extracts emoji name from emoji image' do
2022-01-24 13:35:30 +08:00
emoji = Emoji [ " wink " ]
2022-02-09 19:18:59 +08:00
html = %Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
2022-03-08 05:03:10 +08:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2022-01-24 13:35:30 +08:00
2019-04-29 23:26:29 +08:00
# Only the emoji name is expected to survive; the image URL must not be indexed.
expect ( scrubbed ) . to eq ( ':wink:' )
end
2018-09-14 00:53:53 +08:00
# The fixture below contains an external image, an image served from this
# site's /uploads path, and a lightbox wrapper around a third image with
# filename/size metadata.
it " doesn't index local files " do
html = << ~ HTML
< p > < img src = " https://www.discourse.org/logo.png " alt = " Discourse " > < / p>
< p > < img src = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png " alt = " 51%20PM " width = " 289 " height = " 398 " > < / p>
< div class = " lightbox-wrapper " >
< a class = " lightbox " href = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg " data - download - href = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b " title = " Untitled design (21).jpg " rel = " nofollow noopener " >
< img src = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg " alt = " Untitled%20design%20(21) " width = " 563 " height = " 500 " >
< div class = " meta " >
2021-11-25 12:22:43 +08:00
< svg class = " fa d-icon d-icon-far-image svg-icon " aria - hidden = " true " > < use href = " # far-image " > < / use>< /s vg >
2018-09-14 00:53:53 +08:00
< span class = " filename " > Untitled design ( 21 ) . jpg < / span>
< span class = " informations " > 1280 x1136 472 KB < / span>
2021-11-25 12:22:43 +08:00
< svg class = " fa d-icon d-icon-discourse-expand svg-icon " aria - hidden = " true " > < use href = " # discourse-expand " > < / use>< /s vg >
2018-09-14 00:53:53 +08:00
< / div>
< / a>
< / div>
HTML
2022-03-08 05:03:10 +08:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2018-09-14 00:53:53 +08:00
2019-04-01 16:18:54 +08:00
# The expected text consists of the three alt attributes only: no image or
# upload URLs, no lightbox title, and no filename or size metadata.
expect ( scrubbed ) . to eq ( " Discourse 51%20PM Untitled%20design%20(21) " )
2018-09-14 00:53:53 +08:00
end
2017-08-16 19:38:34 +08:00
# Checks that re-indexing a row whose stored version is -1 rewrites raw_data,
# locale and version.
it 'correctly indexes a post according to version' do
# Seed a search-data row for the post, then force its version to -1 so the
# second update_posts_index call below runs against that row.
2020-08-18 14:51:17 +08:00
SearchIndexer . update_posts_index (
post_id : post_id ,
topic_title : " dummy " ,
category_name : " " ,
topic_tags : nil ,
cooked : nil ,
private_message : false
)
2019-04-29 15:32:25 +08:00
PostSearchData . find_by ( post_id : post_id ) . update! ( version : - 1 )
2014-06-24 15:10:56 +08:00
2017-08-16 19:38:34 +08:00
data = " <a>This</a> is a test "
2020-08-18 14:51:17 +08:00
SearchIndexer . update_posts_index (
post_id : post_id ,
topic_title : " " ,
category_name : " " ,
topic_tags : nil ,
cooked : data ,
private_message : false
)
2014-06-24 15:10:56 +08:00
2017-08-16 19:38:34 +08:00
# pluck returns one [raw_data, locale, version] triple per row; take the first.
raw_data , locale , version = PostSearchData . where ( post_id : post_id ) . pluck ( :raw_data , :locale , :version ) [ 0 ]
2016-08-11 03:40:58 +08:00
# Markup is stripped from the cooked text, and the row now carries the site's
# default locale and the current post index version.
expect ( raw_data ) . to eq ( " This is a test " )
2019-05-16 05:43:00 +08:00
expect ( locale ) . to eq ( SiteSetting . default_locale )
2020-07-23 14:10:05 +08:00
expect ( version ) . to eq ( SearchIndexer :: POST_INDEX_VERSION )
2014-06-24 15:10:56 +08:00
end
2019-03-19 17:16:57 +08:00
describe '.index' do
2020-07-27 15:22:54 +08:00
# Lazily fabricated topic with a fixed title, and a post belonging to it,
# available to the examples below (some of which define their own locals).
let ( :topic ) { Fabricate ( :topic , title : " this is a title that I am testing " ) }
let ( :post ) { Fabricate ( :post , topic : topic ) }
2019-03-19 17:16:57 +08:00
# Creating the post must add exactly one PostSearchData row; changing its raw
# text or moving it to another topic must each change the stored search_data.
it 'should index posts correctly' do
# `post` is lazy, so referencing it inside the block is what creates it.
expect { post } . to change { PostSearchData . count } . by ( 1 )
expect { post . update! ( raw : " this is new content " ) }
2020-07-17 16:27:30 +08:00
. to change { post . reload . post_search_data . search_data }
2019-03-19 17:16:57 +08:00
expect { post . update! ( topic_id : Fabricate ( :topic ) . id ) }
2020-07-17 16:27:30 +08:00
. to change { post . reload . post_search_data . search_data }
2019-03-19 17:16:57 +08:00
end
2019-04-01 10:06:27 +08:00
2021-04-07 15:02:00 +08:00
# Indexing must tolerate pathological markup: the cooked body repeats an
# unclosed tag Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH times. The example
# asserts explicitly that update_posts_index does not raise, rather than
# relying on the absence of an exception with no expectation at all.
it 'should work with invalid HTML' do
  post.update!(cooked: " <FD> " * Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)

  expect do
    SearchIndexer.update_posts_index(
      post_id: post.id,
      topic_title: post.topic.title,
      # The topic may have no category, hence the safe navigation.
      category_name: post.topic.category&.name,
      topic_tags: post.topic.tags.map(&:name).join(' '),
      cooked: post.cooked,
      private_message: post.topic.private_message?
    )
  end.not_to raise_error
end
2019-04-01 10:06:27 +08:00
# A small_action post is built with the raw value shown below and saved with
# validations skipped; saving it must not create a PostSearchData row.
it 'should not index posts with empty raw' do
expect do
post = Fabricate . build ( :post , raw : " " , post_type : Post . types [ :small_action ] )
post . save! ( validate : false )
end . to_not change { PostSearchData . count }
end
2019-04-01 10:14:29 +08:00
# The post body is a single bare image URL; the post is rebaked and reloaded
# before the stored search data is inspected.
it " should not tokenize urls and duplicate title and href in <a> " do
2020-07-27 15:22:54 +08:00
post . update! ( raw : << ~ RAW )
2019-04-01 10:14:29 +08:00
https : / /me ta . discourse . org / some . png
RAW
post . rebake!
post . reload
# raw_data is expected to hold the URL exactly once.
expect ( post . post_search_data . raw_data ) . to eq (
2020-07-17 16:27:30 +08:00
" https://meta.discourse.org/some.png "
2020-07-09 14:56:02 +08:00
)
2020-07-27 15:22:54 +08:00
# search_data is expected to hold host and path lexemes for the URL alongside
# the weighted (A/B) lexemes.
expect ( post . post_search_data . search_data ) . to eq (
" '/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B "
)
2020-07-09 14:56:02 +08:00
end
2020-07-27 14:46:44 +08:00
# Dotted numeric strings are expected to be indexed as one lexeme each instead
# of being split at the dots.
it 'should not tokenize versions' do
2020-07-27 15:17:49 +08:00
post . update! ( raw : '123.223' )
2020-07-27 14:46:44 +08:00
expect ( post . post_search_data . search_data ) . to eq (
2020-07-27 15:17:49 +08:00
" '123.223':10 'test':8A 'titl':4A 'uncategor':9B "
)
# Same check with a four-part value; the post is reloaded before reading the
# search data again.
post . update! ( raw : '15.2.231.423' )
post . reload
expect ( post . post_search_data . search_data ) . to eq (
" '15.2.231.423':10 'test':8A 'titl':4A 'uncategor':9B "
2020-07-27 14:46:44 +08:00
)
end
2020-07-09 17:02:02 +08:00
# Uses its own category, topic and post (local variables that shadow the
# group-level `post`/`topic` helpers). The raw text mixes explicit URLs, bare
# host-like strings and an explicit <a> tag, several with query strings.
it 'should tokenize host of a URL and removes query string' do
2020-07-09 14:56:02 +08:00
category = Fabricate ( :category , name : 'awesome category' )
topic = Fabricate ( :topic , category : category , title : 'this is a test topic' )
post = Fabricate ( :post , topic : topic , raw : << ~ RAW )
2020-10-08 08:40:13 +08:00
a https : / / abc . com? bob = 1 , http : / /e fg . com . au? bill = 1 b hij . net / xyz = 1
2021-05-21 09:43:47 +08:00
www . klm . net / ? IGNORE = 1 < a href = " http://abc.de.nop.co.uk?IGNORE=1&ignore2=2 " > test < / a>
2020-07-09 14:56:02 +08:00
RAW
post . rebake!
post . reload
topic = post . topic
2020-10-08 08:40:13 +08:00
# Note, a random non URL string should be tokenized properly,
# hence www.klm.net?IGNORE=1 it was inserted in autolinking.
# We could consider amending the auto linker to add
# more context to say "hey, this part of <a href>...</a> was a guess by autolinker.
# A blanket treating of non-urls without this logic is risky.
2020-07-09 14:56:02 +08:00
# In the expected raw_data the href-derived URLs appear without their query
# strings, while the visible link text "www.klm.net/?IGNORE=1" is kept as typed.
expect ( post . post_search_data . raw_data ) . to eq (
2020-10-08 08:40:13 +08:00
" a https://abc.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test "
2020-07-09 14:56:02 +08:00
)
# The expected search_data lists each host together with its shorter domain
# suffixes (e.g. abc.de.nop.co.uk down to uk).
expect ( post . post_search_data . search_data ) . to eq (
2020-10-08 08:40:13 +08:00
" '/?ignore=1':21 '/xyz=1':14,17 'abc.com':9 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19 "
2019-04-01 10:14:29 +08:00
)
end
# Jobs run inline and max_image_width is set to 1 before cooking a post that
# contains an inline <img>; the remote image request is stubbed with a fixture.
it 'should not include lightbox in search' do
Jobs . run_immediately!
2019-04-01 10:32:25 +08:00
SiteSetting . max_image_width = 1
stub_request ( :get , " https://meta.discourse.org/some.png " )
. to_return ( status : 200 , body : file_from_fixtures ( " logo.png " ) . read )
2019-04-01 10:14:29 +08:00
src = " https://meta.discourse.org/some.png "
post = Fabricate ( :post , raw : << ~ RAW )
Let me see how I can fix this image
2019-04-01 16:18:54 +08:00
< img src = " #{ src } " title = " GOT " alt = " white walkers " width = " 2 " height = " 2 " >
2019-04-01 10:14:29 +08:00
RAW
post . rebake!
post . reload
2021-05-21 03:19:44 +08:00
# The cooked HTML must not contain the lightbox wrapper CSS class.
expect ( post . cooked ) . not_to include (
2019-04-01 10:32:25 +08:00
CookedPostProcessor :: LIGHTBOX_WRAPPER_CSS_CLASS
)
2019-04-01 10:14:29 +08:00
# raw_data is expected to hold the sentence followed by the image's alt and
# title values, and nothing else.
expect ( post . post_search_data . raw_data ) . to eq (
2020-07-17 16:27:30 +08:00
" Let me see how I can fix this image white walkers GOT "
2019-04-01 10:14:29 +08:00
)
end
2020-08-06 12:25:03 +08:00
# Authorizes the mp4 extension and fabricates a video upload, then updates the
# post with four lines: an external page link, an audio link, a video link and
# a malformed URL.
it 'should strips audio and videos URLs from raw data' do
SiteSetting . authorized_extensions = 'mp4'
2020-10-08 08:40:13 +08:00
Fabricate ( :video_upload )
2020-08-06 12:25:03 +08:00
post . update! ( raw : << ~ RAW )
link to an external page : https : / / google . com / ?u = bar
link to an audio file : https : / /somesi te . com / audio . m4a
link to a video file : https : / /somesi te . com / content / somethingelse . MOV
link to an invalid URL : http : error ]
RAW
# In the expected raw_data the external URL has lost its query string, the
# audio/video URLs are replaced by the search.audio / search.video
# translations, and the malformed URL is kept as plain text.
expect ( post . post_search_data . raw_data ) . to eq (
" link to an external page: https://google.com/ link to an audio file: #{ I18n . t ( " search.audio " ) } link to a video file: #{ I18n . t ( " search.video " ) } link to an invalid URL: http:error] "
)
# The expected search_data still contains host and path lexemes for the
# media URLs.
expect ( post . post_search_data . search_data ) . to eq (
" '/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27 "
)
end
2022-03-08 05:03:10 +08:00
# Indexes text containing both "oeuvre" and "œuvre" with search_ignore_accents
# on, then again with it off.
it 'should unaccent indexed content' do
SiteSetting . search_ignore_accents = true
post . update! ( raw : " Cette oeuvre d'art n'est pas une œuvre " )
post . post_search_data . reload
# Setting on: only the 'oeuvr' stem is expected in search_data.
expect ( post . post_search_data . search_data ) . not_to include ( 'œuvr' )
expect ( post . post_search_data . search_data ) . to include ( 'oeuvr' )
SiteSetting . search_ignore_accents = false
# Re-index explicitly with force: true after flipping the setting.
SearchIndexer . index ( post , force : true )
post . post_search_data . reload
# Setting off: both stems are expected to be present.
expect ( post . post_search_data . search_data ) . to include ( 'œuvr' )
expect ( post . post_search_data . search_data ) . to include ( 'oeuvr' )
end
2022-04-07 01:23:30 +08:00
# With search_max_indexed_word_length set to 4, raw_data must stay untouched
# while every lexeme listed in search_data is at most four characters long.
it 'truncates long words in the index' do
SiteSetting . search_max_indexed_word_length = 4
title = 'A title that is long enough'
contents = 'I am the best beige object http://example.com/long/url'
topic . update! ( title : title )
post . update! ( raw : contents )
post_search_data = post . post_search_data
post_search_data . reload
expect ( post_search_data . raw_data ) . to eq ( contents )
# Collect the text of every single-quoted lexeme found in search_data.
words = post_search_data . search_data . scan ( / '([^']*)' / ) . map { | match | match [ 0 ] }
expect ( words ) . to contain_exactly ( 'best' , 'beig' , 'obj' , 'http' , 'titl' , 'long' , 'enou' , 'unca' )
end
2019-03-19 17:16:57 +08:00
end
2019-04-02 09:52:59 +08:00
# Queuing a topic for reindexing should mark the search data of that topic's
# posts with REINDEX_VERSION and leave other posts at POST_INDEX_VERSION.
describe '.queue_post_reindex' do
  let(:post) { Fabricate(:post) }
  let(:topic) { post.topic }

  it 'should reset the version of search data for all posts in the topic' do
    # Fabricated on its own, so presumably in a different topic; the final
    # expectation requires its version to be left alone.
    unrelated_post = Fabricate(:post)

    SearchIndexer.queue_post_reindex(topic.id)

    queued_version = post.reload.post_search_data.version
    expect(queued_version).to eq(SearchIndexer::REINDEX_VERSION)

    untouched_version = unrelated_post.reload.post_search_data.version
    expect(untouched_version).to eq(SearchIndexer::POST_INDEX_VERSION)
  end
end
2021-04-27 13:52:45 +08:00
# Queuing specific user ids for reindexing should mark only those users'
# search data with REINDEX_VERSION.
describe '.queue_users_reindex' do
  let!(:user) { Fabricate(:user) }
  let!(:user2) { Fabricate(:user) }

  it 'should reset the version of search data for all users' do
    # Both users are force-indexed first so each has search data to inspect.
    [user, user2].each { |indexed_user| SearchIndexer.index(indexed_user, force: true) }

    # Only the first user's id is queued.
    SearchIndexer.queue_users_reindex([user.id])

    queued_version = user.reload.user_search_data.version
    expect(queued_version).to eq(SearchIndexer::REINDEX_VERSION)

    untouched_version = user2.reload.user_search_data.version
    expect(untouched_version).to eq(SearchIndexer::USER_INDEX_VERSION)
  end
end
2014-06-24 15:10:56 +08:00
end