From 6428aa5b1fe7f2bd18346345d915d25dc91bad54 Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Tue, 4 Jun 2019 16:19:27 +1000 Subject: [PATCH] FIX: search indexer had various cases where it could fail Prior to this fix, if a post had the text www.test.com/abc it would fail to index. This also simplifies the rules to avoid full URL parsing which can be expensive --- app/services/search_indexer.rb | 15 ++++++--------- spec/services/search_indexer_spec.rb | 12 +++++++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 08df177bde5..4ea9b7b972d 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -21,16 +21,13 @@ class SearchIndexer # insert some extra words for I.am.a.word so "word" is tokenized # I.am.a.word becomes I.am.a.word am a word raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot| - if with_dot.match?(PlainTextToMarkdown::URL_REGEX) - "#{with_dot} #{URI.parse(with_dot).hostname.gsub('.', ' ')}" - else - split = with_dot.split(".") - if split.length > 1 - with_dot + ((+" ") << split[1..-1].join(" ")) - else - with_dot - end + split = with_dot.split(/https?:\/\/|[?:;,.\/]/) + + if split.length > 1 + with_dot + ((+" ") << split[1..-1].reject { |x| x.blank? 
}.join(" ")) + else + with_dot end end end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index b92f342945e..f8823a9ab6d 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -17,6 +17,16 @@ describe SearchIndexer do SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics) end + it 'can correctly inject if http or https links exist' do + + val = "a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1" + result = SearchIndexer.inject_extra_terms(val) + + expected = "a https://cnn.com?bob=1, cnn com bob=1 http://stuff.com.au?bill=1 stuff com au bill=1 b abc.net/xyz=1 net xyz=1" + + expect(result).to eq(expected) + end + it 'correctly indexes chinese' do SiteSetting.default_locale = 'zh_CN' data = "你好世界" @@ -141,7 +151,7 @@ describe SearchIndexer do topic = post.topic expect(post.post_search_data.raw_data).to eq( - "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org" + "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org some png" ) end