From 2196d0b9ae660bab7991608b37d267beb1c16916 Mon Sep 17 00:00:00 2001 From: Guo Xiang Tan Date: Thu, 9 Jul 2020 17:02:02 +0800 Subject: [PATCH] FIX: Strip query from URLs when indexing for search. Indexing query strings in URLs produces inconsistent results in PG and pollutes the search data for very little gain. The following seems to work as expected... ``` discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org?test=2&test2=3'); to_tsvector ------------------------------------------------------ '2':3 '3':5 'test':2 'test2':4 'www.discourse.org':1 ``` However, once a path is present ``` discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org/latest?test=2&test2=3'); to_tsvector ---------------------------------------------------------------------------------------------- '/latest?test=2&test2=3':3 'www.discourse.org':2 'www.discourse.org/latest?test=2&test2=3':1 ``` The lexeme contains both the path and the query string. --- lib/search.rb | 9 +++++++++ spec/services/search_indexer_spec.rb | 6 +++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/lib/search.rb b/lib/search.rb index 36ade32e8f0..040f149b460 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -86,6 +86,15 @@ class Search data = strip_diacritics(data) end end + + data.gsub!(EmailCook.url_regexp) do |url| + uri = URI.parse(url) + uri.query = nil + uri.to_s + rescue URI::Error + # Don't fail even if URL turns out to be invalid + end + data end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index 027b6607b22..64488e793dc 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -145,7 +145,7 @@ describe SearchIndexer do ) end - it 'should tokenize host of a URL' do + it 'should tokenize host of a URL and removes query string' do category = Fabricate(:category, name: 'awesome category') topic = Fabricate(:topic, category: category, title: 'this is a test topic') @@ -158,11 +158,11 
@@ describe SearchIndexer do topic = post.topic expect(post.post_search_data.raw_data).to eq( - "#{topic.title} #{category.name} a https://cnn.com?bob=1 , http://stuff.com.au?bill=1 b http://abc.net/xyz=1 abc.net/xyz=1" + "#{topic.title} #{category.name} a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1" ) expect(post.post_search_data.search_data).to eq( - "'/xyz=1':18,21 '1':11,14 'abc':17,20 'abc.net':17,20 'abc.net/xyz=1':16,19 'au':12 'awesom':6B 'b':15 'bill':13 'bob':10 'categori':7B 'cnn':9 'cnn.com':9 'com':9,12 'com.au':12 'net':17,20 'stuff':12 'stuff.com.au':12 'test':4A 'topic':5A" + "'/xyz=1':14,17 'abc':13,16 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn':9 'cnn.com':9 'com':9,10 'com.au':10 'net':13,16 'stuff':10 'stuff.com.au':10 'test':4A 'topic':5A" ) end