From 2196d0b9ae660bab7991608b37d267beb1c16916 Mon Sep 17 00:00:00 2001
From: Guo Xiang Tan <gxtan1990@gmail.com>
Date: Thu, 9 Jul 2020 17:02:02 +0800
Subject: [PATCH] FIX: Strip query from URLs when indexing for search.

Indexing query strings in URLS produces inconsistent results in PG and
pollutes the search data for really little gain.

The following seems to work as expected...

```
discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org?test=2&test2=3');
                     to_tsvector
------------------------------------------------------
 '2':3 '3':5 'test':2 'test2':4 'www.discourse.org':1
```

However, once a path is present

```
discourse_development=# SELECT TO_TSVECTOR('https://www.discourse.org/latest?test=2&test2=3');
                                         to_tsvector
----------------------------------------------------------------------------------------------
 '/latest?test=2&test2=3':3 'www.discourse.org':2 'www.discourse.org/latest?test=2&test2=3':1
```

The lexeme contains both the path and the query string.
---
 lib/search.rb                        | 9 +++++++++
 spec/services/search_indexer_spec.rb | 6 +++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/lib/search.rb b/lib/search.rb
index 36ade32e8f0..040f149b460 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -86,6 +86,15 @@ class Search
         data = strip_diacritics(data)
       end
     end
+
+    data.gsub!(EmailCook.url_regexp) do |url|
+      uri = URI.parse(url)
+      uri.query = nil
+      uri.to_s
+    rescue URI::Error
+      # Don't fail even if URL turns out to be invalid
+    end
+
     data
   end
 
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index 027b6607b22..64488e793dc 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -145,7 +145,7 @@ describe SearchIndexer do
       )
     end
 
-    it 'should tokenize host of a URL' do
+    it 'should tokenize host of a URL and removes query string' do
       category = Fabricate(:category, name: 'awesome category')
       topic = Fabricate(:topic, category: category, title: 'this is a test topic')
 
@@ -158,11 +158,11 @@ describe SearchIndexer do
       topic = post.topic
 
       expect(post.post_search_data.raw_data).to eq(
-        "#{topic.title} #{category.name} a https://cnn.com?bob=1 , http://stuff.com.au?bill=1 b http://abc.net/xyz=1 abc.net/xyz=1"
+        "#{topic.title} #{category.name} a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
       )
 
       expect(post.post_search_data.search_data).to eq(
-        "'/xyz=1':18,21 '1':11,14 'abc':17,20 'abc.net':17,20 'abc.net/xyz=1':16,19 'au':12 'awesom':6B 'b':15 'bill':13 'bob':10 'categori':7B 'cnn':9 'cnn.com':9 'com':9,12 'com.au':12 'net':17,20 'stuff':12 'stuff.com.au':12 'test':4A 'topic':5A"
+        "'/xyz=1':14,17 'abc':13,16 'abc.net':13,16 'abc.net/xyz=1':12,15 'au':10 'awesom':6B 'b':11 'categori':7B 'cnn':9 'cnn.com':9 'com':9,10 'com.au':10 'net':13,16 'stuff':10 'stuff.com.au':10 'test':4A 'topic':5A"
       )
     end