From 255b0e9f14a35d395d49cb521a3d7f7c6e8e8130 Mon Sep 17 00:00:00 2001
From: Guo Xiang Tan <gxtan1990@gmail.com>
Date: Thu, 6 Aug 2020 12:25:03 +0800
Subject: [PATCH] PERF: Replace video and audio links in search blurb while
 indexing.

In the near future, we will be swtiching to PG headlines to generate the
search blurb. As such, we need to replace audio and video links in the
raw data used for headline generation. This also means that we avoid
replacing links each time we need to generate the blurb.
---
 app/mailers/group_smtp_mailer.rb     |  2 +-
 app/mailers/user_notifications.rb    |  2 +-
 app/services/search_indexer.rb       | 34 ++++++++++++++++++++++------
 lib/discourse.rb                     |  2 ++
 lib/search/grouped_search_results.rb | 27 +++++++++++-----------
 spec/services/search_indexer_spec.rb | 23 +++++++++++++++++++
 6 files changed, 68 insertions(+), 22 deletions(-)

diff --git a/app/mailers/group_smtp_mailer.rb b/app/mailers/group_smtp_mailer.rb
index 01ef9b5b17c..15c3d0cf530 100644
--- a/app/mailers/group_smtp_mailer.rb
+++ b/app/mailers/group_smtp_mailer.rb
@@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base
 
   def strip_secure_urls(raw)
     urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
 
     urls.each do |url|
       if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
diff --git a/app/mailers/user_notifications.rb b/app/mailers/user_notifications.rb
index 62525eea177..aa700f5c028 100644
--- a/app/mailers/user_notifications.rb
+++ b/app/mailers/user_notifications.rb
@@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base
 
   def strip_secure_urls(raw)
     urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
 
     urls.each do |url|
       if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index cb60237ab94..bc9c7ecaf8e 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -46,13 +46,6 @@ class SearchIndexer
       d: search_data[3],
     }
 
-    indexed_data =
-      if table.to_s == "post"
-        ranked_params[:d]
-      else
-        search_data.select { |d| d.length > 0 }.join(' ')
-      end
-
     tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
     additional_lexemes = []
 
@@ -75,6 +68,13 @@ class SearchIndexer
 
     tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
 
+    indexed_data =
+      if table.to_s == "post"
+        clean_post_raw_data!(ranked_params[:d])
+      else
+        search_data.select { |d| d.length > 0 }.join(' ')
+      end
+
     params = {
       raw_data: indexed_data,
       id: id,
@@ -216,6 +216,26 @@ class SearchIndexer
     end
   end
 
+  def self.clean_post_raw_data!(raw_data)
+    urls = Set.new
+    raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
+
+    urls.each do |url|
+      begin
+        case File.extname(URI(url).path || "")
+        when Oneboxer::VIDEO_REGEX
+          raw_data.gsub!(url, I18n.t("search.video"))
+        when Oneboxer::AUDIO_REGEX
+          raw_data.gsub!(url, I18n.t("search.audio"))
+        end
+      rescue URI::InvalidURIError
+      end
+    end
+
+    raw_data
+  end
+  private_class_method :clean_post_raw_data!
+
   class HtmlScrubber < Nokogiri::XML::SAX::Document
 
     attr_reader :scrubbed
diff --git a/lib/discourse.rb b/lib/discourse.rb
index 5d328826337..2c2d8a8fa48 100644
--- a/lib/discourse.rb
+++ b/lib/discourse.rb
@@ -24,6 +24,8 @@ module Discourse
   end
 
   class Utils
+    URI_REGEXP = URI.regexp(%w{http https})
+
     # Usage:
     #   Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
     # or with a block
diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb
index 16260661b55..ba112b50344 100644
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@@ -106,23 +106,24 @@ class Search
       end
     end
 
-    URI_REGEXP = URI.regexp(%w{http https})
-
     def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
       blurb = nil
-      cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
 
-      urls = Set.new
-      cooked.scan(URI_REGEXP) { urls << $& }
-      urls.each do |url|
-        begin
-          case File.extname(URI(url).path || "")
-          when Oneboxer::VIDEO_REGEX
-            cooked.gsub!(url, I18n.t("search.video"))
-          when Oneboxer::AUDIO_REGEX
-            cooked.gsub!(url, I18n.t("search.audio"))
+      if scrub
+        cooked = SearchIndexer.scrub_html_for_search(cooked)
+
+        urls = Set.new
+        cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
+        urls.each do |url|
+          begin
+            case File.extname(URI(url).path || "")
+            when Oneboxer::VIDEO_REGEX
+              cooked.gsub!(url, I18n.t("search.video"))
+            when Oneboxer::AUDIO_REGEX
+              cooked.gsub!(url, I18n.t("search.audio"))
+            end
+          rescue URI::InvalidURIError
           end
-        rescue URI::InvalidURIError
         end
       end
 
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index f55b183e864..c359c32ce63 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -209,6 +209,29 @@ describe SearchIndexer do
         "Let me see how I can fix this image white walkers GOT"
       )
     end
+
+    it 'should strips audio and videos URLs from raw data' do
+      SiteSetting.authorized_extensions = 'mp4'
+      upload = Fabricate(:video_upload)
+
+      post.update!(raw: <<~RAW)
+      link to an external page: https://google.com/?u=bar
+
+      link to an audio file: https://somesite.com/audio.m4a
+
+      link to a video file: https://somesite.com/content/somethingelse.MOV
+
+      link to an invalid URL: http:error]
+      RAW
+
+      expect(post.post_search_data.raw_data).to eq(
+        "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
+      )
+
+      expect(post.post_search_data.search_data).to eq(
+        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
+      )
+    end
   end
 
   describe '.queue_post_reindex' do