PERF: Replace video and audio links in search blurb while indexing.

In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb.
2025-03-24 10:49:22 +08:00 · 2020-08-06 12:25:03 +08:00 · 2020-08-06 12:25:03 +08:00 · 255b0e9f14
commit 255b0e9f14
parent 06ef87da51
6 changed files with 68 additions and 22 deletions
--- a/app/mailers/group_smtp_mailer.rb
+++ b/app/mailers/group_smtp_mailer.rb
@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base

  def strip_secure_urls(raw)
    urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }

    urls.each do |url|
      if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
--- a/app/mailers/user_notifications.rb
+++ b/app/mailers/user_notifications.rb
@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base

  def strip_secure_urls(raw)
    urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }

    urls.each do |url|
      if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@ -46,13 +46,6 @@ class SearchIndexer
      d: search_data[3],
    }

-    indexed_data =
-      if table.to_s == "post"
-        ranked_params[:d]
-      else
-        search_data.select { |d| d.length > 0 }.join(' ')
-      end
-
    tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
    additional_lexemes = []

@ -75,6 +68,13 @@ class SearchIndexer

    tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"

+    indexed_data =
+      if table.to_s == "post"
+        clean_post_raw_data!(ranked_params[:d])
+      else
+        search_data.select { |d| d.length > 0 }.join(' ')
+      end
+
    params = {
      raw_data: indexed_data,
      id: id,
@ -216,6 +216,26 @@ class SearchIndexer
    end
  end

+  def self.clean_post_raw_data!(raw_data)
+    urls = Set.new
+    raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
+
+    urls.each do |url|
+      begin
+        case File.extname(URI(url).path || "")
+        when Oneboxer::VIDEO_REGEX
+          raw_data.gsub!(url, I18n.t("search.video"))
+        when Oneboxer::AUDIO_REGEX
+          raw_data.gsub!(url, I18n.t("search.audio"))
+        end
+      rescue URI::InvalidURIError
+      end
+    end
+
+    raw_data
+  end
+  private_class_method :clean_post_raw_data!
+
  class HtmlScrubber < Nokogiri::XML::SAX::Document

    attr_reader :scrubbed
--- a/lib/discourse.rb
+++ b/lib/discourse.rb
@ -24,6 +24,8 @@ module Discourse
  end

  class Utils
+    URI_REGEXP = URI.regexp(%w{http https})
+
    # Usage:
    #   Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
    # or with a block
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@ -106,23 +106,24 @@ class Search
      end
    end

-    URI_REGEXP = URI.regexp(%w{http https})
-
    def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
      blurb = nil
-      cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub

-      urls = Set.new
-      cooked.scan(URI_REGEXP) { urls << $& }
-      urls.each do |url|
-        begin
-          case File.extname(URI(url).path || "")
-          when Oneboxer::VIDEO_REGEX
-            cooked.gsub!(url, I18n.t("search.video"))
-          when Oneboxer::AUDIO_REGEX
-            cooked.gsub!(url, I18n.t("search.audio"))
+      if scrub
+        cooked = SearchIndexer.scrub_html_for_search(cooked)
+
+        urls = Set.new
+        cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
+        urls.each do |url|
+          begin
+            case File.extname(URI(url).path || "")
+            when Oneboxer::VIDEO_REGEX
+              cooked.gsub!(url, I18n.t("search.video"))
+            when Oneboxer::AUDIO_REGEX
+              cooked.gsub!(url, I18n.t("search.audio"))
+            end
+          rescue URI::InvalidURIError
          end
-        rescue URI::InvalidURIError
        end
      end

--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@ -209,6 +209,29 @@ describe SearchIndexer do
        "Let me see how I can fix this image white walkers GOT"
      )
    end
+
+    it 'should strips audio and videos URLs from raw data' do
+      SiteSetting.authorized_extensions = 'mp4'
+      upload = Fabricate(:video_upload)
+
+      post.update!(raw: <<~RAW)
+      link to an external page: https://google.com/?u=bar
+
+      link to an audio file: https://somesite.com/audio.m4a
+
+      link to a video file: https://somesite.com/content/somethingelse.MOV
+
+      link to an invalid URL: http:error]
+      RAW
+
+      expect(post.post_search_data.raw_data).to eq(
+        "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
+      )
+
+      expect(post.post_search_data.search_data).to eq(
+        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
+      )
+    end
  end

  describe '.queue_post_reindex' do