From 255b0e9f14a35d395d49cb521a3d7f7c6e8e8130 Mon Sep 17 00:00:00 2001 From: Guo Xiang Tan Date: Thu, 6 Aug 2020 12:25:03 +0800 Subject: [PATCH] PERF: Replace video and audio links in search blurb while indexing. In the near future, we will be swtiching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb. --- app/mailers/group_smtp_mailer.rb | 2 +- app/mailers/user_notifications.rb | 2 +- app/services/search_indexer.rb | 34 ++++++++++++++++++++++------ lib/discourse.rb | 2 ++ lib/search/grouped_search_results.rb | 27 +++++++++++----------- spec/services/search_indexer_spec.rb | 23 +++++++++++++++++++ 6 files changed, 68 insertions(+), 22 deletions(-) diff --git a/app/mailers/group_smtp_mailer.rb b/app/mailers/group_smtp_mailer.rb index 01ef9b5b17c..15c3d0cf530 100644 --- a/app/mailers/group_smtp_mailer.rb +++ b/app/mailers/group_smtp_mailer.rb @@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base def strip_secure_urls(raw) urls = Set.new - raw.scan(URI.regexp(%w{http https})) { urls << $& } + raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& } urls.each do |url| if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url)) diff --git a/app/mailers/user_notifications.rb b/app/mailers/user_notifications.rb index 62525eea177..aa700f5c028 100644 --- a/app/mailers/user_notifications.rb +++ b/app/mailers/user_notifications.rb @@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base def strip_secure_urls(raw) urls = Set.new - raw.scan(URI.regexp(%w{http https})) { urls << $& } + raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& } urls.each do |url| if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url)) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index cb60237ab94..bc9c7ecaf8e 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -46,13 +46,6 @@ class SearchIndexer d: search_data[3], } - indexed_data = - if table.to_s == "post" - ranked_params[:d] - else - search_data.select { |d| d.length > 0 }.join(' ') - end - tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0] additional_lexemes = [] @@ -75,6 +68,13 @@ class SearchIndexer tsvector = "#{tsvector} #{additional_lexemes.join(' ')}" + indexed_data = + if table.to_s == "post" + clean_post_raw_data!(ranked_params[:d]) + else + search_data.select { |d| d.length > 0 }.join(' ') + end + params = { raw_data: indexed_data, id: id, @@ -216,6 +216,26 @@ class SearchIndexer end end + def self.clean_post_raw_data!(raw_data) + urls = Set.new + raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& } + + urls.each do |url| + begin + case File.extname(URI(url).path || "") + when Oneboxer::VIDEO_REGEX + raw_data.gsub!(url, I18n.t("search.video")) + when Oneboxer::AUDIO_REGEX + raw_data.gsub!(url, I18n.t("search.audio")) + end + rescue URI::InvalidURIError + end + end + + raw_data + end + private_class_method :clean_post_raw_data! + class HtmlScrubber < Nokogiri::XML::SAX::Document attr_reader :scrubbed diff --git a/lib/discourse.rb b/lib/discourse.rb index 5d328826337..2c2d8a8fa48 100644 --- a/lib/discourse.rb +++ b/lib/discourse.rb @@ -24,6 +24,8 @@ module Discourse end class Utils + URI_REGEXP = URI.regexp(%w{http https}) + # Usage: # Discourse::Utils.execute_command("pwd", chdir: 'mydirectory') # or with a block diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb index 16260661b55..ba112b50344 100644 --- a/lib/search/grouped_search_results.rb +++ b/lib/search/grouped_search_results.rb @@ -106,23 +106,24 @@ class Search end end - URI_REGEXP = URI.regexp(%w{http https}) - def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true) blurb = nil - cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub - urls = Set.new - cooked.scan(URI_REGEXP) { urls << $& } - urls.each do |url| - begin - case File.extname(URI(url).path || "") - when Oneboxer::VIDEO_REGEX - cooked.gsub!(url, I18n.t("search.video")) - when Oneboxer::AUDIO_REGEX - cooked.gsub!(url, I18n.t("search.audio")) + if scrub + cooked = SearchIndexer.scrub_html_for_search(cooked) + + urls = Set.new + cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& } + urls.each do |url| + begin + case File.extname(URI(url).path || "") + when Oneboxer::VIDEO_REGEX + cooked.gsub!(url, I18n.t("search.video")) + when Oneboxer::AUDIO_REGEX + cooked.gsub!(url, I18n.t("search.audio")) + end + rescue URI::InvalidURIError end - rescue URI::InvalidURIError end end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index f55b183e864..c359c32ce63 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -209,6 +209,29 @@ describe SearchIndexer do "Let me see how I can fix this image white walkers GOT" ) end + + it 'should strips audio and videos URLs from raw data' do + SiteSetting.authorized_extensions = 'mp4' + upload = Fabricate(:video_upload) + + post.update!(raw: <<~RAW) + link to an external page: https://google.com/?u=bar + + link to an audio file: https://somesite.com/audio.m4a + + link to a video file: https://somesite.com/content/somethingelse.MOV + + link to an invalid URL: http:error] + RAW + + expect(post.post_search_data.raw_data).to eq( + "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]" + ) + + expect(post.post_search_data.search_data).to eq( + "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27" + ) + end end describe '.queue_post_reindex' do