discourse/lib/search/grouped_search_results.rb

# frozen_string_literal: true

require 'sanitize'

class Search

  class GroupedSearchResults
    include ActiveModel::Serialization

    # Exposes ActionView's text helpers as module-level methods on this class
    # (the blurb code below calls `TextHelper.excerpt` and
    # `TextHelper.truncate`) and overrides the helper that trims each side of
    # an excerpt with a faster implementation.
    class TextHelper
      extend ActionView::Helpers::TextHelper

      # NOTE: `cut_excerpt_part` is a singleton method (`def self.`). A bare
      # `private` keyword only changes the visibility of instance methods, so
      # it never applied here; the method is public and is left that way.

      # TODO: Remove when https://github.com/rails/rails/pull/39979 is merged
      # For a 10_000 words string, speeds up excerpts by 85X.
      #
      # Trims one side of an excerpt to at most `radius` units.
      #
      # part_position - :first for the text preceding the match (its tail is
      #                 kept); any other value keeps the head of `part`.
      # part          - the text on that side of the match, or nil.
      # separator     - "" to measure `part` as-is; otherwise `part` is split
      #                 on it, empty pieces are dropped, and the kept pieces
      #                 are joined back with it.
      # options       - Hash read for :radius (default 100) and :omission
      #                 (default "...").
      #
      # Returns a two-element array `[affix, part]`, where `affix` is the
      # omission marker when something was cut off and "" otherwise. Returns
      # `["", ""]` when `part` is nil.
      def self.cut_excerpt_part(part_position, part, separator, options)
        return "", "" unless part

        radius   = options.fetch(:radius, 100)
        omission = options.fetch(:omission, "...")
        split_on_separator = separator != ""

        if split_on_separator
          part = part.split(separator)
          part.delete("")
        end

        affix = part.length > radius ? omission : ""
        # Text before the match keeps its end; text after it keeps its start.
        part = part_position == :first ? part.last(radius) : part.first(radius)
        part = part.join(separator) if split_on_separator
        [affix, part]
      end
    end

    # Query description handed to the constructor.
    attr_reader :type_filter, :term, :search_context

    # Result buckets, one array per result type; #add appends to them.
    attr_reader :posts, :categories, :users, :tags, :groups

    # Flags raised by #add instead of appending once a bucket is full:
    # `more_<type>` when no type filter is set (limit Search.per_facet) and
    # `more_full_page_results` when one is (limit Search.per_filter).
    # NOTE(review): #add would also set @more_tags / @more_groups if tags or
    # groups are passed to it; those have no reader here.
    attr_reader :more_posts, :more_categories, :more_users, :more_full_page_results

    # nil until assigned through #error=.
    attr_reader :error

    attr_accessor :search_log_id

    # Blurb length used when the caller does not supply one.
    BLURB_LENGTH = 200

    # Builds an empty result set for one search.
    #
    # type_filter    - stored as-is; #add checks it to pick the size limit.
    # term           - the search term.
    # search_context - stored as-is and exposed through its reader.
    # blurb_length   - blurb length; BLURB_LENGTH when nil/false.
    # blurb_term     - term used when generating blurbs; `term` when nil/false.
    def initialize(type_filter:, term:, search_context:, blurb_length: nil, blurb_term: nil)
      @type_filter, @term, @search_context = type_filter, term, search_context

      # Blurb settings fall back to the search term and the default length.
      @blurb_term   = blurb_term || term
      @blurb_length = blurb_length || BLURB_LENGTH

      # Every bucket gets its own empty array.
      @posts, @categories, @users, @tags, @groups = Array.new(5) { [] }

      @error = nil
    end

    # Stores `error` in @error, which the `error` reader exposes.
    def error=(error)
      @error = error
    end

    # Attaches the guardian user's topic data to the topic of every post
    # result, using a single `TopicUser.lookup_for` call.
    #
    # guardian - object responding to `user`.
    #
    # Returns the array of topics that were updated, or nil when the guardian
    # has no user (in which case nothing is touched).
    def find_user_data(guardian)
      user = guardian.user
      return unless user

      topics = @posts.map(&:topic)
      user_data_by_topic_id = TopicUser.lookup_for(user, topics)
      topics.each do |topic|
        topic.user_data = user_data_by_topic_id[topic.id]
      end
    end

    # Marker that #blurb puts before/after a headline that does not start/end
    # with the post's leading/trailing raw data.
    OMISSION = '...'
    # Matches a <span> carrying the search highlight CSS class (optionally
    # surrounded by other attributes) and captures its inner text, so #blurb
    # can replace each highlight span with the plain text it wraps.
    SCRUB_HEADLINE_REGEXP = /<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)<\/span>/

    # Builds the blurb shown for a post in the search results.
    #
    # When the post's search data version is above
    # SearchIndexer::MIN_POST_REINDEX_VERSION and neither Chinese nor Japanese
    # segmentation is enabled, the indexed data is used:
    #   * with SiteSetting.use_pg_headlines_for_excerpt, `post.headline` is
    #     returned as-is, wrapped in OMISSION on each side where the headline
    #     (highlight spans stripped) does not start/end with the post's
    #     leading/trailing raw data;
    #   * otherwise the blurb is cut from the indexed raw data, unscrubbed.
    # In every other case the blurb is cut from `post.cooked`.
    #
    # Returns the blurb string.
    def blurb(post)
      uses_indexed_data =
        post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION &&
        !(Search.segment_chinese? || Search.segment_japanese?)

      if uses_indexed_data && SiteSetting.use_pg_headlines_for_excerpt
        headline = post.headline
        plain_headline = headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
        leading = plain_headline.start_with?(post.leading_raw_data) ? '' : OMISSION
        trailing = plain_headline.end_with?(post.trailing_raw_data) ? '' : OMISSION
        return "#{leading}#{headline}#{trailing}"
      end

      source =
        if uses_indexed_data
          # Indexed raw data is passed with scrubbing turned off.
          { cooked: post.post_search_data.raw_data, scrub: false }
        else
          { cooked: post.cooked }
        end

      GroupedSearchResults.blurb_for(term: @blurb_term, blurb_length: @blurb_length, **source)
    end

    # Files `object` into the bucket named after its class (downcased and
    # pluralized class name), unless that bucket is already at its limit.
    #
    # The limit is Search.per_filter when a type filter is present and
    # Search.per_facet otherwise. When the bucket is at the limit the object
    # is dropped and a flag is raised instead: @more_full_page_results with a
    # type filter, @more_<type> without one.
    #
    # Returns the bucket after appending, or true when a flag was set.
    def add(object)
      type = object.class.to_s.downcase.pluralize
      bucket = public_send(type)
      filtered = @type_filter.present?
      limit = filtered ? Search.per_filter : Search.per_facet

      if bucket.length != limit
        bucket << object
      elsif filtered
        @more_full_page_results = true
      else
        instance_variable_set("@more_#{type}".to_sym, true)
      end
    end

    # Produces a sanitized blurb from a piece of post content.
    #
    # cooked       - the text to cut the blurb from.
    # term         - search term to centre the excerpt on; when it matches
    #                Search::PHRASE_MATCH_REGEXP_PATTERN its first capture
    #                group is used instead.
    # blurb_length - passed as `length:` when truncating and halved to get
    #                the excerpt radius.
    # scrub        - when true, `cooked` is first run through
    #                SearchIndexer::HtmlScrubber and URLs whose path extension
    #                matches the Oneboxer video/audio patterns are replaced by
    #                the "search.video" / "search.audio" translations.
    #
    # Falls back to truncating `cooked` when no term is given or the excerpt
    # comes back blank. Returns the result of `Sanitize.clean`.
    def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
      if scrub
        cooked = SearchIndexer::HtmlScrubber.scrub(cooked)

        found_urls = Set.new
        cooked.scan(Discourse::Utils::URI_REGEXP) { found_urls << Regexp.last_match(0) }

        found_urls.each do |url|
          placeholder_key =
            case File.extname(URI(url).path || "")
            when Oneboxer::VIDEO_REGEX then "search.video"
            when Oneboxer::AUDIO_REGEX then "search.audio"
            end

          cooked.gsub!(url, I18n.t(placeholder_key)) if placeholder_key
        rescue URI::InvalidURIError
          # Unparseable URLs are left in the text untouched.
        end
      end

      excerpt = nil

      if term
        phrase = Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN).match(term)
        term = phrase[1] if phrase

        excerpt = TextHelper.excerpt(cooked, term, radius: blurb_length / 2)
      end

      excerpt = TextHelper.truncate(cooked, length: blurb_length) if excerpt.blank?
      Sanitize.clean(excerpt)
    end
  end

end
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`# frozen_string_literal: true`

FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`require 'sanitize'`

Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`class Search`

			`class GroupedSearchResults`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`include ActiveModel::Serialization`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`class TextHelper`
			`extend ActionView::Helpers::TextHelper`
PERF: Optimize `ActionView::Helpers::TextHelper#excerpt`. 2020-08-06 09:53:10 +08:00
			`private`

			`# TODO: Remove when https://github.com/rails/rails/pull/39979 is merged`
			`# For a 10_000 words string, speeds up excerpts by 85X.`
			`def self.cut_excerpt_part(part_position, part, separator, options)`
			`return "", "" unless part`

			`radius = options.fetch(:radius, 100)`
			`omission = options.fetch(:omission, "...")`

			`if separator != ""`
			`part = part.split(separator)`
			`part.delete("")`
			`end`

			`affix = part.length > radius ? omission : ""`
			`part = part.public_send(part_position == :first ? :last : :first, radius)`
			`part = part.join(separator) if separator != ""`
DEV: Make rubocop happy. 2020-08-06 10:08:40 +08:00			`[affix, part]`
PERF: Optimize `ActionView::Helpers::TextHelper#excerpt`. 2020-08-06 09:53:10 +08:00			`end`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`

Include the `search_log_id` in search results 2017-07-17 23:57:13 +08:00			`attr_reader(`
			`:type_filter,`
			`:posts,`
			`:categories,`
			`:users,`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`:tags,`
FEATURE: displays groups in menu search (#7090) 2019-03-04 17:30:09 +08:00			`:groups,`
Include the `search_log_id` in search results 2017-07-17 23:57:13 +08:00			`:more_posts,`
			`:more_categories,`
			`:more_users,`
			`:term,`
			`:search_context,`
FEATURE: when under extreme load disable search The global setting disable_search_queue_threshold (DISCOURSE_DISABLE_SEARCH_QUEUE_THRESHOLD) which default to 1 second was added. This protection ensures that when the application is unable to keep up with requests it will simply turn off search till it is not backed up. To disable this protection set this to 0. 2019-07-02 09:21:52 +08:00			`:more_full_page_results,`
			`:error`
Include the `search_log_id` in search results 2017-07-17 23:57:13 +08:00			`)`

			`attr_accessor :search_log_id`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`BLURB_LENGTH = 200`

			`def initialize(type_filter:, term:, search_context:, blurb_length: nil, blurb_term: nil)`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`@type_filter = type_filter`
			`@term = term`
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`@blurb_term = blurb_term \|\| term`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`@search_context = search_context`
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`@blurb_length = blurb_length \|\| BLURB_LENGTH`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`@posts = []`
			`@categories = []`
			`@users = []`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`@tags = []`
FEATURE: displays groups in menu search (#7090) 2019-03-04 17:30:09 +08:00			`@groups = []`
FEATURE: when under extreme load disable search The global setting disable_search_queue_threshold (DISCOURSE_DISABLE_SEARCH_QUEUE_THRESHOLD) which default to 1 second was added. This protection ensures that when the application is unable to keep up with requests it will simply turn off search till it is not backed up. To disable this protection set this to 0. 2019-07-02 09:21:52 +08:00			`@error = nil`
			`end`

			`def error=(error)`
			`@error = error`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`

FIX: include pinned status in search results 2016-03-18 13:26:20 +08:00			`def find_user_data(guardian)`
			`if user = guardian.user`
			`topics = @posts.map(&:topic)`
			`topic_lookup = TopicUser.lookup_for(user, topics)`
			`topics.each { \|ft\| ft.user_data = topic_lookup[ft.id] }`
			`end`
			`end`

FIX: Limit PG headline based search blurb generation to 200 characters. * Recovers omission characters '...' in blurb as well. 2020-08-12 15:33:26 +08:00			`OMISSION = '...'`
			`SCRUB_HEADLINE_REGEXP = /<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)<\/span>/`

FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`def blurb(post)`
PERF: Avoid parsing `Post#cooked` with Nokogiri for every search. 2020-07-17 16:27:30 +08:00			`opts = {`
			`term: @blurb_term,`
			`blurb_length: @blurb_length`
			`}`

FEATURE: Split up text segmentation for Chinese and Japanese. * Chinese segmentation will continue to rely on cppjieba * Japanese segmentation will use our port of TinySegmenter * Korean currently does not rely on segmentation which was dropped in c677877e4fe5381f613279901f36ae255c909573 * SiteSetting.search_tokenize_chinese_japanese_korean has been split into SiteSetting.search_tokenize_chinese and SiteSetting.search_tokenize_japanese respectively 2022-01-26 15:24:11 +08:00			`if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && !Search.segment_chinese? && !Search.segment_japanese?`
PERF: Use PG headlines for blurb generation and highlighting for search. 2020-08-06 14:15:31 +08:00			`if SiteSetting.use_pg_headlines_for_excerpt`
FIX: Limit PG headline based search blurb generation to 200 characters. * Recovers omission characters '...' in blurb as well. 2020-08-12 15:33:26 +08:00			`scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')`
			`prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? '' : OMISSION`
			`postfix_omission = scrubbed_headline.end_with?(post.trailing_raw_data) ? '' : OMISSION`
			`return "#{prefix_omission}#{post.headline}#{postfix_omission}"`
PERF: Use PG headlines for blurb generation and highlighting for search. 2020-08-06 14:15:31 +08:00			`else`
			`opts[:cooked] = post.post_search_data.raw_data`
			`opts[:scrub] = false`
			`end`
PERF: Avoid parsing `Post#cooked` with Nokogiri for every search. 2020-07-17 16:27:30 +08:00			`else`
			`opts[:cooked] = post.cooked`
			`end`

			`GroupedSearchResults.blurb_for(**opts)`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`end`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`def add(object)`
			`type = object.class.to_s.downcase.pluralize`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00
DEV: move send => public_send in lib folder This handles most of the cases in `lib` where we were using send instead of public_send 2019-05-07 10:22:37 +08:00			`if @type_filter.present? && public_send(type).length == Search.per_filter`
Add attribute to grouped search results for more available posts. 2017-07-21 00:07:13 +08:00			`@more_full_page_results = true`
DEV: move send => public_send in lib folder This handles most of the cases in `lib` where we were using send instead of public_send 2019-05-07 10:22:37 +08:00			`elsif !@type_filter.present? && public_send(type).length == Search.per_facet`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`instance_variable_set("@more_#{type}".to_sym, true)`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`else`
DEV: More `send` -> `public_send`. 2019-05-07 10:05:58 +08:00			`(self.public_send(type)) << object`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`
			`end`

PERF: Avoid parsing `Post#cooked` with Nokogiri for every search. 2020-07-17 16:27:30 +08:00			`def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)`
Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`blurb = nil`
PERF: Replace video and audio links in search blurb while indexing. In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb. 2020-08-06 12:25:03 +08:00
			`if scrub`
FEATURE: Use Postgres unaccent to ignore accents (#16100) The search_ignore_accents site setting can be used to make the search indexer remove the accents before indexing the content. The unaccent function from PostgreSQL is better than Ruby's unicode_normalize(:nfkd). 2022-03-08 05:03:10 +08:00			`cooked = SearchIndexer::HtmlScrubber.scrub(cooked)`
PERF: Replace video and audio links in search blurb while indexing. In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb. 2020-08-06 12:25:03 +08:00
			`urls = Set.new`
			`cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }`
			`urls.each do \|url\|`
			`begin`
			`case File.extname(URI(url).path \|\| "")`
			`when Oneboxer::VIDEO_REGEX`
			`cooked.gsub!(url, I18n.t("search.video"))`
			`when Oneboxer::AUDIO_REGEX`
			`cooked.gsub!(url, I18n.t("search.audio"))`
			`end`
			`rescue URI::InvalidURIError`
FIX: skip invalid URLs when checking for audio/video in search blurbs Fixes 500 errors on search queries introduced in 580a4a8 2019-11-06 23:32:15 +08:00			`end`
DEV: Refactor excluding audio/video URLs from search result blurbs Followup to 580a4a82 2019-10-31 21:13:24 +08:00			`end`
Exclude audio/video URLs from search result blurbs Displays translatable "[audio]" or "[video]" placeholders instead of ugly (and often long) URLs. 2019-10-31 01:07:16 +08:00			`end`

Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`if term`
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`if term =~ Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN)`
			`term = Regexp.last_match[1]`
FIX: Post blurb incorrect when search contains a phrase match. If the blurb generated is not around the search term, we will not be able to highlight it on the client side. 2019-03-26 17:01:19 +08:00			`end`

FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`blurb = TextHelper.excerpt(cooked, term,`
DEV: Cleanup misspelled TextHelper param 2019-10-31 21:32:42 +08:00			`radius: blurb_length / 2`
FIX: Post blurb incorrect when search contains a phrase match. If the blurb generated is not around the search term, we will not be able to highlight it on the client side. 2019-03-26 17:01:19 +08:00			`)`
Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`end`
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00
DEV: Cleanup misspelled TextHelper param 2019-10-31 21:32:42 +08:00			`blurb = TextHelper.truncate(cooked, length: blurb_length) if blurb.blank?`
Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`Sanitize.clean(blurb)`
			`end`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`

BUGFIX: search could break when expanding 2014-02-17 11:34:14 +08:00			`end`