discourse/lib/search/grouped_search_results.rb

# frozen_string_literal: true

require "sanitize"

class Search
  class GroupedSearchResults
    include ActiveModel::Serialization

    # Holder that extends ActionView's TextHelper so its helpers can be called
    # as class-level methods; `blurb_for` below uses `TextHelper.excerpt` and
    # `TextHelper.truncate`.
    class TextHelper
      extend ActionView::Helpers::TextHelper
    end

    # Inputs the result set was created with.
    attr_reader :type_filter, :term, :search_context, :use_pg_headlines_for_excerpt

    # Result collections; #add appends to the one named after the object's class.
    attr_reader :posts, :categories, :users, :tags, :groups

    # Overflow flags set by #add once a collection has reached its limit.
    # They are not initialised by the constructor, so they read as nil until set.
    attr_reader :more_posts, :more_categories, :more_users, :more_full_page_results

    # Error attached to the result set, nil unless assigned through #error=.
    attr_reader :error

    attr_accessor :search_log_id

    # Blurb length used when the caller does not supply one.
    BLURB_LENGTH = 200

    # Creates an empty result set.
    #
    # @param type_filter stored as-is, exposed through #type_filter
    # @param term stored as-is, exposed through #term
    # @param search_context stored as-is, exposed through #search_context
    # @param blurb_length blurb length handed to blurb generation; BLURB_LENGTH when omitted
    # @param blurb_term term handed to blurb generation; +term+ when omitted
    # @param is_header_search selects which limit #add enforces
    #   (Search.per_facet when true, Search.per_filter otherwise)
    # @param use_pg_headlines_for_excerpt whether #blurb may return the post's
    #   headline directly; defaults to the site setting of the same name
    def initialize(
      type_filter:,
      term:,
      search_context:,
      blurb_length: nil,
      blurb_term: nil,
      is_header_search: false,
      use_pg_headlines_for_excerpt: SiteSetting.use_pg_headlines_for_excerpt
    )
      # Values taken over unchanged from the caller.
      @type_filter = type_filter
      @term = term
      @search_context = search_context
      @is_header_search = is_header_search
      @use_pg_headlines_for_excerpt = use_pg_headlines_for_excerpt

      # Blurb settings fall back to the search term / default length.
      @blurb_term = blurb_term || term
      @blurb_length = blurb_length || BLURB_LENGTH

      # Every collection starts out empty and there is no error yet.
      @posts = []
      @categories = []
      @users = []
      @tags = []
      @groups = []
      @error = nil
    end

    # Attaches an error to this result set; it is read back through #error.
    def error=(value)
      @error = value
    end

    # Attaches the guardian user's TopicUser data to the topic of every post
    # in the result set. Does nothing (and returns nil) when the guardian has
    # no user.
    def find_user_data(guardian)
      current_user = guardian.user
      return unless current_user

      result_topics = @posts.map { |post| post.topic }
      # The lookup result is indexed by topic id below.
      user_data_by_topic_id = TopicUser.lookup_for(current_user, result_topics)

      result_topics.each do |topic|
        topic.user_data = user_data_by_topic_id[topic.id]
      end
    end

    # Marker that #blurb puts before/after a headline when the headline
    # (with highlight markup removed) does not start/end with the post's
    # leading/trailing raw data.
    OMISSION = "..."
    # Matches a <span> whose class attribute is the search highlight CSS class,
    # optionally with further name="value" attributes before or after it, and
    # captures the span's text (anything up to the next "<"). #blurb replaces
    # each match with that capture to strip the highlight markup.
    SCRUB_HEADLINE_REGEXP =
      %r{<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)</span>}

    # Returns the blurb to display for +post+.
    #
    # * Search data older than SearchIndexer::MIN_POST_BLURB_INDEX_VERSION, or
    #   Chinese/Japanese segmentation being active: the blurb is generated
    #   from the post's cooked content.
    # * Otherwise, with PG headlines disabled: the blurb is generated from the
    #   indexed raw data, without scrubbing.
    # * Otherwise: the post's headline is returned as-is, wrapped in OMISSION
    #   markers on whichever side it does not reach the leading/trailing raw data.
    def blurb(post)
      shared_opts = { term: @blurb_term, blurb_length: @blurb_length }

      indexed_blurb_usable =
        post.post_search_data.version >= SearchIndexer::MIN_POST_BLURB_INDEX_VERSION &&
          !Search.segment_chinese? && !Search.segment_japanese?

      unless indexed_blurb_usable
        return GroupedSearchResults.blurb_for(**shared_opts, cooked: post.cooked)
      end

      unless use_pg_headlines_for_excerpt
        raw_text = post.post_search_data.raw_data
        return GroupedSearchResults.blurb_for(**shared_opts, cooked: raw_text, scrub: false)
      end

      # Compare against the headline with its highlight spans removed; the
      # returned blurb keeps the original, highlighted headline.
      plain_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
      leading = plain_headline.start_with?(post.leading_raw_data) ? "" : OMISSION
      trailing = plain_headline.end_with?(post.trailing_raw_data) ? "" : OMISSION

      "#{leading}#{post.headline}#{trailing}"
    end

    # Adds +object+ to the collection named after its class (class name,
    # downcased and pluralized), unless that collection is already at its limit.
    #
    # At the limit the object is dropped and an overflow flag is raised instead:
    # the collection's own @more_<collection> flag for header searches
    # (limit Search.per_facet), or @more_full_page_results otherwise
    # (limit Search.per_filter).
    def add(object)
      collection_name = object.class.to_s.downcase.pluralize
      collection = public_send(collection_name)

      if @is_header_search
        if collection.length == Search.per_facet
          instance_variable_set(:"@more_#{collection_name}", true)
        else
          collection << object
        end
      elsif collection.length == Search.per_filter
        @more_full_page_results = true
      else
        collection << object
      end
    end

    # Builds a sanitized blurb from a piece of post text.
    #
    # @param cooked text the blurb is built from
    # @param term search term to build the excerpt around; when it matches
    #   Search::PHRASE_MATCH_REGEXP_PATTERN its first capture is used instead
    # @param blurb_length length passed to truncate; half of it is the excerpt radius
    # @param scrub when true, the text is passed through the HTML scrubber and
    #   URLs whose extension matches Oneboxer::VIDEO_REGEX / AUDIO_REGEX are
    #   replaced by the translated "search.video" / "search.audio" placeholders
    # @return the result of Sanitize.clean on the excerpt or truncated text
    def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
      text = cooked

      if scrub
        text = SearchIndexer::HtmlScrubber.scrub(text)

        # Gather the URLs before rewriting, since the text is mutated in place.
        found_urls = []
        text.scan(Discourse::Utils::URI_REGEXP) { found_urls << Regexp.last_match(0) }

        found_urls.uniq.each do |url|
          begin
            placeholder =
              case File.extname(URI(url).path || "")
              when Oneboxer::VIDEO_REGEX
                I18n.t("search.video")
              when Oneboxer::AUDIO_REGEX
                I18n.t("search.audio")
              end
            text.gsub!(url, placeholder) if placeholder
          rescue URI::InvalidURIError
            # URLs that cannot be parsed are left in the text unchanged.
          end
        end
      end

      excerpt = nil
      if term
        phrase = term.match(Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN))
        needle = phrase ? phrase[1] : term
        excerpt = TextHelper.excerpt(text, needle, radius: blurb_length / 2)
      end

      # Without a usable excerpt, fall back to the truncated text.
      excerpt = TextHelper.truncate(text, length: blurb_length) if excerpt.blank?
      Sanitize.clean(excerpt)
    end
  end
end
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-03 06:17:27 +08:00			`# frozen_string_literal: true`

FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`require "sanitize"`

Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`class Search`
			`class GroupedSearchResults`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`include ActiveModel::Serialization`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`class TextHelper`
			`extend ActionView::Helpers::TextHelper`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`

Include the `search_log_id` in search results 2017-07-17 23:57:13 +08:00			`attr_reader(`
			`:type_filter,`
			`:posts,`
			`:categories,`
			`:users,`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`:tags,`
FEATURE: displays groups in menu search (#7090) 2019-03-04 17:30:09 +08:00			`:groups,`
Include the `search_log_id` in search results 2017-07-17 23:57:13 +08:00			`:more_posts,`
			`:more_categories,`
			`:more_users,`
			`:term,`
			`:search_context,`
FEATURE: when under extreme load disable search The global setting disable_search_queue_threshold (DISCOURSE_DISABLE_SEARCH_QUEUE_THRESHOLD) which default to 1 second was added. This protection ensures that when the application is unable to keep up with requests it will simply turn off search till it is not backed up. To disable this protection set this to 0. 2019-07-02 09:21:52 +08:00			`:more_full_page_results,`
			`:error,`
DEV: Make search results blurb non-pg headlines setting dependent (#20939) Followup to #20915. If we're grouping search results that don't rely on core's search, we won't have access to pg headlines. This is now configurable via the constructor, defaulting to `SiteSetting.use_pg_headlines_for_excerpt` 2023-04-03 22:09:36 +08:00			`:use_pg_headlines_for_excerpt,`
Include the `search_log_id` in search results 2017-07-17 23:57:13 +08:00			`)`

			`attr_accessor :search_log_id`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`BLURB_LENGTH = 200`

FIX: Limits for PM and group header search (#16887) When searching for PMs or PMs in a group inbox, results in the header search were not being limited to 5 with a "More" link to the full page search. This PR fixes that. It also simplifies the logic and updates the search API docs to include recently added `in:messages` and `group_messages:groupname` options. 2022-05-24 23:31:24 +08:00			`def initialize(`
			`type_filter:,`
			`term:,`
			`search_context:,`
			`blurb_length: nil,`
			`blurb_term: nil,`
DEV: Make search results blurb non-pg headlines setting dependent (#20939) Followup to #20915. If we're grouping search results that don't rely on core's search, we won't have access to pg headlines. This is now configurable via the constructor, defaulting to `SiteSetting.use_pg_headlines_for_excerpt` 2023-04-03 22:09:36 +08:00			`is_header_search: false,`
			`use_pg_headlines_for_excerpt: SiteSetting.use_pg_headlines_for_excerpt`
FIX: Limits for PM and group header search (#16887) When searching for PMs or PMs in a group inbox, results in the header search were not being limited to 5 with a "More" link to the full page search. This PR fixes that. It also simplifies the logic and updates the search API docs to include recently added `in:messages` and `group_messages:groupname` options. 2022-05-24 23:31:24 +08:00			`)`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`@type_filter = type_filter`
			`@term = term`
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`@blurb_term = blurb_term \|\| term`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`@search_context = search_context`
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`@blurb_length = blurb_length \|\| BLURB_LENGTH`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`@posts = []`
			`@categories = []`
			`@users = []`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`@tags = []`
FEATURE: displays groups in menu search (#7090) 2019-03-04 17:30:09 +08:00			`@groups = []`
FEATURE: when under extreme load disable search The global setting disable_search_queue_threshold (DISCOURSE_DISABLE_SEARCH_QUEUE_THRESHOLD) which default to 1 second was added. This protection ensures that when the application is unable to keep up with requests it will simply turn off search till it is not backed up. To disable this protection set this to 0. 2019-07-02 09:21:52 +08:00			`@error = nil`
FIX: Limits for PM and group header search (#16887) When searching for PMs or PMs in a group inbox, results in the header search were not being limited to 5 with a "More" link to the full page search. This PR fixes that. It also simplifies the logic and updates the search API docs to include recently added `in:messages` and `group_messages:groupname` options. 2022-05-24 23:31:24 +08:00			`@is_header_search = is_header_search`
DEV: Make search results blurb non-pg headlines setting dependent (#20939) Followup to #20915. If we're grouping search results that don't rely on core's search, we won't have access to pg headlines. This is now configurable via the constructor, defaulting to `SiteSetting.use_pg_headlines_for_excerpt` 2023-04-03 22:09:36 +08:00			`@use_pg_headlines_for_excerpt = use_pg_headlines_for_excerpt`
FEATURE: when under extreme load disable search The global setting disable_search_queue_threshold (DISCOURSE_DISABLE_SEARCH_QUEUE_THRESHOLD) which default to 1 second was added. This protection ensures that when the application is unable to keep up with requests it will simply turn off search till it is not backed up. To disable this protection set this to 0. 2019-07-02 09:21:52 +08:00			`end`

			`def error=(error)`
			`@error = error`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`

FIX: include pinned status in search results 2016-03-18 13:26:20 +08:00			`def find_user_data(guardian)`
			`if user = guardian.user`
			`topics = @posts.map(&:topic)`
			`topic_lookup = TopicUser.lookup_for(user, topics)`
			`topics.each { \|ft\| ft.user_data = topic_lookup[ft.id] }`
			`end`
			`end`

FIX: Limit PG headline based search blurb generation to 200 characters. * Recovers omission characters '...' in blurb as well. 2020-08-12 15:33:26 +08:00			`OMISSION = "..."`
			`SCRUB_HEADLINE_REGEXP =`
			`%r{<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)</span>}`

FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`def blurb(post)`
PERF: Avoid parsing `Post#cooked` with Nokogiri for every search. 2020-07-17 16:27:30 +08:00			`opts = { term: @blurb_term, blurb_length: @blurb_length }`

FEATURE: Roll out new search optimisations (#20364) - Reduce duplication of terms in post index from unlimited to 6. This will result in reduced index size and reduced weighting for posts containing a huge amount of duplicate terms. (Eg: a post containing "sam sam sam sam sam sam sam sam", will index as "sam sam sam sam sam sam", only including the word up to 6 times.) This corrects a flaw where title weighting could be ignored. - Prioritize exact matches of words in titles. Our search always performs a prefix match. However we want to give special weight to exact title matches meaning that a search for "sum" will find topics such as "the sum of us" vs "summer in spring". - Pick up fixes to our search algorithm which are missing from old indexes. Specifically pick up the fix that indexes URLs properly. (`https://happy.com` was stemmed to `happi` in keywords and then was not searchable) see also: https://meta.discourse.org/t/refinements-to-search-being-tested-on-meta/254158 Indexing will take a while and work in batches, in the background. 2023-02-20 08:53:35 +08:00			`if post.post_search_data.version >= SearchIndexer::MIN_POST_BLURB_INDEX_VERSION &&`
FEATURE: Split up text segmentation for Chinese and Japanese. * Chinese segmentation will continue to rely on cppjieba * Japanese segmentation will use our port of TinySegmenter * Korean currently does not rely on segmentation which was dropped in c677877e4fe5381f613279901f36ae255c909573 * SiteSetting.search_tokenize_chinese_japanese_korean has been split into SiteSetting.search_tokenize_chinese and SiteSetting.search_tokenize_japanese respectively 2022-01-26 15:24:11 +08:00			`!Search.segment_chinese? && !Search.segment_japanese?`
DEV: Make search results blurb non-pg headlines setting dependent (#20939) Followup to #20915. If we're grouping search results that don't rely on core's search, we won't have access to pg headlines. This is now configurable via the constructor, defaulting to `SiteSetting.use_pg_headlines_for_excerpt` 2023-04-03 22:09:36 +08:00			`if use_pg_headlines_for_excerpt`
FIX: Limit PG headline based search blurb generation to 200 characters. * Recovers omission characters '...' in blurb as well. 2020-08-12 15:33:26 +08:00			`scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')`
			`prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? "" : OMISSION`
			`postfix_omission = scrubbed_headline.end_with?(post.trailing_raw_data) ? "" : OMISSION`
			`return "#{prefix_omission}#{post.headline}#{postfix_omission}"`
PERF: Use PG headlines for blurb generation and highlighting for search. 2020-08-06 14:15:31 +08:00			`else`
			`opts[:cooked] = post.post_search_data.raw_data`
			`opts[:scrub] = false`
			`end`
PERF: Avoid parsing `Post#cooked` with Nokogiri for every search. 2020-07-17 16:27:30 +08:00			`else`
			`opts[:cooked] = post.cooked`
			`end`

			`GroupedSearchResults.blurb_for(**opts)`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`end`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`def add(object)`
			`type = object.class.to_s.downcase.pluralize`
FIX: Limits for PM and group header search (#16887) When searching for PMs or PMs in a group inbox, results in the header search were not being limited to 5 with a "More" link to the full page search. This PR fixes that. It also simplifies the logic and updates the search API docs to include recently added `in:messages` and `group_messages:groupname` options. 2022-05-24 23:31:24 +08:00			`if !@is_header_search && public_send(type).length == Search.per_filter`
Add attribute to grouped search results for more available posts. 2017-07-21 00:07:13 +08:00			`@more_full_page_results = true`
FIX: Limits for PM and group header search (#16887) When searching for PMs or PMs in a group inbox, results in the header search were not being limited to 5 with a "More" link to the full page search. This PR fixes that. It also simplifies the logic and updates the search API docs to include recently added `in:messages` and `group_messages:groupname` options. 2022-05-24 23:31:24 +08:00			`elsif @is_header_search && public_send(type).length == Search.per_facet`
FEATURE: increase search expansion to 50 results refactor search code to deal with proper objects use proper serializers, test the controllers 2014-09-02 17:15:08 +08:00			`instance_variable_set("@more_#{type}".to_sym, true)`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`else`
DEV: More `send` -> `public_send`. 2019-05-07 10:05:58 +08:00			`(self.public_send(type)) << object`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`
			`end`

PERF: Avoid parsing `Post#cooked` with Nokogiri for every search. 2020-07-17 16:27:30 +08:00			`def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)`
Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`blurb = nil`
PERF: Replace video and audio links in search blurb while indexing. In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb. 2020-08-06 12:25:03 +08:00
			`if scrub`
FEATURE: Use Postgres unaccent to ignore accents (#16100) The search_ignore_accents site setting can be used to make the search indexer remove the accents before indexing the content. The unaccent function from PostgreSQL is better than Ruby's unicode_normalize(:nfkd). 2022-03-08 05:03:10 +08:00			`cooked = SearchIndexer::HtmlScrubber.scrub(cooked)`
PERF: Replace video and audio links in search blurb while indexing. In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb. 2020-08-06 12:25:03 +08:00
			`urls = Set.new`
			`cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }`
			`urls.each do \|url\|`
			`begin`
			`case File.extname(URI(url).path \|\| "")`
			`when Oneboxer::VIDEO_REGEX`
			`cooked.gsub!(url, I18n.t("search.video"))`
			`when Oneboxer::AUDIO_REGEX`
			`cooked.gsub!(url, I18n.t("search.audio"))`
			`end`
			`rescue URI::InvalidURIError`
FIX: skip invalid URLs when checking for audio/video in search blurbs Fixes 500 errors on search queries introduced in 580a4a8 2019-11-06 23:32:15 +08:00			`end`
DEV: Refactor excluding audio/video URLs from search result blurbs Followup to 580a4a82 2019-10-31 21:13:24 +08:00			`end`
Exclude audio/video URLs from search result blurbs Displays translatable "[audio]" or "[video]" placeholders instead of ugly (and often long) URLs. 2019-10-31 01:07:16 +08:00			`end`

Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`if term`
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`term = Regexp.last_match[1] if term =~ Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN)`
FIX: Post blurb incorrect when search contains a phrase match. If the blurb generated is not around the search term, we will not be able to highlight it on the client side. 2019-03-26 17:01:19 +08:00
FIX: Incorrect search blurb when advanced search filters are used take2 Also remove include_blurbs attribute which isn't used. 2020-07-14 11:05:57 +08:00			`blurb = TextHelper.excerpt(cooked, term, radius: blurb_length / 2)`
Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`end`
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00
DEV: Cleanup misspelled TextHelper param 2019-10-31 21:32:42 +08:00			`blurb = TextHelper.truncate(cooked, length: blurb_length) if blurb.blank?`
Use the same component for similar topics as search results. 2015-06-25 03:08:22 +08:00			`Sanitize.clean(blurb)`
			`end`
Search code now uses ActiveRecord instead of SQL. 2013-05-24 02:26:51 +08:00			`end`
BUGFIX: search could break when expanding 2014-02-17 11:34:14 +08:00			`end`