# discourse/lib/search/grouped_search_results.rb

# frozen_string_literal: true

require 'sanitize'

class Search

  class GroupedSearchResults
    include ActiveModel::Serialization

    class TextHelper
      extend ActionView::Helpers::TextHelper

      private

      # TODO: Remove when https://github.com/rails/rails/pull/39979 is merged
      # For a 10_000 words string, speeds up excerpts by 85X.
      # Trims one side of an excerpt down to at most `radius` units.
      #
      # With a non-empty separator, `part` is split on it, empty pieces are
      # dropped, and the kept pieces are joined back with the separator. With
      # an empty separator, `part` itself is measured and sliced. The :first
      # part keeps its trailing units; any other position keeps its leading
      # units.
      #
      # Returns [affix, text]: affix is the omission marker when `part` held
      # more than `radius` units, "" otherwise. A nil `part` yields ["", ""].
      #
      # NOTE: this is declared with `def self.`, so the bare `private` above
      # does not change its visibility.
      def self.cut_excerpt_part(part_position, part, separator, options)
        return "", "" unless part

        radius = options.fetch(:radius, 100)
        omission = options.fetch(:omission, "...")
        tokenized = separator != ""

        part = part.split(separator).reject(&:empty?) if tokenized

        affix = part.length > radius ? omission : ""
        kept = part_position == :first ? part.last(radius) : part.first(radius)
        kept = kept.join(separator) if tokenized

        [affix, kept]
      end
    end

    # Parameters the result set was created with.
    attr_reader :type_filter, :term, :search_context

    # Result buckets; #add appends to the bucket named after the object's class.
    attr_reader :posts, :categories, :users, :tags, :groups

    # Overflow flags, set by #add when a bucket is already at its limit.
    attr_reader :more_posts, :more_categories, :more_users, :more_full_page_results

    # Error recorded for this search; nil until one is assigned.
    attr_reader :error

    attr_accessor :search_log_id

    # Blurb length used when the caller does not supply one.
    BLURB_LENGTH = 200

    # Creates an empty result set.
    #
    # type_filter    - stored as given; #add checks whether it is present to
    #                  choose which per-type limit applies.
    # term           - the search term.
    # search_context - stored as given and exposed through its reader.
    # blurb_length   - blurb length; falls back to BLURB_LENGTH when not given.
    # blurb_term     - term used when building blurbs; falls back to `term`.
    def initialize(type_filter:, term:, search_context:, blurb_length: nil, blurb_term: nil)
      @type_filter    = type_filter
      @term           = term
      @blurb_term     = blurb_term || term
      @search_context = search_context
      @blurb_length   = blurb_length || BLURB_LENGTH

      # Every bucket gets its own empty array.
      @posts, @categories, @users, @tags, @groups = Array.new(5) { [] }

      @error = nil
    end

    # Records the error for this search; readable through #error.
    def error=(value)
      @error = value
    end

    # Attaches per-user topic data to the topic of every collected post.
    #
    # Does nothing (and returns nil) when the guardian has no user. Otherwise
    # the data comes from TopicUser.lookup_for and is indexed by topic id; each
    # topic's `user_data` is set to its entry (nil when there is none).
    def find_user_data(guardian)
      user = guardian.user
      return unless user

      topics = @posts.map { |post| post.topic }
      user_data_by_topic_id = TopicUser.lookup_for(user, topics)

      topics.each do |topic|
        topic.user_data = user_data_by_topic_id[topic.id]
      end
    end

    # Marker #blurb puts on a headline edge that does not line up with the
    # post's leading/trailing raw data.
    OMISSION = '...'
    # Matches a <span> carrying the search highlight class (optionally with
    # other name="value" attributes before or after it) and captures its inner
    # text, so #blurb can strip the highlight markup from a headline.
    SCRUB_HEADLINE_REGEXP = /<span(?: \w+="[^"]+")* class="#{Search::HIGHLIGHT_CSS_CLASS}"(?: \w+="[^"]+")*>([^<]*)<\/span>/

    # Builds the excerpt shown for `post`.
    #
    # Three paths, chosen in this order:
    # 1. The post's search data is not newer than
    #    SearchIndexer::MIN_POST_REINDEX_VERSION, or Chinese/Japanese
    #    segmentation is on: excerpt the cooked HTML via .blurb_for.
    # 2. Headline excerpts are disabled in site settings: excerpt the indexed
    #    raw data via .blurb_for without scrubbing.
    # 3. Otherwise: return the post's headline, adding OMISSION on each side
    #    whose highlight-stripped text does not start/end with the post's
    #    leading/trailing raw data.
    def blurb(post)
      reindexed =
        post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION &&
        !Search.segment_chinese? &&
        !Search.segment_japanese?

      unless reindexed
        return GroupedSearchResults.blurb_for(
          term: @blurb_term,
          blurb_length: @blurb_length,
          cooked: post.cooked
        )
      end

      unless SiteSetting.use_pg_headlines_for_excerpt
        return GroupedSearchResults.blurb_for(
          term: @blurb_term,
          blurb_length: @blurb_length,
          cooked: post.post_search_data.raw_data,
          scrub: false
        )
      end

      headline = post.headline
      plain_headline = headline.gsub(SCRUB_HEADLINE_REGEXP, '\1')
      leading = plain_headline.start_with?(post.leading_raw_data) ? '' : OMISSION
      trailing = plain_headline.end_with?(post.trailing_raw_data) ? '' : OMISSION

      "#{leading}#{headline}#{trailing}"
    end

    # Files `object` into the bucket named after its class (class name
    # downcased and pluralized).
    #
    # The bucket limit is Search.per_filter when a type filter is present and
    # Search.per_facet otherwise. If the bucket already holds exactly that many
    # entries, the object is dropped and an overflow flag is set instead:
    # @more_full_page_results with a type filter, @more_<type> without one.
    def add(object)
      type = object.class.to_s.downcase.pluralize
      bucket = public_send(type)
      filtered = @type_filter.present?
      limit = filtered ? Search.per_filter : Search.per_facet

      return bucket << object unless bucket.length == limit

      if filtered
        @more_full_page_results = true
      else
        instance_variable_set(:"@more_#{type}", true)
      end
    end

    # Produces a sanitized text blurb from `cooked`.
    #
    # cooked       - source text; passed through SearchIndexer::HtmlScrubber
    #                first when `scrub` is true.
    # term         - optional search term. If it matches
    #                Search::PHRASE_MATCH_REGEXP_PATTERN, the first capture
    #                group is used instead. The blurb is an excerpt around it.
    # blurb_length - truncation length; half of it is the excerpt radius.
    # scrub        - when true, also replaces URLs whose path extension matches
    #                Oneboxer::VIDEO_REGEX / AUDIO_REGEX with a translated
    #                placeholder.
    #
    # If there is no term, or the excerpt is blank, the text is truncated to
    # `blurb_length` instead. The result is passed through Sanitize.clean.
    def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
      if scrub
        cooked = SearchIndexer::HtmlScrubber.scrub(cooked)

        # Collect whole matches; the block form is needed because the pattern
        # may contain capture groups.
        found_urls = []
        cooked.scan(Discourse::Utils::URI_REGEXP) { found_urls << Regexp.last_match(0) }

        found_urls.uniq.each do |url|
          begin
            placeholder_key =
              case File.extname(URI(url).path || "")
              when Oneboxer::VIDEO_REGEX then "search.video"
              when Oneboxer::AUDIO_REGEX then "search.audio"
              end

            cooked.gsub!(url, I18n.t(placeholder_key)) if placeholder_key
          rescue URI::InvalidURIError
            # Unparseable URLs are left in the text as they are.
          end
        end
      end

      blurb =
        if term
          phrase_pattern = Regexp.new(Search::PHRASE_MATCH_REGEXP_PATTERN)
          term = Regexp.last_match(1) if term =~ phrase_pattern

          TextHelper.excerpt(cooked, term, radius: blurb_length / 2)
        end

      blurb = TextHelper.truncate(cooked, length: blurb_length) if blurb.blank?
      Sanitize.clean(blurb)
    end
  end

end