discourse/app/services/search_indexer.rb

# frozen_string_literal: true
require_dependency 'search'

class SearchIndexer

  # Switches indexing off by setting the class-level @disabled flag.
  # SearchIndexer.index and SearchIndexer.queue_post_reindex return early
  # while the flag is set.
  def self.disable
    @disabled = true
  end

  # Switches indexing back on by clearing the class-level @disabled flag.
  def self.enable
    @disabled = false
  end

  # Reduces cooked HTML to the text that should be searchable.
  # Delegates to HtmlScrubber.scrub and returns its result.
  def self.scrub_html_for_search(html)
    HtmlScrubber.scrub(html)
  end

  # Makes the parts of dotted tokens searchable on their own.
  #
  # Every whitespace-delimited token containing a dot is followed by the
  # pieces found after its first dot, so "I.am.a.word" becomes
  # "I.am.a.word am a word". Tokens with nothing after the first dot
  # (e.g. "foo." or "...") are left as they are.
  #
  # raw - String to expand.
  #
  # Returns a new String; raw itself is not modified.
  def self.inject_extra_terms(raw)
    raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |token|
      _head, *tail = token.split(".")
      next token if tail.empty?

      "#{token} #{tail.join(' ')}"
    end
  end

  # Writes (or refreshes) the search row for a single record.
  #
  # table    - base table name; the row lives in "#{table}_search_data" and is
  #            keyed by "#{table}_id".
  # id       - id of the record being indexed.
  # raw_data - Array of strings (nil entries are treated as ""). The first
  #            four entries become the tsvector weights A, B, C and D in that
  #            order; every non-empty entry is also stored, space-joined, in
  #            the raw_data column.
  #
  # Indexing is best effort: any StandardError is logged and swallowed so a
  # failure here never propagates to the caller.
  #
  # Returns nil when an existing row was updated or an error was swallowed,
  # otherwise whatever Post.exec_sql returns for the INSERT.
  def self.update_index(table:, id:, raw_data:)
    search_data = raw_data.map do |data|
      inject_extra_terms(Search.prepare_data(data || "", :index))
    end

    table_name = "#{table}_search_data"
    foreign_key = "#{table}_id"

    # for user login and name use "simple" lowercase stemmer
    stemmer = table == "user" ? "simple" : Search.ts_config

    ranked_index = <<~SQL
      setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
      setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
      setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
      setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
    SQL

    indexed_data = search_data.reject(&:empty?).join(' ')

    params = {
      a: search_data[0],
      b: search_data[1],
      c: search_data[2],
      d: search_data[3],
      raw_data: indexed_data,
      id: id,
      locale: SiteSetting.default_locale,
      version: Search::INDEX_VERSION
    }

    # Would be nice to use AR here but not sure how to execute Postgres functions
    # when inserting data like this.
    rows = Post.exec_sql_row_count(<<~SQL, params)
       UPDATE #{table_name}
       SET
          raw_data = :raw_data,
          locale = :locale,
          search_data = #{ranked_index},
          version = :version
       WHERE #{foreign_key} = :id
    SQL

    # No existing row was touched, so create one.
    # NOTE(review): UPDATE followed by INSERT is not atomic; two concurrent
    # writers for the same id can both reach the INSERT.
    if rows == 0
      Post.exec_sql(<<~SQL, params)
        INSERT INTO #{table_name}
        (#{foreign_key}, search_data, locale, raw_data, version)
        VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
      SQL
    end
  rescue => e
    # TODO is there any way we can safely avoid this?
    # best way is probably pushing search indexer into a dedicated process so it no longer happens on save
    # instead in the post processor
    #
    # Still best effort, but leave a trace instead of failing silently.
    Rails.logger.warn("Search indexing failed for #{table} id #{id}: #{e.class}: #{e.message}")
    nil
  end

  # Indexes a topic from its title and the cooked HTML of its first post.
  #
  # The cooked HTML is scrubbed to text and cut to
  # Topic::MAX_SIMILAR_BODY_LENGTH characters before it is indexed.
  def self.update_topics_index(topic_id, title, cooked)
    body_text = scrub_html_for_search(cooked)
    truncated_body = body_text[0...Topic::MAX_SIMILAR_BODY_LENGTH]

    # a bit inconsistent that we use title as A and body as B when in
    # the post index body is C
    update_index(table: 'topic', id: topic_id, raw_data: [title, truncated_body])
  end

  # Indexes a post. Weights run from strongest to weakest in this order:
  # topic title (A), category name (B), tag names (C), scrubbed post body (D).
  def self.update_posts_index(post_id, title, category, tags, cooked)
    body_text = scrub_html_for_search(cooked)
    update_index(table: 'post', id: post_id, raw_data: [title, category, tags, body_text])
  end

  # Indexes a user: username is the first (weight A) entry and name the
  # second (weight B). update_index picks the "simple" stemmer for the
  # 'user' table.
  def self.update_users_index(user_id, username, name)
    update_index(table: 'user', id: user_id, raw_data: [username, name])
  end

  # Indexes a category by its name (the only, weight A, entry).
  def self.update_categories_index(category_id, name)
    update_index(table: 'category', id: category_id, raw_data: [name])
  end

  # Indexes a tag by its name (the only, weight A, entry).
  def self.update_tags_index(tag_id, name)
    update_index(table: 'tag', id: tag_id, raw_data: [name])
  end

  # Resets the index version to 0 on the search rows of every post in the
  # given topic. Does nothing while indexing is disabled.
  # NOTE(review): presumably a version of 0 makes a re-indexing job pick the
  # rows up again; that job is not visible from here.
  def self.queue_post_reindex(topic_id)
    return if @disabled

    reset_version_sql = <<~SQL
      UPDATE post_search_data
      SET version = 0
      WHERE post_id IN (SELECT id FROM posts WHERE topic_id = :topic_id)
    SQL

    ActiveRecord::Base.exec_sql(reset_version_sql, topic_id: topic_id)
  end

  # Re-indexes obj for search when one of its searchable attributes changed
  # in the last save.
  #
  # obj   - a Post, Topic, User, Category or Tag; any other object is ignored.
  # force - when true, re-index even if no relevant attribute changed.
  #
  # What triggers a re-index:
  #   Post     - cooked changed; the topic row is refreshed too when this is
  #              the first post. Posts without a topic are logged and skipped.
  #   Topic    - title changed; re-indexes the topic and its first post.
  #   User     - username or name changed.
  #   Category - name changed.
  #   Tag      - name changed.
  #
  # Does nothing while indexing is disabled. The return value is not
  # meaningful.
  def self.index(obj, force: false)
    return if @disabled

    # Resolved lazily: the category and tag lookups are only worth running
    # once we know a post/topic row is really going to be rewritten.
    topic_terms = lambda do |topic|
      [topic.category&.name, topic.tags.pluck(:name).join(' ')]
    end

    if Post === obj && (obj.saved_change_to_cooked? || force)
      topic = obj.topic
      if topic
        category_name, tag_names = topic_terms.call(topic)
        SearchIndexer.update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)
        SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
      else
        Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
      end
    end

    if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
      SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
    end

    if Topic === obj && (obj.saved_change_to_title? || force)
      # The title is part of the first post's index row as well, so both
      # rows are refreshed together.
      first_post = obj.posts&.find_by(post_number: 1)
      if first_post
        category_name, tag_names = topic_terms.call(obj)
        SearchIndexer.update_posts_index(first_post.id, obj.title, category_name, tag_names, first_post.cooked)
        SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
      end
    end

    if Category === obj && (obj.saved_change_to_name? || force)
      SearchIndexer.update_categories_index(obj.id, obj.name)
    end

    if Tag === obj && (obj.saved_change_to_name? || force)
      SearchIndexer.update_tags_index(obj.id, obj.name)
    end
  end

  # SAX handler that collects the searchable text of an HTML fragment:
  # every text node plus the values of "alt" and "title" attributes, each
  # padded with a space on both sides so neighbouring pieces never run
  # together.
  class HtmlScrubber < Nokogiri::XML::SAX::Document
    # Attributes whose values are indexed alongside the text nodes.
    TEXT_ATTRIBUTES = %w[alt title].freeze

    # The text accumulated so far (a mutable String).
    attr_reader :scrubbed

    def initialize
      @scrubbed = +""
    end

    # Runs html through Nokogiri's HTML SAX parser and returns the collected
    # text. A nil html yields an empty string.
    #
    # The markup is parsed exactly as given. An earlier version also built a
    # "<div>"-wrapped copy of it but never handed that copy to the parser;
    # that dead code is gone and the parsed input is unchanged.
    def self.scrub(html)
      me = new
      Nokogiri::HTML::SAX::Parser.new(me).parse(html) unless html.nil?
      me.scrubbed
    end

    # SAX callback for an opening tag: appends the element's alt and title
    # attribute values (in that order) when present.
    def start_element(name, attributes = [])
      # Flattening first accepts both a list of [name, value] pairs and a
      # flat name/value list.
      attributes = Hash[*attributes.flatten]
      TEXT_ATTRIBUTES.each do |key|
        value = attributes[key]
        next unless value

        scrubbed << " " << value << " "
      end
    end

    # SAX callback for a text node: appends the text.
    def characters(string)
      scrubbed << " " << string << " "
    end
  end
end
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`# frozen_string_literal: true`
Fix locale dependend stemmer for FTS Fix locale dependend stemmer for FTS to improve search relevance on non English languages. 2013-07-23 07:07:59 +08:00			`require_dependency 'search'`

Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`class SearchIndexer`

			`def self.disable`
			`@disabled = true`
			`end`

			`def self.enable`
			`@disabled = false`
			`end`
Initial release of Discourse 2013-02-06 03:16:51 +08:00
			`def self.scrub_html_for_search(html)`
			`HtmlScrubber.scrub(html)`
			`end`

FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`def self.inject_extra_terms(raw)`
shuffle code around so excerpt is not messed up 2016-07-25 15:12:01 +08:00			`# insert some extra words for I.am.a.word so "word" is tokenized`
FIX: not splitting words correctly for search tokenizer 2017-08-15 04:19:28 +08:00			`# I.am.a.word becomes I.am.a.word am a word`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do \|with_dot\|`
shuffle code around so excerpt is not messed up 2016-07-25 15:12:01 +08:00			`split = with_dot.split(".")`
			`if split.length > 1`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`with_dot + ((+" ") << split[1..-1].join(" "))`
shuffle code around so excerpt is not messed up 2016-07-25 15:12:01 +08:00			`else`
			`with_dot`
			`end`
			`end`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`end`

			`def self.update_index(table: , id: , raw_data:)`
			`search_data = raw_data.map do \|data\|`
			`inject_extra_terms(Search.prepare_data(data \|\| "", :index))`
			`end`

			`table_name = "#{table}_search_data"`
			`foreign_key = "#{table}_id"`
shuffle code around so excerpt is not messed up 2016-07-25 15:12:01 +08:00
Fix locale dependend stemmer for FTS Fix locale dependend stemmer for FTS to improve search relevance on non English languages. 2013-07-23 07:07:59 +08:00			`# for user login and name use "simple" lowercase stemmer`
Replace rmmseg gem for cppjieba_rb since better dictionary (#5006) * Rename locale to ts config in search module to make it clear * Replace rmmese-cpp for cppjieba_rb 2017-08-01 03:28:48 +08:00			`stemmer = table == "user" ? "simple" : Search.ts_config`
Fix locale dependend stemmer for FTS Fix locale dependend stemmer for FTS to improve search relevance on non English languages. 2013-07-23 07:07:59 +08:00
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`ranked_index = <<~SQL`
			`setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') \|\|`
			`setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') \|\|`
			`setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') \|\|`
			`setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')`
			`SQL`

			`indexed_data = search_data.select { \|d\| d.length > 0 }.join(' ')`

			`params = {`
			`a: search_data[0],`
			`b: search_data[1],`
			`c: search_data[2],`
			`d: search_data[3],`
			`raw_data: indexed_data,`
			`id: id,`
			`locale: SiteSetting.default_locale,`
			`version: Search::INDEX_VERSION`
			`}`

Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-23 03:33:33 +08:00			`# Would be nice to use AR here but not sure how to execut Postgres functions`
			`# when inserting data like this.`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`rows = Post.exec_sql_row_count(<<~SQL, params)`
			`UPDATE #{table_name}`
			`SET`
			`raw_data = :raw_data,`
			`locale = :locale,`
			`search_data = #{ranked_index},`
			`version = :version`
			`WHERE #{foreign_key} = :id`
			`SQL`

Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-23 03:33:33 +08:00			`if rows == 0`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`Post.exec_sql(<<~SQL, params)`
			`INSERT INTO #{table_name}`
			`(#{foreign_key}, search_data, locale, raw_data, version)`
			`VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)`
			`SQL`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`
Search Refactor: Remove some manual SQL, make search data tables more idomatic Rails/AR 2013-05-23 03:33:33 +08:00			`rescue`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`# TODO is there any way we can safely avoid this?`
			`# best way is probably pushing search indexer into a dedicated process so it no longer happens on save`
			`# instead in the post processor`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`

PERF: new table used for title similarity search 2014-08-08 13:50:26 +08:00			`def self.update_topics_index(topic_id, title, cooked)`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]`

			`# a bit inconsistent that we use title as A and body as B when in`
			`# the post index body is C`
			`update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])`
PERF: new table used for title similarity search 2014-08-08 13:50:26 +08:00			`end`

FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`def self.update_posts_index(post_id, title, category, tags, cooked)`
			`update_index(table: 'post', id: post_id, raw_data: [title, category, tags, scrub_html_for_search(cooked)])`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`

			`def self.update_users_index(user_id, username, name)`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`update_index(table: 'user', id: user_id, raw_data: [username, name])`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`
Fix all the trailing whitespace 2013-02-07 23:45:24 +08:00
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`def self.update_categories_index(category_id, name)`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`update_index(table: 'category', id: category_id, raw_data: [name])`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`

FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`def self.update_tags_index(tag_id, name)`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`update_index(table: 'tag', id: tag_id, raw_data: [name])`
			`end`

			`def self.queue_post_reindex(topic_id)`
			`return if @disabled`

			`ActiveRecord::Base.exec_sql(<<~SQL, topic_id: topic_id)`
			`UPDATE post_search_data`
			`SET version = 0`
			`WHERE post_id IN (SELECT id FROM posts WHERE topic_id = :topic_id)`
			`SQL`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`end`

FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`def self.index(obj, force: false)`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`return if @disabled`

FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`category_name, tag_names = nil`
			`topic = nil`

			`if Topic === obj`
			`topic = obj`
			`elsif Post === obj`
			`topic = obj.topic`
			`end`

			`category_name = topic.category&.name if topic`
			`tag_names = topic.tags.pluck(:name).join(' ') if topic`

			`if Post === obj && (obj.saved_change_to_cooked? \|\| force)`
			`if topic`
			`SearchIndexer.update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)`
			`SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?`
More logging, less problems 2014-05-07 10:35:26 +08:00			`else`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")`
More logging, less problems 2014-05-07 10:35:26 +08:00			`end`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`if User === obj && (obj.saved_change_to_username? \|\| obj.saved_change_to_name? \|\| force)`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`SearchIndexer.update_users_index(obj.id, obj.username_lower \|\| '', obj.name ? obj.name.downcase : '')`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`

FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`if Topic === obj && (obj.saved_change_to_title? \|\| force)`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`if obj.posts`
Perform the where(...).first to find_by(...) refactoring. This refactoring was automated using the command: bundle exec "ruby refactorings/where_dot_first_to_find_by/app.rb" 2014-05-06 21:41:59 +08:00			`post = obj.posts.find_by(post_number: 1)`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`if post`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`
			`end`
			`end`

FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`if Category === obj && (obj.saved_change_to_name? \|\| force)`
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`SearchIndexer.update_categories_index(obj.id, obj.name)`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`if Tag === obj && (obj.saved_change_to_name? \|\| force)`
FEATURE: show tags in search results 2017-08-25 23:52:18 +08:00			`SearchIndexer.update_tags_index(obj.id, obj.name)`
			`end`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`

			`class HtmlScrubber < Nokogiri::XML::SAX::Document`
			`attr_reader :scrubbed`

			`def initialize`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`@scrubbed = +""`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`end`

			`def self.scrub(html)`
minor cleanup, using AR querying DSL over raw SQL in some places 2013-03-01 02:54:12 +08:00			`me = new`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`parser = Nokogiri::HTML::SAX::Parser.new(me)`
Fix all the trailing whitespace 2013-02-07 23:45:24 +08:00			`begin`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`copy = +"<div>"`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`copy << html unless html.nil?`
			`copy << "</div>"`
			`parser.parse(html) unless html.nil?`
			`end`
			`me.scrubbed`
			`end`

Add rubocop to our build. (#5004) 2017-07-28 09:20:09 +08:00			`def start_element(name, attributes = [])`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`attributes = Hash[*attributes.flatten]`
			`if attributes["alt"]`
			`scrubbed << " "`
			`scrubbed << attributes["alt"]`
			`scrubbed << " "`
			`end`
			`if attributes["title"]`
			`scrubbed << " "`
Fix all the trailing whitespace 2013-02-07 23:45:24 +08:00			`scrubbed << attributes["title"]`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`scrubbed << " "`
			`end`
			`end`

			`def characters(string)`
			`scrubbed << " "`
shuffle code around so excerpt is not messed up 2016-07-25 15:12:01 +08:00			`scrubbed << string`
Initial release of Discourse 2013-02-06 03:16:51 +08:00			`scrubbed << " "`
			`end`
			`end`
			`end`