discourse/app/services/search_indexer.rb
Sam cd247d5322
FEATURE: Roll out new search optimisations (#20364)
- Reduce duplication of terms in post index from unlimited to 6. This will
result in reduced index size and reduced weighting for posts containing
a huge amount of duplicate terms. (Eg: a post containing "sam sam sam sam
sam sam sam sam", will index as "sam sam sam sam sam sam", only including
the word up to 6 times.) This corrects a flaw where title weighting could
be ignored.

- Prioritize exact matches of words in titles. Our search always performs
a prefix match. However, we want to give special weight to exact title matches,
meaning that a search for "sum" will rank topics such as "the sum of us" above
topics such as "summer in spring".

- Pick up fixes to our search algorithm which are missing from old indexes.
Specifically pick up the fix that indexes URLs properly. (`https://happy.com`
was stemmed to `happi` in keywords and then was not searchable)

see also:

https://meta.discourse.org/t/refinements-to-search-being-tested-on-meta/254158

Indexing will take a while and work in batches, in the background.
2023-02-20 11:53:35 +11:00

407 lines
12 KiB
Ruby

# frozen_string_literal: true
class SearchIndexer
# Not referenced in the visible part of this file.
# NOTE(review): presumably the lowest post index version whose stored data is
# still usable for blurbs -- confirm against the code that reads it.
MIN_POST_BLURB_INDEX_VERSION = 4
# Per-table index versions. update_index resolves them by name
# ("#{table.upcase}_INDEX_VERSION") and stores the value in the "version"
# field of the row it upserts.
POST_INDEX_VERSION = 5
TOPIC_INDEX_VERSION = 4
CATEGORY_INDEX_VERSION = 3
USER_INDEX_VERSION = 3
TAG_INDEX_VERSION = 3
# version to apply when issuing a background reindex
# (written by the queue_*_reindex methods to existing search data rows)
REINDEX_VERSION = 0
# Matches one entry of a tsvector in its text form. Capture 1 is the quoted
# lexeme including the trailing colon (e.g. 'sam':), capture 3 is the
# comma-separated position list, where each position may carry a weight
# letter A-D (e.g. 1A,5B). Captures 2 and 4 are inner repetition groups.
TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/
# Switches indexing off. While @disabled is true, SearchIndexer.index and the
# queue_*_reindex methods return early; the update_* methods do not check
# this flag.
def self.disable
@disabled = true
end
# Switches indexing back on by clearing the @disabled flag that
# SearchIndexer.index and the queue_*_reindex methods check.
def self.enable
@disabled = false
end
# Builds the search index entry for one record and upserts it into the
# "#{table}_search_data" model.
#
# table    - base table name; the callers in this file pass "post", "topic",
#            "user", "category" or "tag". It determines the search data table,
#            the "#{table}_id" key and the *_INDEX_VERSION constant used.
# id       - id of the indexed record, stored under "#{table}_id".
# a_weight, b_weight, c_weight, d_weight
#          - optional text indexed with tsvector weight A, B, C and D
#            respectively; nil is treated as an empty string.
#
# When a block is given it receives the params hash just before the upsert,
# so the caller can add further columns.
#
# Any StandardError raised along the way is re-raised in the test
# environment; otherwise it is reported through Discourse.warn_exception and
# swallowed.
def self.update_index(table:, id:, a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
raw_data = { a: a_weight, b: b_weight, c: c_weight, d: d_weight }
# The version used in excerpts
search_data = raw_data.transform_values { |data| Search.prepare_data(data || "", :index) }
# The version used to build the index: every whitespace-delimited word is cut
# to at most SiteSetting.search_max_indexed_word_length characters
indexed_data =
search_data.transform_values do |data|
data.gsub(/\S+/) { |word| word[0...SiteSetting.search_max_indexed_word_length] }
end
table_name = "#{table}_search_data"
foreign_key = "#{table}_id"
# for user login and name use "simple" lowercase stemmer
# NOTE(review): this compares against the String "user", while the post check
# further down uses table.to_s; a Symbol table would not select "simple" here.
stemmer = table == "user" ? "simple" : Search.ts_config
# SQL expression that builds one weighted tsvector per input and concatenates
# them. NOTE(review): the extra ")" inside each wrap_unaccent argument appears
# to be the closing parenthesis of to_tsvector( -- Search.wrap_unaccent is not
# visible here, confirm before touching the parentheses.
ranked_index = <<~SQL
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
SQL
# :a..:d in the expression are bound from indexed_data; the result is the
# tsvector in its text form
tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
additional_lexemes = []
# we also want to index parts of a domain name
# that way stemmed single word searches will match
additional_words = []
# Scan for quoted lexemes made of dot-separated alphanumeric labels and
# capture each one together with its position list. Lexemes consisting only
# of digits and dots (e.g. 1.2.3) are left alone. For a lexeme such as a.b.c
# the loop records the words "a" and "b" and adds the lexemes 'b.c' and 'c',
# all reusing the positions of the original lexeme.
tsvector
.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/)
.reduce(additional_lexemes) do |array, (lexeme, _, positions)|
count = 0
if lexeme !~ /\A(\d+\.)?(\d+\.)*(\*|\d+)\z/
loop do
count += 1
break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
term, _, remaining = lexeme.partition(".")
break if remaining.blank?
additional_words << [term, positions]
array << "'#{remaining}':#{positions}"
lexeme = remaining
end
end
array
end
# Run the recorded leading labels through to_tsvector with the same
# configuration, then swap every position in the result (a 1-based index into
# additional_words) for the position list of the lexeme that word came from.
# Stays nil when no words were recorded.
extra_domain_word_terms =
if additional_words.length > 0
DB
.query_single(
"SELECT to_tsvector(?, ?)",
stemmer,
additional_words.map { |term, _| term }.join(" "),
)
.first
.scan(TS_VECTOR_PARSE_REGEX)
.map do |term, _, indexes|
new_indexes =
indexes.split(",").map { |index| additional_words[index.to_i - 1][1] }.join(",")
"#{term}#{new_indexes}"
end
.join(" ")
end
tsvector = "#{tsvector} #{additional_lexemes.join(" ")} #{extra_domain_word_terms}"
# When the site setting is positive, keep at most max_dupes positions per
# term for each weight letter; positions without a letter are counted
# together under nil.
if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
reduced = []
tsvector
.scan(TS_VECTOR_PARSE_REGEX)
.each do |term, _, indexes|
family_counts = Hash.new(0)
new_index_array = []
indexes
.split(",")
.each do |index|
family = nil
family = index[-1] if index[-1].match?(/[A-D]/)
if (family_counts[family] += 1) <= max_dupes
new_index_array << index
end
end
reduced << "#{term.strip}#{new_index_array.join(",")}"
end
tsvector = reduced.join(" ")
end
# Text stored as raw_data: for posts only the D-weight text, passed through
# clean_post_raw_data! (which edits search_data[:d] in place); for every
# other table all non-empty weights joined with spaces.
indexed_data =
if table.to_s == "post"
clean_post_raw_data!(search_data[:d])
else
search_data.values.select { |d| d.length > 0 }.join(" ")
end
params = {
"raw_data" => indexed_data,
"#{foreign_key}" => id,
"locale" => SiteSetting.default_locale,
"version" => const_get("#{table.upcase}_INDEX_VERSION"),
"search_data" => tsvector,
}
yield params if block_given?
table_name.camelize.constantize.upsert(params)
rescue => e
if Rails.env.test?
raise
else
# TODO is there any way we can safely avoid this?
# best way is probably pushing search indexer into a dedicated process so it no longer happens on save
# instead in the post processor
Discourse.warn_exception(
e,
message: "Unexpected error while indexing #{table} for search",
env: {
id: id,
},
)
end
end
# Indexes a topic: the title is stored with weight A and the scrubbed text of
# the given cooked HTML, cut to Topic::MAX_SIMILAR_BODY_LENGTH characters,
# with weight B.
#
# topic_id - id of the topic being indexed
# title    - topic title
# cooked   - cooked HTML whose text becomes the body
def self.update_topics_index(topic_id, title, cooked)
  # Note the asymmetry with the post index, where the body is weight D
  # rather than B.
  body_text = HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]

  update_index(table: "topic", id: topic_id, a_weight: title, b_weight: body_text)
end
# Indexes a post: topic title (weight A), category name (B), topic tags (C)
# and the scrubbed post body (D). The private_message flag is added to the
# upserted params through the block passed to update_index.
def self.update_posts_index(
  post_id:,
  topic_title:,
  category_name:,
  topic_tags:,
  cooked:,
  private_message:
)
  # A tsvector built from a string can be twice as large as the string, and
  # its final size cannot be estimated up front, so the input is capped at
  # roughly half of the maximum tsvector length (1_048_576 bytes).
  body_text = HtmlScrubber.scrub(cooked)[0..600_000]

  update_index(
    table: "post",
    id: post_id,
    a_weight: topic_title,
    b_weight: category_name,
    c_weight: topic_tags,
    d_weight: body_text,
  ) do |params|
    params["private_message"] = private_message
  end
end
# Indexes a user: username (weight A), name (B) and the text of the custom
# fields (C).
def self.update_users_index(user_id, username, name, custom_fields)
  weighted_text = { a_weight: username, b_weight: name, c_weight: custom_fields }

  update_index(table: "user", id: user_id, **weighted_text)
end
# Indexes a category; its name is the only indexed text (weight A).
def self.update_categories_index(category_id, name)
  update_index(
    table: "category",
    id: category_id,
    a_weight: name,
  )
end
# Indexes a tag; its lowercased name is the only indexed text (weight A).
def self.update_tags_index(tag_id, name)
  lowercased_name = name.downcase

  update_index(table: "tag", id: tag_id, a_weight: lowercased_name)
end
# Flags the search data of every post in the given category for a background
# reindex by setting its version to REINDEX_VERSION. Does nothing while
# indexing is disabled.
def self.queue_category_posts_reindex(category_id)
  return if @disabled

  sql = <<~SQL
    UPDATE post_search_data
    SET version = :version
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.id
    INNER JOIN categories ON topics.category_id = categories.id
    WHERE post_search_data.post_id = posts.id
    AND categories.id = :category_id
  SQL

  DB.exec(sql, category_id: category_id, version: REINDEX_VERSION)
end
# Flags the search data of the given users for a background reindex by
# setting its version to REINDEX_VERSION. Does nothing while indexing is
# disabled.
def self.queue_users_reindex(user_ids)
  return if @disabled

  sql = <<~SQL
    UPDATE user_search_data
    SET version = :version
    WHERE user_search_data.user_id IN (:user_ids)
  SQL

  DB.exec(sql, user_ids: user_ids, version: REINDEX_VERSION)
end
# Flags the search data of every post in the given topic for a background
# reindex by setting its version to REINDEX_VERSION. Does nothing while
# indexing is disabled.
def self.queue_post_reindex(topic_id)
  return if @disabled

  sql = <<~SQL
    UPDATE post_search_data
    SET version = :version
    FROM posts
    WHERE post_search_data.post_id = posts.id
    AND posts.topic_id = :topic_id
  SQL

  DB.exec(sql, topic_id: topic_id, version: REINDEX_VERSION)
end
# Entry point for indexing a single record. Depending on the class of obj
# (Post, Topic, User, Category or Tag) the matching update_*_index methods
# are called, but only when a relevant attribute was just saved with a change
# or when force is true. Does nothing while indexing is disabled.
#
# obj   - the record to index
# force - index even when no relevant saved change is reported
def self.index(obj, force: false)
  return if @disabled

  # The topic that supplies title, category and tags: the object itself for a
  # Topic, the owning topic for a Post, nil for everything else.
  topic =
    if Topic === obj
      obj
    elsif Post === obj
      obj.topic
    end

  category_name = nil
  tag_names = nil

  if topic
    category_name = topic.category&.name

    tags = topic.tags.select(:id, :name).to_a
    if tags.present?
      # The topic's own tag names plus the names of tags that point at them
      # through target_tag_id.
      own_names = tags.map(&:name)
      linked_names = Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)
      tag_names = (own_names + linked_names).join(" ")
    end
  end

  post_needs_indexing =
    Post === obj && obj.raw.present? &&
      (force || obj.saved_change_to_cooked? || obj.saved_change_to_topic_id?)

  if post_needs_indexing && topic
    SearchIndexer.update_posts_index(
      post_id: obj.id,
      topic_title: topic.title,
      category_name: category_name,
      topic_tags: tag_names,
      cooked: obj.cooked,
      private_message: topic.private_message?,
    )
    # The first post also provides the body of the topic index.
    SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
  end

  if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
    SearchIndexer.update_users_index(
      obj.id,
      obj.username_lower || "",
      obj.name ? obj.name.downcase : "",
      obj.user_custom_fields.searchable.map(&:value).join(" "),
    )
  end

  if Topic === obj && (obj.saved_change_to_title? || force)
    # A title change has to be reflected in both the first post's index entry
    # and the topic's own entry.
    first_post = obj.posts && obj.posts.find_by(post_number: 1)
    if first_post
      SearchIndexer.update_posts_index(
        post_id: first_post.id,
        topic_title: obj.title,
        category_name: category_name,
        topic_tags: tag_names,
        cooked: first_post.cooked,
        private_message: obj.private_message?,
      )
      SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
    end
  end

  if Category === obj && (obj.saved_change_to_name? || force)
    # The category name is part of every post's index entry, so the posts are
    # queued for reindexing as well.
    SearchIndexer.queue_category_posts_reindex(obj.id)
    SearchIndexer.update_categories_index(obj.id, obj.name)
  end

  if Tag === obj && (obj.saved_change_to_name? || force)
    SearchIndexer.update_tags_index(obj.id, obj.name)
  end
end
# Replaces every URL in raw_data whose path has a video or audio file
# extension with the translated "search.video" / "search.audio" placeholder.
# The string is modified in place and also returned. URLs that cannot be
# parsed are left untouched.
def self.clean_post_raw_data!(raw_data)
  found_urls = Set.new
  raw_data.scan(Discourse::Utils::URI_REGEXP) { found_urls.add(Regexp.last_match(0)) }

  found_urls.each do |link|
    placeholder_key =
      case File.extname(URI(link).path || "")
      when Oneboxer::VIDEO_REGEX
        "search.video"
      when Oneboxer::AUDIO_REGEX
        "search.audio"
      end

    raw_data.gsub!(link, I18n.t(placeholder_key)) if placeholder_key
  rescue URI::InvalidURIError
    # Best effort: an unparsable URL simply stays in the text.
  end

  raw_data
end
# Hides clean_post_raw_data! from the public class-level API; within the
# visible source it is only called from update_index.
private_class_method :clean_post_raw_data!
# SAX handler that flattens HTML into plain text for indexing. Text nodes and
# the values of selected attributes (see ATTRIBUTES) are appended to
# +scrubbed+, each surrounded by spaces.
#
# Use HtmlScrubber.scrub(html) rather than instantiating the class directly.
class HtmlScrubber < Nokogiri::XML::SAX::Document
  # CSS classes of links whose href is never indexed.
  MENTION_CLASSES ||= %w[mention mention-group]
  # Attributes whose values are emitted as text by start_element.
  ATTRIBUTES ||= %w[alt title href data-youtube-title]

  attr_reader :scrubbed

  def initialize
    @scrubbed = +""
  end

  # Returns the whitespace-squished text extracted from +html+.
  #
  # html - HTML fragment; a blank value yields an empty string, and so does
  #        input for which Nokogiri.HTML5 raises ArgumentError.
  #
  # Before the text is extracted the document is pruned:
  # * inside lightbox wrappers only <a> and <img> descendants are kept, and
  #   the <a> elements lose every attribute listed in ATTRIBUTES;
  # * emoji images lose their alt attribute;
  # * links lose their href when it equals the link text, when the link has
  #   a mention class, or when it is an "anchor" link pointing at a fragment.
  def self.scrub(html)
    return +"" if html.blank?

    begin
      document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
    rescue ArgumentError
      return +""
    end

    nodes = document.css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")

    if nodes.present?
      nodes.each do |node|
        node.traverse do |child_node|
          next if child_node == node

          if %w[a img].exclude?(child_node.name)
            child_node.remove
          elsif child_node.name == "a"
            ATTRIBUTES.each { |attribute| child_node.remove_attribute(attribute) }
          end
        end
      end
    end

    document.css("img.emoji").each { |node| node.remove_attribute("alt") }

    document
      .css("a[href]")
      .each do |node|
        # Decide from the original attribute values in one go. Checking the
        # anchor rule after the href had already been removed used to call
        # a String method on nil (e.g. <a class="anchor" href="#x">#x</a>).
        href = node["href"]
        css_class = node["class"]

        if href == node.text || MENTION_CLASSES.include?(css_class) ||
             (css_class == "anchor" && href.to_s.start_with?("#"))
          node.remove_attribute("href")
        end
      end

    html_scrubber = new
    Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
    html_scrubber.scrubbed.squish
  end

  # SAX callback for an opening tag: emits the value of every attribute
  # listed in ATTRIBUTES that is present, except an href that
  # UrlHelper.is_local reports as local.
  def start_element(_name, attributes = [])
    attributes = Hash[*attributes.flatten]

    ATTRIBUTES.each do |attribute_name|
      if attributes[attribute_name].present? &&
           !(attribute_name == "href" && UrlHelper.is_local(attributes[attribute_name]))
        characters(attributes[attribute_name])
      end
    end
  end

  # SAX callback for text content; also used by start_element to emit
  # attribute values. Pads with spaces so adjacent pieces never run together.
  def characters(str)
    scrubbed << " #{str} "
  end
end
end