mirror of
https://github.com/discourse/discourse.git
synced 2025-01-23 07:41:46 +08:00
4a3c13a37b
During search indexing we "stuff" the index with additional keywords for entities that look like domain names. This allows searches for `cnn` to find URLs for `www.cnn.com`. The search stuffing attempted to keep indexes aligned at the correct positions by remapping the indexed terms. However, under certain edge cases a single word can stem into 2 different lexemes. If this happened we had an off-by-one error which caused the entire indexing to fail. We work around this edge case (and carry incorrect index positions) for cases like this. It is unlikely to impact search quality at all, given that index position makes almost no difference in the search algorithm.
417 lines
12 KiB
Ruby
417 lines
12 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
class SearchIndexer
|
|
  # Oldest post index version whose stored raw_data is usable for building
  # search blurbs. NOTE(review): consumers live outside this file — verify
  # against callers before changing.
  MIN_POST_BLURB_INDEX_VERSION = 4

  # Current index version for each searchable model. update_index stamps new
  # rows with these; bumping one marks every existing row of that type as
  # stale (its stored version no longer matches).
  POST_INDEX_VERSION = 5

  TOPIC_INDEX_VERSION = 4

  CATEGORY_INDEX_VERSION = 3

  USER_INDEX_VERSION = 3

  TAG_INDEX_VERSION = 3

  # version to apply when issuing a background reindex
  # (0 is below every live version above, so affected rows always look stale)
  REINDEX_VERSION = 0

  # Parses terms out of PostgreSQL's tsvector text representation,
  # e.g. "'cnn':3A,5". Captures: group 1 = the quoted lexeme including the
  # trailing ":", group 3 = the comma-separated position list where each
  # position may carry an A-D weight suffix.
  TS_VECTOR_PARSE_REGEX = /('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/
|
  # Globally switches indexing off: .index and the queue_* helpers become
  # no-ops until .enable is called. Useful around bulk operations.
  def self.disable
    @disabled = true
  end
|
|
|
|
  # Re-enables indexing after a call to .disable.
  def self.enable
    @disabled = false
  end
|
|
|
|
  # Builds and upserts the "#{table}_search_data" row for one record.
  #
  # The a/b/c/d weights are plain-text fragments mapped onto PostgreSQL's
  # A-D tsvector weight classes (A most important). After the base tsvector
  # is computed, domain-like terms (e.g. "www.cnn.com") are "stuffed" into
  # the index: their suffixes ("cnn.com", "com") and stemmed individual
  # labels ("www", "cnn") are added at the same positions as the original
  # term, so single-word searches match URLs.
  #
  # Yields the params hash before the upsert so callers can add extra
  # columns (see update_posts_index). Outside of test, errors are warned
  # and swallowed because this runs in the record-save path.
  def self.update_index(table:, id:, a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
    raw_data = { a: a_weight, b: b_weight, c: c_weight, d: d_weight }

    # The version used in excerpts
    search_data = raw_data.transform_values { |data| Search.prepare_data(data || "", :index) }

    # The version used to build the index: truncate every word to the
    # configured maximum indexed length.
    indexed_data =
      search_data.transform_values do |data|
        data.gsub(/\S+/) { |word| word[0...SiteSetting.search_max_indexed_word_length] }
      end

    table_name = "#{table}_search_data"
    foreign_key = "#{table}_id"

    # for user login and name use "simple" lowercase stemmer
    stemmer = table == "user" ? "simple" : Search.ts_config

    # Weighted tsvector expression; :a-:d are bound from indexed_data below.
    ranked_index = <<~SQL
      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
    SQL

    tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
    additional_lexemes = []

    # we also want to index parts of a domain name
    # that way stemmed single word searches will match
    additional_words = []

    # For every dotted term in the tsvector (skipping pure version/IP-like
    # numbers), collect each leading label into additional_words and append
    # each suffix ("b.c" from "a.b.c") directly, both carrying the original
    # term's positions.
    tsvector
      .scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/)
      .reduce(additional_lexemes) do |array, (lexeme, _, positions)|
        count = 0

        if lexeme !~ /\A(\d+\.)?(\d+\.)*(\*|\d+)\z/
          loop do
            count += 1
            break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
            term, _, remaining = lexeme.partition(".")
            break if remaining.blank?

            additional_words << [term, positions]

            array << "'#{remaining}':#{positions}"
            lexeme = remaining
          end
        end

        array
      end

    # Stem the collected labels in one to_tsvector call, then remap each
    # stemmed term's positions back to the positions of the word it came
    # from (tsvector positions are 1-based word offsets, hence index - 1).
    # If a word stems into more than one lexeme this lookup goes off by one;
    # we keep the (wrong) raw index rather than fail — position barely
    # affects ranking.
    extra_domain_word_terms =
      if additional_words.length > 0
        DB
          .query_single(
            "SELECT to_tsvector(?, ?)",
            stemmer,
            additional_words.map { |term, _| term }.join(" "),
          )
          .first
          .scan(TS_VECTOR_PARSE_REGEX)
          .map do |term, _, indexes|
            new_indexes =
              indexes
                .split(",")
                .map do |index|
                  existing_positions = additional_words[index.to_i - 1]
                  if existing_positions
                    existing_positions[1]
                  else
                    index
                  end
                end
                .join(",")
            "#{term}#{new_indexes}"
          end
          .join(" ")
      end

    tsvector = "#{tsvector} #{additional_lexemes.join(" ")} #{extra_domain_word_terms}"

    # Cap how many positions of each weight family (A-D, nil for unweighted)
    # a single term may keep, to bound index bloat from repeated terms.
    if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
      reduced = []
      tsvector
        .scan(TS_VECTOR_PARSE_REGEX)
        .each do |term, _, indexes|
          family_counts = Hash.new(0)
          new_index_array = []

          indexes
            .split(",")
            .each do |index|
              family = nil
              family = index[-1] if index[-1].match?(/[A-D]/)
              if (family_counts[family] += 1) <= max_dupes
                new_index_array << index
              end
            end
          reduced << "#{term.strip}#{new_index_array.join(",")}"
        end
      tsvector = reduced.join(" ")
    end

    # raw_data column: for posts, only the (cleaned) body; otherwise all
    # non-empty weights concatenated.
    indexed_data =
      if table.to_s == "post"
        clean_post_raw_data!(search_data[:d])
      else
        search_data.values.select { |d| d.length > 0 }.join(" ")
      end

    params = {
      "raw_data" => indexed_data,
      "#{foreign_key}" => id,
      "locale" => SiteSetting.default_locale,
      "version" => const_get("#{table.upcase}_INDEX_VERSION"),
      "search_data" => tsvector,
    }

    yield params if block_given?
    table_name.camelize.constantize.upsert(params)
  rescue => e
    if Rails.env.test?
      raise
    else
      # TODO is there any way we can safely avoid this?
      # best way is probably pushing search indexer into a dedicated process so it no longer happens on save
      # instead in the post processor
      Discourse.warn_exception(
        e,
        message: "Unexpected error while indexing #{table} for search",
        env: {
          id: id,
        },
      )
    end
  end
|
|
|
|
def self.update_topics_index(topic_id, title, cooked)
|
|
# a bit inconsistent that we use title as A and body as B when in
|
|
# the post index body is D
|
|
update_index(
|
|
table: "topic",
|
|
id: topic_id,
|
|
a_weight: title,
|
|
b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH],
|
|
)
|
|
end
|
|
|
|
def self.update_posts_index(
|
|
post_id:,
|
|
topic_title:,
|
|
category_name:,
|
|
topic_tags:,
|
|
cooked:,
|
|
private_message:
|
|
)
|
|
update_index(
|
|
table: "post",
|
|
id: post_id,
|
|
a_weight: topic_title,
|
|
b_weight: category_name,
|
|
c_weight: topic_tags,
|
|
# The tsvector resulted from parsing a string can be double the size of
|
|
# the original string. Since there is no way to estimate the length of
|
|
# the expected tsvector, we limit the input to ~50% of the maximum
|
|
# length of a tsvector (1_048_576 bytes).
|
|
d_weight: HtmlScrubber.scrub(cooked)[0..600_000],
|
|
) { |params| params["private_message"] = private_message }
|
|
end
|
|
|
|
def self.update_users_index(user_id, username, name, custom_fields)
|
|
update_index(
|
|
table: "user",
|
|
id: user_id,
|
|
a_weight: username,
|
|
b_weight: name,
|
|
c_weight: custom_fields,
|
|
)
|
|
end
|
|
|
|
def self.update_categories_index(category_id, name)
|
|
update_index(table: "category", id: category_id, a_weight: name)
|
|
end
|
|
|
|
def self.update_tags_index(tag_id, name)
|
|
update_index(table: "tag", id: tag_id, a_weight: name.downcase)
|
|
end
|
|
|
|
def self.queue_category_posts_reindex(category_id)
|
|
return if @disabled
|
|
|
|
DB.exec(<<~SQL, category_id: category_id, version: REINDEX_VERSION)
|
|
UPDATE post_search_data
|
|
SET version = :version
|
|
FROM posts
|
|
INNER JOIN topics ON posts.topic_id = topics.id
|
|
INNER JOIN categories ON topics.category_id = categories.id
|
|
WHERE post_search_data.post_id = posts.id
|
|
AND categories.id = :category_id
|
|
SQL
|
|
end
|
|
|
|
def self.queue_users_reindex(user_ids)
|
|
return if @disabled
|
|
|
|
DB.exec(<<~SQL, user_ids: user_ids, version: REINDEX_VERSION)
|
|
UPDATE user_search_data
|
|
SET version = :version
|
|
WHERE user_search_data.user_id IN (:user_ids)
|
|
SQL
|
|
end
|
|
|
|
def self.queue_post_reindex(topic_id)
|
|
return if @disabled
|
|
|
|
DB.exec(<<~SQL, topic_id: topic_id, version: REINDEX_VERSION)
|
|
UPDATE post_search_data
|
|
SET version = :version
|
|
FROM posts
|
|
WHERE post_search_data.post_id = posts.id
|
|
AND posts.topic_id = :topic_id
|
|
SQL
|
|
end
|
|
|
|
  # Main entry point, called from model save hooks: inspects +obj+ (Topic,
  # Post, User, Category or Tag) and refreshes whichever search indexes are
  # affected by the attributes that just changed. Pass force: true to
  # reindex regardless of what changed. No-op while indexing is disabled.
  def self.index(obj, force: false)
    return if @disabled

    category_name = nil
    tag_names = nil
    topic = nil

    if Topic === obj
      topic = obj
    elsif Post === obj
      topic = obj.topic
    end

    category_name = topic.category&.name if topic

    if topic
      tags = topic.tags.select(:id, :name).to_a

      if tags.present?
        # also index names of tags whose target_tag_id points at these tags
        # (presumably tag synonyms — verify against the Tag model)
        tag_names =
          (tags.map(&:name) + Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)).join(" ")
      end
    end

    # Post: reindex when the cooked HTML changed or the post moved topics.
    if Post === obj && obj.raw.present? &&
         (force || obj.saved_change_to_cooked? || obj.saved_change_to_topic_id?)
      if topic
        SearchIndexer.update_posts_index(
          post_id: obj.id,
          topic_title: topic.title,
          category_name: category_name,
          topic_tags: tag_names,
          cooked: obj.cooked,
          private_message: topic.private_message?,
        )

        # the first post's cooked HTML doubles as the topic body in the topic index
        SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
      end
    end

    if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
      SearchIndexer.update_users_index(
        obj.id,
        obj.username_lower || "",
        obj.name ? obj.name.downcase : "",
        obj.user_custom_fields.searchable.map(&:value).join(" "),
      )
    end

    # Topic: a title change affects both the first post's index row and the
    # topic's own row.
    if Topic === obj && (obj.saved_change_to_title? || force)
      if obj.posts
        if post = obj.posts.find_by(post_number: 1)
          SearchIndexer.update_posts_index(
            post_id: post.id,
            topic_title: obj.title,
            category_name: category_name,
            topic_tags: tag_names,
            cooked: post.cooked,
            private_message: obj.private_message?,
          )

          SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
        end
      end
    end

    # Category rename: every post in the category indexes the category name,
    # so queue them all for background reindex in addition to the category row.
    if Category === obj && (obj.saved_change_to_name? || force)
      SearchIndexer.queue_category_posts_reindex(obj.id)

      SearchIndexer.update_categories_index(obj.id, obj.name)
    end

    if Tag === obj && (obj.saved_change_to_name? || force)
      SearchIndexer.update_tags_index(obj.id, obj.name)
    end
  end
|
|
|
|
def self.clean_post_raw_data!(raw_data)
|
|
urls = Set.new
|
|
raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
|
|
|
|
urls.each do |url|
|
|
begin
|
|
case File.extname(URI(url).path || "")
|
|
when Oneboxer::VIDEO_REGEX
|
|
raw_data.gsub!(url, I18n.t("search.video"))
|
|
when Oneboxer::AUDIO_REGEX
|
|
raw_data.gsub!(url, I18n.t("search.audio"))
|
|
end
|
|
rescue URI::InvalidURIError
|
|
end
|
|
end
|
|
|
|
raw_data
|
|
end
|
|
private_class_method :clean_post_raw_data!
|
|
|
|
  # Converts cooked post HTML into plain text suitable for indexing. The DOM
  # is first pruned (lightbox wrappers, emoji alt text, redundant/anchor
  # hrefs), then re-parsed with a SAX handler that collects text content plus
  # a whitelist of attribute values into @scrubbed.
  class HtmlScrubber < Nokogiri::XML::SAX::Document
    # Accumulated plain text; read after the SAX pass.
    attr_reader :scrubbed

    def initialize
      # unfrozen string so characters() can append in place
      @scrubbed = +""
    end

    # Returns the squished plain-text rendering of +html+, or "" when the
    # input is blank or cannot be parsed.
    def self.scrub(html)
      return +"" if html.blank?

      begin
        document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
      rescue ArgumentError
        return +""
      end

      nodes = document.css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")

      # Inside lightbox wrappers, keep only <a>/<img> (dropping the indexable
      # attributes from the links) so lightbox chrome isn't indexed.
      if nodes.present?
        nodes.each do |node|
          node.traverse do |child_node|
            next if child_node == node

            if %w[a img].exclude?(child_node.name)
              child_node.remove
            elsif child_node.name == "a"
              ATTRIBUTES.each { |attribute| child_node.remove_attribute(attribute) }
            end
          end
        end
      end

      # Emoji alt text (e.g. ":smile:") would otherwise be indexed.
      document.css("img.emoji").each { |node| node.remove_attribute("alt") }

      # Drop hrefs that duplicate the link text, mention links, and
      # same-page anchor links — their text alone is enough for the index.
      document
        .css("a[href]")
        .each do |node|
          if node["href"] == node.text || MENTION_CLASSES.include?(node["class"])
            node.remove_attribute("href")
          end

          if node["class"] == "anchor" && node["href"].starts_with?("#")
            node.remove_attribute("href")
          end
        end

      html_scrubber = new
      Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
      html_scrubber.scrubbed.squish
    end

    # Link classes whose hrefs are never worth indexing (user/group mentions).
    MENTION_CLASSES ||= %w[mention mention-group]

    # Attribute values that ARE worth indexing alongside element text.
    ATTRIBUTES ||= %w[alt title href data-youtube-title]

    # SAX callback: harvest whitelisted attribute values, except hrefs that
    # point back at this site.
    def start_element(_name, attributes = [])
      attributes = Hash[*attributes.flatten]

      ATTRIBUTES.each do |attribute_name|
        if attributes[attribute_name].present? &&
             !(attribute_name == "href" && UrlHelper.is_local(attributes[attribute_name]))
          characters(attributes[attribute_name])
        end
      end
    end

    # SAX callback: append text content, space-padded so adjacent nodes
    # don't run together (squish collapses the extra whitespace later).
    def characters(str)
      scrubbed << " #{str} "
    end
  end
|
|
end
|