2013-07-23 07:07:59 +08:00
|
|
|
require_dependency 'search'
|
|
|
|
|
2016-12-22 10:13:14 +08:00
|
|
|
class SearchIndexer
|
|
|
|
|
|
|
|
# Turns indexing off: sets the class-level @disabled flag that .index checks
# before doing any work.
def self.disable
  @disabled = true
end
|
|
|
|
|
|
|
|
# Turns indexing back on by clearing the class-level @disabled flag
# (the counterpart of .disable).
def self.enable
  @disabled = false
end
|
2013-02-06 03:16:51 +08:00
|
|
|
|
|
|
|
# Converts cooked HTML into the plain text that gets indexed.
# Pure delegation to HtmlScrubber.scrub; returns whatever it returns.
def self.scrub_html_for_search(html)
  HtmlScrubber.scrub(html)
end
|
|
|
|
|
2016-07-25 15:12:01 +08:00
|
|
|
# Writes the search row for one record of +table+ ("post", "topic", "user",
# "category", ...).
#
# table    - singular table name; the row lives in "<table>_search_data" and is
#            keyed by "<table>_id".
# id       - id of the record being indexed.
# raw_data - text to index; it is first passed through Search.prepare_data.
#
# An UPDATE is attempted first; when it touches no rows the row is INSERTed.
# Any StandardError raised along the way is swallowed on purpose (see the
# rescue clause), in which case nil is returned.
def self.update_index(table, id, raw_data)
  raw_data = Search.prepare_data(raw_data)

  table_name = "#{table}_search_data"
  foreign_key = "#{table}_id"

  # insert some extra words for I.am.a.word so "word" is tokenized
  search_data = raw_data.gsub(/\p{L}*\.\p{L}*/) do |token|
    pieces = token.split(".")
    next token unless pieces.length > 1
    "#{token} #{pieces.drop(1).join(' ')}"
  end

  # for user login and name use "simple" lowercase stemmer
  stemmer =
    if table == "user"
      "simple"
    else
      Search.long_locale
    end

  # Raw SQL rather than ActiveRecord because the value has to go through the
  # Postgres TO_TSVECTOR function while being written.
  updated = Post.exec_sql_row_count("UPDATE #{table_name}
                                     SET
                                        raw_data = :raw_data,
                                        locale = :locale,
                                        search_data = TO_TSVECTOR('#{stemmer}', :search_data)
                                     WHERE #{foreign_key} = :id",
                                    raw_data: raw_data,
                                    search_data: search_data,
                                    id: id,
                                    locale: SiteSetting.default_locale)

  # A row already existed and was updated: nothing left to do.
  return unless updated == 0

  Post.exec_sql("INSERT INTO #{table_name}
                 (#{foreign_key}, search_data, locale, raw_data)
                 VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data)",
                raw_data: raw_data,
                search_data: search_data,
                id: id,
                locale: SiteSetting.default_locale)
rescue
  # don't allow concurrency to mess up saving a post
end
|
|
|
|
|
2014-08-08 13:50:26 +08:00
|
|
|
# Indexes a topic: the title followed by the scrubbed text of +cooked+,
# the latter cut to at most Topic::MAX_SIMILAR_BODY_LENGTH characters.
# +title+ is duplicated before appending, so the caller's string is untouched.
def self.update_topics_index(topic_id, title, cooked)
  search_data = title.dup
  search_data << " "
  search_data << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]

  update_index('topic', topic_id, search_data)
end
|
|
|
|
|
2013-02-06 03:16:51 +08:00
|
|
|
# Indexes a post: scrubbed text of +cooked+, then the topic title (a copy,
# tagged as UTF-8), then the category name when one is given.
def self.update_posts_index(post_id, cooked, title, category)
  search_data = scrub_html_for_search(cooked)
  search_data << " "
  search_data << title.dup.force_encoding('UTF-8')

  if category
    search_data << " "
    search_data << category
  end

  update_index('post', post_id, search_data)
end
|
|
|
|
|
|
|
|
# Indexes a user as "<username> <name>"; a nil +name+ contributes an empty
# string. +username+ is duplicated first so the caller's string is untouched.
def self.update_users_index(user_id, username, name)
  search_data = username.dup
  search_data << " "
  search_data << (name || "")

  update_index('user', user_id, search_data)
end
|
2013-02-07 23:45:24 +08:00
|
|
|
|
2013-02-06 03:16:51 +08:00
|
|
|
# Indexes a category; the category name is the only text that is indexed.
def self.update_categories_index(category_id, name)
  update_index('category', category_id, name)
end
|
|
|
|
|
2014-06-24 15:10:56 +08:00
|
|
|
# Entry point: re-indexes +obj+ when one of its searchable attributes changed.
#
# Does nothing while indexing is disabled. The class of +obj+ is compared
# exactly (subclasses do not match), and each supported class is handled by
# its own independent check:
#   Post     - when cooked changed: post index, plus topic index for a first post
#   User     - when username or name changed
#   Topic    - when title changed: re-indexes its post number 1 and the topic
#   Category - when name changed
def self.index(obj)
  return if @disabled

  klass = obj.class

  if klass == Post && obj.cooked_changed?
    topic = obj.topic
    if topic
      category_name = topic.category.name if topic.category
      SearchIndexer.update_posts_index(obj.id, obj.cooked, topic.title, category_name)
      if obj.is_first_post?
        SearchIndexer.update_topics_index(obj.topic_id, topic.title, obj.cooked)
      end
    else
      # A post whose topic is gone cannot be indexed; leave a trace in the log.
      Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
    end
  end

  if klass == User && (obj.username_changed? || obj.name_changed?)
    SearchIndexer.update_users_index(obj.id, obj.username_lower || '', (obj.name || '').downcase)
  end

  if klass == Topic && obj.title_changed?
    # The title is part of the first post's search data, so both are refreshed.
    first_post = obj.posts && obj.posts.find_by(post_number: 1)
    if first_post
      category_name = obj.category.name if obj.category
      SearchIndexer.update_posts_index(first_post.id, first_post.cooked, obj.title, category_name)
      SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
    end
  end

  if klass == Category && obj.name_changed?
    SearchIndexer.update_categories_index(obj.id, obj.name)
  end
end
|
|
|
|
|
|
|
|
|
|
|
|
# SAX handler that flattens an HTML fragment into plain text for indexing.
#
# It collects every text node plus the values of "alt" and "title"
# attributes, each padded with a space on both sides, into #scrubbed.
class HtmlScrubber < Nokogiri::XML::SAX::Document
  # Attributes whose values are searchable, in the order they are appended.
  TEXT_ATTRIBUTES = %w(alt title).freeze

  # The text accumulated so far (a String that the callbacks append to).
  attr_reader :scrubbed

  def initialize
    @scrubbed = ""
  end

  # Returns the text extracted from +html+, or an empty string when +html+
  # is nil.
  #
  # The fragment is wrapped in a <div> before parsing so that bare top-level
  # text is parsed inside a container element. Previously the wrapped copy was
  # built but the unwrapped +html+ was the string actually handed to the parser.
  def self.scrub(html)
    me = new
    return me.scrubbed if html.nil?

    Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
    me.scrubbed
  end

  # SAX callback for an opening tag: keeps only the searchable attribute values.
  def start_element(name, attributes = [])
    # flatten + splat accepts a list of [name, value] pairs; presumably it is
    # also meant to tolerate a flat name/value list - confirm against the
    # Nokogiri version in use.
    attributes = Hash[*attributes.flatten]

    TEXT_ATTRIBUTES.each do |key|
      value = attributes[key]
      next unless value
      scrubbed << " " << value << " "
    end
  end

  # SAX callback for a text node: appended with a space on each side so that
  # text from adjacent elements does not run together.
  def characters(string)
    scrubbed << " " << string << " "
  end
end
|
|
|
|
end
|