2018-02-20 11:41:00 +08:00
# frozen_string_literal: true
2013-07-23 07:07:59 +08:00
require_dependency 'search'
2016-12-22 10:13:14 +08:00
class SearchIndexer
# Switches indexing off by raising the class-level @disabled flag.
# While the flag is set, queue_post_reindex and index return immediately.
def self.disable
  @disabled = true
end
# Switches indexing back on by clearing the class-level @disabled flag
# that queue_post_reindex and index consult before doing any work.
def self.enable
  @disabled = false
end
2013-02-06 03:16:51 +08:00
# Converts an HTML fragment into the text that gets indexed.
# Pure delegation to HtmlScrubber.scrub with its default options.
def self.scrub_html_for_search(html)
  HtmlScrubber.scrub(html)
end
2018-02-20 11:41:00 +08:00
# Appends the dot-separated pieces of tokens such as "I.am.a.word" so that
# the pieces after the first are tokenized as words of their own.
#
#   inject_extra_terms("I.am.a.word") # => "I.am.a.word am a word"
#
# raw - the String about to be indexed.
#
# Returns a new String; tokens that contain no dot, or that only end in
# dots (e.g. "end."), are left untouched.
def self.inject_extra_terms(raw)
  # a candidate token is a run of non-whitespace containing at least one "."
  raw.gsub(/[^[:space:]]*\.+[^[:space:]]*/) do |with_dot|
    pieces = with_dot.split(".")

    if pieces.length > 1
      # only the pieces after the first one are appended
      "#{with_dot} #{pieces.drop(1).join(' ')}"
    else
      with_dot
    end
  end
end
# Writes the search row for one record into "<table>_search_data":
# first tries an UPDATE and, when that reports zero rows, falls back to an
# INSERT.
#
# table    - base table name; callers in this file pass the fixed literals
#            'topic', 'post', 'user', 'category' and 'tag'. It is interpolated
#            into the SQL text, so it must never come from user input.
# id       - value stored in / matched against the "<table>_id" column.
# raw_data - Array of Strings (nil entries are treated as ""); positions
#            0..3 are bound to the tsvector weights A, B, C and D.
#
# Indexing is best effort: any StandardError is logged as a warning and
# swallowed, in which case nil is returned.
def self.update_index(table:, id:, raw_data:)
  search_data = raw_data.map do |data|
    inject_extra_terms(Search.prepare_data(data || "", :index))
  end

  table_name = "#{table}_search_data"
  foreign_key = "#{table}_id"

  # for user login and name use "simple" lowercase stemmer
  stemmer = table == "user" ? "simple" : Search.ts_config

  ranked_index = <<~SQL
    setweight(to_tsvector('#{stemmer}', coalesce(:a, '')), 'A') ||
    setweight(to_tsvector('#{stemmer}', coalesce(:b, '')), 'B') ||
    setweight(to_tsvector('#{stemmer}', coalesce(:c, '')), 'C') ||
    setweight(to_tsvector('#{stemmer}', coalesce(:d, '')), 'D')
  SQL

  # raw_data column keeps the non-empty prepared fields joined by a space
  indexed_data = search_data.reject(&:empty?).join(' ')

  params = {
    a: search_data[0],
    b: search_data[1],
    c: search_data[2],
    d: search_data[3],
    raw_data: indexed_data,
    id: id,
    locale: SiteSetting.default_locale,
    version: Search::INDEX_VERSION
  }

  # Would be nice to use AR here but not sure how to execute Postgres functions
  # when inserting data like this.
  # NOTE(review): DB.exec is expected to return the affected row count - confirm
  rows = DB.exec(<<~SQL, params)
    UPDATE #{table_name}
    SET
      raw_data = :raw_data,
      locale = :locale,
      search_data = #{ranked_index},
      version = :version
    WHERE #{foreign_key} = :id
  SQL

  if rows == 0
    DB.exec(<<~SQL, params)
      INSERT INTO #{table_name}
      (#{foreign_key}, search_data, locale, raw_data, version)
      VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
    SQL
  end
rescue => e
  # TODO is there any way we can safely avoid this?
  # best way is probably pushing search indexer into a dedicated process so it no longer happens on save
  # instead in the post processor
  Rails.logger.warn("Failed to update #{table} search index for id #{id}: #{e.class}: #{e.message}")
  nil
end
2014-08-08 13:50:26 +08:00
# Indexes a topic from its title and cooked HTML body.
#
# topic_id - id of the topic row being indexed
# title    - topic title, indexed with weight A
# cooked   - cooked HTML; it is scrubbed to text and cut to at most
#            Topic::MAX_SIMILAR_BODY_LENGTH characters, then indexed with weight B
def self.update_topics_index(topic_id, title, cooked)
  scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]

  # a bit inconsistent that we use title as A and body as B when in
  # the post index the body comes fourth (weight D)
  update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
end
2018-02-20 11:41:00 +08:00
# Indexes a post. The fields are weighted in this order:
# title (A), category (B), tags (C) and the scrubbed cooked HTML (D).
#
# post_id  - id of the post row being indexed
# title    - title of the post's topic
# category - category name, may be nil
# tags     - tag names as one String, may be nil
# cooked   - cooked HTML of the post
def self.update_posts_index(post_id, title, category, tags, cooked)
  update_index(
    table: 'post',
    id: post_id,
    raw_data: [title, category, tags, scrub_html_for_search(cooked)]
  )
end
# Indexes a user: username gets weight A and name weight B.
# update_index selects the "simple" stemmer for the 'user' table.
def self.update_users_index(user_id, username, name)
  update_index(table: 'user', id: user_id, raw_data: [username, name])
end
2013-02-07 23:45:24 +08:00
2013-02-06 03:16:51 +08:00
# Indexes a category by its name (weight A).
def self.update_categories_index(category_id, name)
  update_index(table: 'category', id: category_id, raw_data: [name])
end
2017-08-25 23:52:18 +08:00
# Indexes a tag by its name (weight A).
def self.update_tags_index(tag_id, name)
  update_index(table: 'tag', id: tag_id, raw_data: [name])
end
# Resets the stored index version to 0 for the search rows of every post
# in the given topic. Does nothing while the indexer is disabled.
# NOTE(review): presumably version 0 makes a later pass re-index those
# posts - confirm against whatever consumes post_search_data.version.
def self.queue_post_reindex(topic_id)
  return if @disabled

  DB.exec(<<~SQL, topic_id: topic_id)
    UPDATE post_search_data
    SET version = 0
    WHERE post_id IN (SELECT id FROM posts WHERE topic_id = :topic_id)
  SQL
end
2017-08-16 19:38:34 +08:00
# Entry point: refreshes the search rows affected by a saved record.
#
# obj   - a Post, Topic, User, Category or Tag; anything else is ignored
# force - when true, re-index even if the relevant attribute did not change
#
# What gets indexed, per record type:
#   Post     - on a cooked change: the post row, plus the topic row when the
#              post is the first post. A post without a topic is only logged.
#   Topic    - on a title change: the first post's row and the topic row.
#   User     - on a username or name change: the user row.
#   Category - on a name change: the category row.
#   Tag      - on a name change: the tag row.
#
# Does nothing while the indexer is disabled.
def self.index(obj, force: false)
  return if @disabled

  topic =
    case obj
    when Topic then obj
    when Post then obj.topic
    end

  category_name = nil
  tag_names = nil

  if topic
    category_name = topic.category&.name
    tag_names = topic.tags.pluck(:name).join(' ')
  end

  if Post === obj && (obj.saved_change_to_cooked? || force)
    if topic
      update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)
      update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
    else
      Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
    end
  end

  if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
    update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
  end

  if Topic === obj && (obj.saved_change_to_title? || force)
    # the title is part of both the first post's row and the topic's row
    first_post = obj.posts && obj.posts.find_by(post_number: 1)

    if first_post
      update_posts_index(first_post.id, obj.title, category_name, tag_names, first_post.cooked)
      update_topics_index(obj.id, obj.title, first_post.cooked)
    end
  end

  if Category === obj && (obj.saved_change_to_name? || force)
    update_categories_index(obj.id, obj.name)
  end

  if Tag === obj && (obj.saved_change_to_name? || force)
    update_tags_index(obj.id, obj.name)
  end
end
# SAX handler that flattens HTML into plain text for indexing: it collects
# every text node plus the values of a few attributes (see ATTRIBUTES) into
# one space-separated String.
class HtmlScrubber < Nokogiri::XML::SAX::Document
  # attributes whose values are indexed alongside the text content
  ATTRIBUTES ||= %w{alt title href data-youtube-title}

  # combining marks that remain after NFKD decomposition of accented letters
  DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/

  # text accumulated so far
  attr_reader :scrubbed

  # Returns a copy of str decomposed to NFKD, with combining marks removed
  # and surrounding whitespace stripped. The argument is never mutated:
  # unicode_normalize may hand back the receiver itself (US-ASCII input),
  # so no in-place methods are used on its result.
  def self.strip_diacritics(str)
    str.unicode_normalize(:nfkd).gsub(DIACRITICS, "").strip
  end

  # Scrubs an HTML fragment.
  #
  # html             - HTML String; blank input yields an empty String
  # strip_diacritics - when true, accents are removed from the collected text
  #
  # Returns a new, unfrozen String.
  def self.scrub(html, strip_diacritics: false)
    return +"" if html.blank?

    me = new(strip_diacritics: strip_diacritics)
    # wrap the fragment in a single element before handing it to the SAX parser
    Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
    me.scrubbed
  end

  def initialize(strip_diacritics: false)
    @scrubbed = +""
    # for now we are disabling this per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
    @strip_diacritics = strip_diacritics
  end

  # SAX callback: indexes the values of the whitelisted attributes.
  # attributes arrives as an Array of [name, value] pairs.
  def start_element(_, attributes = [])
    attributes = Hash[*attributes.flatten]

    ATTRIBUTES.each do |name|
      characters(attributes[name]) if attributes[name].present?
    end
  end

  # SAX callback: appends a text node, padded with spaces so that adjacent
  # nodes never run together.
  def characters(str)
    str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
    scrubbed << " #{str} "
  end
end
end