From 3c84876660515272edaa294db72a4b71dfa31b07 Mon Sep 17 00:00:00 2001 From: Sam Date: Tue, 24 Jun 2014 17:10:56 +1000 Subject: [PATCH] BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it --- Gemfile | 2 + Gemfile.lock | 2 + app/jobs/scheduled/periodical_updates.rb | 2 + app/models/search_observer.rb | 21 +++++- .../20140624044600_add_raw_data_to_search.rb | 11 +++ lib/search.rb | 68 +++++++++++++++---- lib/sql_builder.rb | 4 ++ spec/models/search_observer_spec.rb | 40 +++++++++++ 8 files changed, 134 insertions(+), 16 deletions(-) create mode 100644 db/migrate/20140624044600_add_raw_data_to_search.rb create mode 100644 spec/models/search_observer_spec.rb diff --git a/Gemfile b/Gemfile index bb5ddeedd10..16a7d3ad86d 100644 --- a/Gemfile +++ b/Gemfile @@ -223,6 +223,8 @@ gem 'gctools', require: false, platform: :mri_21 gem 'stackprof', require: false, platform: :mri_21 gem 'memory_profiler', require: false, platform: :mri_21 +gem 'rmmseg-cpp', require: false + # This silly path comment just makes it easier for me to do dev # will be removed in a few weeks gem 'logster'#, path: '../logster' diff --git a/Gemfile.lock b/Gemfile.lock index d959283b4b9..91c65a82523 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -283,6 +283,7 @@ GEM rest-client (1.6.7) mime-types (>= 1.16) rinku (1.7.3) + rmmseg-cpp (0.2.9) rspec (2.14.1) rspec-core (~> 2.14.0) rspec-expectations (~> 2.14.0) @@ -462,6 +463,7 @@ DEPENDENCIES redis rest-client rinku + rmmseg-cpp rspec-given rspec-rails ruby-readability diff --git a/app/jobs/scheduled/periodical_updates.rb b/app/jobs/scheduled/periodical_updates.rb index be77a683390..75819b89f5f 100644 --- a/app/jobs/scheduled/periodical_updates.rb +++ b/app/jobs/scheduled/periodical_updates.rb @@ -31,6 +31,8 @@ module Jobs unless UserAvatar.where("last_gravatar_download_attempt IS NULL").limit(1).first Post.rebake_old(250) end + + Search.rebuild_problem_posts end end diff --git a/app/models/search_observer.rb b/app/models/search_observer.rb index bab4216265d..89798a34ee9 100644 --- a/app/models/search_observer.rb +++ b/app/models/search_observer.rb @@ -8,6 +8,8 @@ class SearchObserver < ActiveRecord::Observer end def self.update_index(table, id, search_data) + search_data = Search.prepare_data(search_data) + table_name = "#{table}_search_data" foreign_key = "#{table}_id" @@ -16,9 +18,18 @@ class SearchObserver < ActiveRecord::Observer # Would be nice to use AR here but not sure how to execut Postgres functions # when inserting data like this. - rows = Post.exec_sql_row_count("UPDATE #{table_name} SET search_data = TO_TSVECTOR('#{stemmer}', ?) WHERE #{foreign_key} = ?", search_data, id) + rows = Post.exec_sql_row_count("UPDATE #{table_name} + SET + raw_data = :search_data, + locale = :locale, + search_data = TO_TSVECTOR('#{stemmer}', :search_data) + WHERE #{foreign_key} = :id", + search_data: search_data, id: id, locale: SiteSetting.default_locale) if rows == 0 - Post.exec_sql("INSERT INTO #{table_name} (#{foreign_key}, search_data) VALUES (?, TO_TSVECTOR('#{stemmer}', ?))", id, search_data) + Post.exec_sql("INSERT INTO #{table_name} + (#{foreign_key}, search_data, locale, raw_data) + VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :search_data)", + search_data: search_data, id: id, locale: SiteSetting.default_locale) end rescue # don't allow concurrency to mess up saving a post @@ -39,7 +50,7 @@ class SearchObserver < ActiveRecord::Observer update_index('category', category_id, name) end - def after_save(obj) + def self.index(obj) if obj.class == Post && obj.cooked_changed? if obj.topic category_name = obj.topic.category.name if obj.topic.category @@ -67,6 +78,10 @@ class SearchObserver < ActiveRecord::Observer end end + def after_save(object) + SearchObserver.index(object) + end + class HtmlScrubber < Nokogiri::XML::SAX::Document attr_reader :scrubbed diff --git a/db/migrate/20140624044600_add_raw_data_to_search.rb b/db/migrate/20140624044600_add_raw_data_to_search.rb new file mode 100644 index 00000000000..51fe1c4e8ef --- /dev/null +++ b/db/migrate/20140624044600_add_raw_data_to_search.rb @@ -0,0 +1,11 @@ +class AddRawDataToSearch < ActiveRecord::Migration + def change + add_column :post_search_data, :raw_data, :text + add_column :user_search_data, :raw_data, :text + add_column :category_search_data, :raw_data, :text + + add_column :post_search_data, :locale, :string + add_column :user_search_data, :locale, :text + add_column :category_search_data, :locale, :text + end +end diff --git a/lib/search.rb b/lib/search.rb index b21ba0fe7c0..2400918799b 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -19,25 +19,67 @@ class Search end def self.long_locale - case I18n.locale # Currently-present in /conf/locales/* only, sorry :-( Add as needed - when :da then 'danish' - when :de then 'german' - when :en then 'english' - when :es then 'spanish' - when :fr then 'french' - when :it then 'italian' - when :ja then 'japanese' - when :nl then 'dutch' - when :pt then 'portuguese' - when :sv then 'swedish' - when :ru then 'russian' + # if adding a language see: + # /usr/share/postgresql/9.3/tsearch_data for possible options + # Do not add languages that are missing without amending the + # base docker config + # + case SiteSetting.default_locale.to_sym + when :da then 'danish' + when :de then 'german' + when :en then 'english' + when :es then 'spanish' + when :fr then 'french' + when :it then 'italian' + when :nl then 'dutch' + when :nb_NO then 'norwegian' + when :pt then 'portuguese' + when :pt_BR then 'portuguese' + when :sv then 'swedish' + when :ru then 'russian' else 'simple' # use the 'simple' stemmer for other languages end end + def self.rebuild_problem_posts(limit = 10000) + posts = Post.joins(:topic) + .where('posts.id NOT IN ( + SELECT post_id from post_search_data + WHERE locale = ? + )', SiteSetting.default_locale).limit(10000) + + posts.each do |post| + post.cooked += " " + SearchObserver.index(post) + end + + nil + end + + def self.prepare_data(search_data) + data = search_data.squish + # TODO rmmseg is designed for chinese, we need something else for Korean / Japanese + if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) + unless defined? RMMSeg + require 'rmmseg' + RMMSeg::Dictionary.load_dictionaries + end + + algo = RMMSeg::Algorithm.new(search_data) + + data = "" + while token = algo.next_token + data << token.text << " " + end + end + + data.force_encoding("UTF-8") + data + end + def initialize(term, opts=nil) if term.present? - @term = term.to_s + @term = Search.prepare_data(term.to_s) @original_term = PG::Connection.escape_string(@term) end diff --git a/lib/sql_builder.rb b/lib/sql_builder.rb index 69c2b2cc904..5a99f27718c 100644 --- a/lib/sql_builder.rb +++ b/lib/sql_builder.rb @@ -77,6 +77,10 @@ class SqlBuilder 16 => :value_to_boolean } + def self.map_exec(klass, sql, args = {}) + self.new(sql).map_exec(klass, args) + end + def map_exec(klass = OpenStruct, args = {}) results = exec(args) diff --git a/spec/models/search_observer_spec.rb b/spec/models/search_observer_spec.rb new file mode 100644 index 00000000000..ee4101471ce --- /dev/null +++ b/spec/models/search_observer_spec.rb @@ -0,0 +1,40 @@ +require 'spec_helper' + +describe SearchObserver do + + def get_row(post_id) + SqlBuilder.map_exec( + OpenStruct, + "select * from post_search_data where post_id = :post_id", + post_id: post_id + ).first + end + + it 'correctly indexes chinese' do + SiteSetting.default_locale = 'zh_CN' + data = "你好世界" + data.split(" ").length.should == 1 + + SearchObserver.update_posts_index(99, "你好世界", "", nil) + + row = get_row(99) + row.raw_data.split(' ').length.should == 2 + end + + it 'correctly indexes a post' do + data = "This is a test" + + SearchObserver.update_posts_index(99, data, "", nil) + + row = get_row(99) + + row.raw_data.should == "This is a test" + row.locale.should == "en" + + SearchObserver.update_posts_index(99, "tester", "", nil) + + row = get_row(99) + + row.raw_data.should == "tester" + end +end