From f1eccd113c643a8c7e785187cda8849a0efea86b Mon Sep 17 00:00:00 2001 From: Erick Guan Date: Mon, 31 Jul 2017 21:28:48 +0200 Subject: [PATCH] Replace rmmseg gem for cppjieba_rb since better dictionary (#5006) * Rename locale to ts config in search module to make it clear * Replace rmmseg-cpp for cppjieba_rb --- Gemfile | 2 +- Gemfile.lock | 4 +-- app/services/search_indexer.rb | 4 +-- lib/search.rb | 63 ++++++++++++++++------------------ spec/components/search_spec.rb | 27 +++++++++++++-- 5 files changed, 58 insertions(+), 42 deletions(-) diff --git a/Gemfile b/Gemfile index 217d13c6b12..3e3f46af53f 100644 --- a/Gemfile +++ b/Gemfile @@ -183,7 +183,7 @@ gem 'simple-rss', require: false gem 'stackprof', require: false, platform: :mri gem 'memory_profiler', require: false, platform: :mri -gem 'rmmseg-cpp', require: false +gem 'cppjieba_rb', require: false gem 'logster' diff --git a/Gemfile.lock b/Gemfile.lock index dd0872fce47..aeed7d7d32c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -71,6 +71,7 @@ GEM coderay (1.1.1) concurrent-ruby (1.0.5) connection_pool (2.2.1) + cppjieba_rb (0.3.0) crack (0.4.3) safe_yaml (~> 1.0.0) crass (1.0.2) @@ -288,7 +289,6 @@ GEM redis-namespace (1.5.3) redis (~> 3.0, >= 3.0.4) rinku (2.0.2) - rmmseg-cpp (0.2.9) rspec (3.6.0) rspec-core (~> 3.6.0) rspec-expectations (~> 3.6.0) @@ -396,6 +396,7 @@ DEPENDENCIES bullet byebug certified + cppjieba_rb discourse-qunit-rails discourse_image_optim email_reply_trimmer (= 0.1.7) @@ -460,7 +461,6 @@ DEPENDENCIES redis redis-namespace rinku - rmmseg-cpp rspec rspec-html-matchers rspec-rails diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 6eb92412863..e294f41131a 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -15,7 +15,7 @@ class SearchIndexer end def self.update_index(table, id, raw_data) - raw_data = Search.prepare_data(raw_data) + raw_data = Search.prepare_data(raw_data, :index) table_name = "#{table}_search_data" 
foreign_key = "#{table}_id" @@ -31,7 +31,7 @@ class SearchIndexer end # for user login and name use "simple" lowercase stemmer - stemmer = table == "user" ? "simple" : Search.long_locale + stemmer = table == "user" ? "simple" : Search.ts_config # Would be nice to use AR here but not sure how to execut Postgres functions # when inserting data like this. diff --git a/lib/search.rb b/lib/search.rb index 339d30473e1..67f8a239936 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -20,13 +20,15 @@ class Search %w(topic category user private_messages) end - def self.long_locale - # if adding a language see: - # /usr/share/postgresql/9.3/tsearch_data for possible options - # Do not add languages that are missing without amending the + def self.ts_config(locale = SiteSetting.default_locale) + # if adding a text search configuration, you should check PG beforehand: + # SELECT cfgname FROM pg_ts_config; + # As an aside, dictionaries can be listed by `\dFd`, the + # physical locations are in /usr/share/postgresql//tsearch_data. + # But it may not appear there based on pg extension configuration. # base docker config # - case SiteSetting.default_locale.to_sym + case locale.to_sym when :da then 'danish' when :de then 'german' when :en then 'english' @@ -39,7 +41,7 @@ class Search when :pt_BR then 'portuguese' when :sv then 'swedish' when :ru then 'russian' - else 'simple' # use the 'simple' stemmer for other languages + else 'simple' # use the 'simple' stemmer for other languages end end @@ -73,21 +75,16 @@ class Search nil end - def self.prepare_data(search_data) + def self.prepare_data(search_data, purpose = :query) data = search_data.squish - # TODO rmmseg is designed for chinese, we need something else for Korean / Japanese + # TODO cppjieba_rb is designed for chinese, we need something else for Korean / Japanese if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean - unless defined? 
RMMSeg - require 'rmmseg' - RMMSeg::Dictionary.load_dictionaries - end - - algo = RMMSeg::Algorithm.new(search_data) - - data = "" - while token = algo.next_token - data << token.text << " " + unless defined? CppjiebaRb + require 'cppjieba_rb' end + mode = (purpose == :query ? :query : :mix) + data = CppjiebaRb.segment(search_data, mode: mode) + data = CppjiebaRb.filter_stop_word(data).join(' ') end data.force_encoding("UTF-8") @@ -477,7 +474,7 @@ class Search FROM topic_tags tt, tags WHERE tt.tag_id = tags.id GROUP BY tt.topic_id - HAVING to_tsvector(#{query_locale}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{query_locale}, ?) + HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{default_ts_config}, ?) )", tags.join('&')) else tags = match.split(",") @@ -678,7 +675,6 @@ class Search posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%") else posts = posts.where("post_search_data.search_data @@ #{ts_query}") - exact_terms = @term.scan(/"([^"]+)"/).flatten exact_terms.each do |exact| posts = posts.where("posts.raw ilike ?", "%#{exact}%") @@ -740,7 +736,7 @@ class Search posts = posts.order("posts.like_count DESC") end else - posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{query_locale}, topics.title), #{ts_query}) DESC") + posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC") data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})" if opts[:aggregate_search] @@ -760,34 +756,33 @@ class Search posts.limit(limit) end - def self.query_locale - "'#{Search.long_locale}'" + def self.default_ts_config + "'#{Search.ts_config}'" end - def query_locale - self.class.query_locale + def default_ts_config + self.class.default_ts_config end - def self.ts_query(term, locale = nil, joiner = "&") + def self.ts_query(term, ts_config = nil, joiner = "&") - data = Post.exec_sql("SELECT 
to_tsvector(:locale, :term)", - locale: 'simple', - term: term - ).values[0][0] + data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)", + config: 'simple', + term: term).values[0][0] - locale = Post.sanitize(locale) if locale + ts_config = Post.sanitize(ts_config) if ts_config all_terms = data.scan(/'([^']+)'\:\d+/).flatten all_terms.map! do |t| t.split(/[\)\(&']/)[0] end.compact! query = Post.sanitize(all_terms.map { |t| "'#{PG::Connection.escape_string(t)}':*" }.join(" #{joiner} ")) - "TO_TSQUERY(#{locale || query_locale}, #{query})" + "TO_TSQUERY(#{ts_config || default_ts_config}, #{query})" end - def ts_query(locale = nil) + def ts_query(ts_config = nil) @ts_query_cache ||= {} - @ts_query_cache[(locale || query_locale) + " " + @term] ||= Search.ts_query(@term, locale) + @ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config) end def wrap_rows(query) diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index 7d0f86cff4a..7700bdb077e 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -440,10 +440,31 @@ describe Search do end describe 'Chinese search' do - it 'splits English / Chinese' do + let(:sentence) { 'Discourse中国的基础设施网络正在组装' } + let(:sentence_t) { 'Discourse太平山森林遊樂區' } + + it 'splits English / Chinese and filter out stop words' do SiteSetting.default_locale = 'zh_CN' - data = Search.prepare_data('Discourse社区指南').split(' ') - expect(data).to eq(['Discourse', '社区', '指南']) + data = Search.prepare_data(sentence).split(' ') + expect(data).to eq(["Discourse", "中国", "基础", "设施", "基础设施", "网络", "正在", "组装"]) + end + + it 'splits for indexing and filter out stop words' do + SiteSetting.default_locale = 'zh_CN' + data = Search.prepare_data(sentence, :index).split(' ') + expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"]) + end + + it 'splits English / Traditional Chinese and filter out stop words' do + SiteSetting.default_locale = 'zh_TW' + data = 
Search.prepare_data(sentence_t).split(' ') + expect(data).to eq(["Discourse", "太平", "平山", "太平山", "森林", "遊樂區"]) + end + + it 'splits for indexing and filter out stop words' do + SiteSetting.default_locale = 'zh_TW' + data = Search.prepare_data(sentence_t, :index).split(' ') + expect(data).to eq(["Discourse", "太平山", "森林", "遊樂區"]) end it 'finds chinese topic based on title' do