From f1eccd113c643a8c7e785187cda8849a0efea86b Mon Sep 17 00:00:00 2001
From: Erick Guan <fantasticfears@gmail.com>
Date: Mon, 31 Jul 2017 21:28:48 +0200
Subject: [PATCH] Replace rmmseg gem with cppjieba_rb for its better dictionary
 (#5006)

* Rename locale to ts config in search module to make it clear

* Replace rmmseg-cpp with cppjieba_rb
---
 Gemfile                        |  2 +-
 Gemfile.lock                   |  4 +--
 app/services/search_indexer.rb |  4 +--
 lib/search.rb                  | 63 ++++++++++++++++------------------
 spec/components/search_spec.rb | 27 +++++++++++++--
 5 files changed, 58 insertions(+), 42 deletions(-)
diff --git a/Gemfile b/Gemfile
index 217d13c6b12..3e3f46af53f 100644
--- a/Gemfile
+++ b/Gemfile
@@ -183,7 +183,7 @@ gem 'simple-rss', require: false
 gem 'stackprof', require: false, platform: :mri
 gem 'memory_profiler', require: false, platform: :mri
 
-gem 'rmmseg-cpp', require: false
+gem 'cppjieba_rb', require: false
 
 gem 'logster'
 
diff --git a/Gemfile.lock b/Gemfile.lock
index dd0872fce47..aeed7d7d32c 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -71,6 +71,7 @@ GEM
     coderay (1.1.1)
     concurrent-ruby (1.0.5)
     connection_pool (2.2.1)
+    cppjieba_rb (0.3.0)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     crass (1.0.2)
@@ -288,7 +289,6 @@ GEM
     redis-namespace (1.5.3)
       redis (~> 3.0, >= 3.0.4)
     rinku (2.0.2)
-    rmmseg-cpp (0.2.9)
     rspec (3.6.0)
       rspec-core (~> 3.6.0)
       rspec-expectations (~> 3.6.0)
@@ -396,6 +396,7 @@ DEPENDENCIES
   bullet
   byebug
   certified
+  cppjieba_rb
   discourse-qunit-rails
   discourse_image_optim
   email_reply_trimmer (= 0.1.7)
@@ -460,7 +461,6 @@ DEPENDENCIES
   redis
   redis-namespace
   rinku
-  rmmseg-cpp
   rspec
   rspec-html-matchers
   rspec-rails
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index 6eb92412863..e294f41131a 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -15,7 +15,7 @@ class SearchIndexer
   end
 
   def self.update_index(table, id, raw_data)
-    raw_data = Search.prepare_data(raw_data)
+    raw_data = Search.prepare_data(raw_data, :index)
 
     table_name = "#{table}_search_data"
     foreign_key = "#{table}_id"
@@ -31,7 +31,7 @@ class SearchIndexer
     end
 
     # for user login and name use "simple" lowercase stemmer
-    stemmer = table == "user" ? "simple" : Search.long_locale
+    stemmer = table == "user" ? "simple" : Search.ts_config
 
     # Would be nice to use AR here but not sure how to execut Postgres functions
     # when inserting data like this.
diff --git a/lib/search.rb b/lib/search.rb
index 339d30473e1..67f8a239936 100644
--- a/lib/search.rb
+++ b/lib/search.rb
@@ -20,13 +20,15 @@ class Search
     %w(topic category user private_messages)
   end
 
-  def self.long_locale
-    # if adding a language see:
-    # /usr/share/postgresql/9.3/tsearch_data for possible options
-    # Do not add languages that are missing without amending the
+  def self.ts_config(locale = SiteSetting.default_locale)
+    # if adding a text search configuration, you should check PG beforehand:
+    # SELECT cfgname FROM pg_ts_config;
+    # As an aside, dictionaries can be listed by `\dFd`, the
+    # physical locations are in /usr/share/postgresql/<version>/tsearch_data.
+    # But it may not appear there based on pg extension configuration.
     # base docker config
     #
-    case SiteSetting.default_locale.to_sym
+    case locale.to_sym
     when :da     then 'danish'
     when :de     then 'german'
     when :en     then 'english'
@@ -39,7 +41,7 @@ class Search
     when :pt_BR  then 'portuguese'
     when :sv     then 'swedish'
     when :ru     then 'russian'
-      else 'simple' # use the 'simple' stemmer for other languages
+    else 'simple' # use the 'simple' stemmer for other languages
     end
   end
 
@@ -73,21 +75,16 @@ class Search
     nil
   end
 
-  def self.prepare_data(search_data)
+  def self.prepare_data(search_data, purpose = :query)
     data = search_data.squish
-    # TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
+    # TODO cppjieba_rb is designed for chinese, we need something else for Korean / Japanese
     if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
-      unless defined? RMMSeg
-        require 'rmmseg'
-        RMMSeg::Dictionary.load_dictionaries
-      end
-
-      algo = RMMSeg::Algorithm.new(search_data)
-
-      data = ""
-      while token = algo.next_token
-        data << token.text << " "
+      unless defined? CppjiebaRb
+        require 'cppjieba_rb'
       end
+      mode = (purpose == :query ? :query : :mix)
+      data = CppjiebaRb.segment(search_data, mode: mode)
+      data = CppjiebaRb.filter_stop_word(data).join(' ')
     end
 
     data.force_encoding("UTF-8")
@@ -477,7 +474,7 @@ class Search
       FROM topic_tags tt, tags
       WHERE tt.tag_id = tags.id
       GROUP BY tt.topic_id
-      HAVING to_tsvector(#{query_locale}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{query_locale}, ?)
+      HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
       )", tags.join('&'))
     else
       tags = match.split(",")
@@ -678,7 +675,6 @@ class Search
           posts = posts.where("posts.raw  || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%")
         else
           posts = posts.where("post_search_data.search_data @@ #{ts_query}")
-
           exact_terms = @term.scan(/"([^"]+)"/).flatten
           exact_terms.each do |exact|
             posts = posts.where("posts.raw ilike ?", "%#{exact}%")
@@ -740,7 +736,7 @@ class Search
           posts = posts.order("posts.like_count DESC")
         end
       else
-        posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{query_locale}, topics.title), #{ts_query}) DESC")
+        posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC")
 
         data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})"
         if opts[:aggregate_search]
@@ -760,34 +756,33 @@ class Search
       posts.limit(limit)
     end
 
-    def self.query_locale
-      "'#{Search.long_locale}'"
+    def self.default_ts_config
+      "'#{Search.ts_config}'"
     end
 
-    def query_locale
-      self.class.query_locale
+    def default_ts_config
+      self.class.default_ts_config
     end
 
-    def self.ts_query(term, locale = nil, joiner = "&")
+    def self.ts_query(term, ts_config = nil, joiner = "&")
 
-      data = Post.exec_sql("SELECT to_tsvector(:locale, :term)",
-                            locale: 'simple',
-                            term: term
-                          ).values[0][0]
+      data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)",
+                           config: 'simple',
+                           term: term).values[0][0]
 
-      locale = Post.sanitize(locale) if locale
+      ts_config = Post.sanitize(ts_config) if ts_config
       all_terms = data.scan(/'([^']+)'\:\d+/).flatten
       all_terms.map! do |t|
         t.split(/[\)\(&']/)[0]
       end.compact!
 
       query = Post.sanitize(all_terms.map { |t| "'#{PG::Connection.escape_string(t)}':*" }.join(" #{joiner} "))
-      "TO_TSQUERY(#{locale || query_locale}, #{query})"
+      "TO_TSQUERY(#{ts_config || default_ts_config}, #{query})"
     end
 
-    def ts_query(locale = nil)
+    def ts_query(ts_config = nil)
       @ts_query_cache ||= {}
-      @ts_query_cache[(locale || query_locale) + " " + @term] ||= Search.ts_query(@term, locale)
+      @ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config)
     end
 
     def wrap_rows(query)
diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb
index 7d0f86cff4a..7700bdb077e 100644
--- a/spec/components/search_spec.rb
+++ b/spec/components/search_spec.rb
@@ -440,10 +440,31 @@ describe Search do
   end
 
   describe 'Chinese search' do
-    it 'splits English / Chinese' do
+    let(:sentence) { 'Discourse中国的基础设施网络正在组装' }
+    let(:sentence_t) { 'Discourse太平山森林遊樂區' }
+
+    it 'splits English / Chinese and filter out stop words' do
       SiteSetting.default_locale = 'zh_CN'
-      data = Search.prepare_data('Discourse社区指南').split(' ')
-      expect(data).to eq(['Discourse', '社区', '指南'])
+      data = Search.prepare_data(sentence).split(' ')
+      expect(data).to eq(["Discourse", "中国", "基础", "设施", "基础设施", "网络", "正在", "组装"])
+    end
+
+    it 'splits for indexing and filter out stop words' do
+      SiteSetting.default_locale = 'zh_CN'
+      data = Search.prepare_data(sentence, :index).split(' ')
+      expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"])
+    end
+
+    it 'splits English / Traditional Chinese and filter out stop words' do
+      SiteSetting.default_locale = 'zh_TW'
+      data = Search.prepare_data(sentence_t).split(' ')
+      expect(data).to eq(["Discourse", "太平", "平山", "太平山", "森林", "遊樂區"])
+    end
+
+    it 'splits for indexing and filter out stop words' do
+      SiteSetting.default_locale = 'zh_TW'
+      data = Search.prepare_data(sentence_t, :index).split(' ')
+      expect(data).to eq(["Discourse", "太平山", "森林", "遊樂區"])
     end
 
     it 'finds chinese topic based on title' do