diff --git a/lib/search.rb b/lib/search.rb index 516660b3414..def3be0fd72 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -69,19 +69,17 @@ class Search SiteSetting.search_tokenize_chinese_japanese_korean end - def self.prepare_data(search_data, purpose = :query) - purpose ||= :query - + def self.prepare_data(search_data, purpose = nil) data = search_data.dup data.force_encoding("UTF-8") + if purpose != :topic # TODO cppjieba_rb is designed for chinese, we need something else for Japanese # Korean appears to be safe cause words are already space separated # For Japanese we should investigate using kakasi if segment_cjk? require 'cppjieba_rb' unless defined? CppjiebaRb - mode = (purpose == :query ? :query : :mix) - data = CppjiebaRb.segment(search_data, mode: mode) + data = CppjiebaRb.segment(search_data, mode: :mix) # TODO: we still want to tokenize here but the current stopword list is too wide # in cppjieba leading to words such as volume to be skipped. PG already has an English diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index c8f2a5c9521..d3c61aacbbf 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -1107,7 +1107,7 @@ describe Search do it 'splits English / Chinese and filter out stop words' do SiteSetting.default_locale = 'zh_CN' data = Search.prepare_data(sentence).split(' ') - expect(data).to eq(["Discourse", "中国", "基础", "设施", "基础设施", "网络", "正在", "组装"]) + expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"]) end it 'splits for indexing and filter out stop words' do @@ -1119,12 +1119,6 @@ describe Search do it 'splits English / Traditional Chinese and filter out stop words' do SiteSetting.default_locale = 'zh_TW' data = Search.prepare_data(sentence_t).split(' ') - expect(data).to eq(["Discourse", "太平", "平山", "太平山", "森林", "遊樂區"]) - end - - it 'splits for indexing and filter out stop words' do - SiteSetting.default_locale = 'zh_TW' - data = Search.prepare_data(sentence_t, :index).split(' ') expect(data).to eq(["Discourse", "太平山", "森林", "遊樂區"]) end