mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 12:12:26 +08:00
Replace rmmseg gem with cppjieba_rb for its better dictionary (#5006)
* Rename locale to ts config in search module for clarity * Replace rmmseg-cpp with cppjieba_rb
This commit is contained in:
parent
6c997b65d9
commit
f1eccd113c
2
Gemfile
2
Gemfile
|
@ -183,7 +183,7 @@ gem 'simple-rss', require: false
|
||||||
gem 'stackprof', require: false, platform: :mri
|
gem 'stackprof', require: false, platform: :mri
|
||||||
gem 'memory_profiler', require: false, platform: :mri
|
gem 'memory_profiler', require: false, platform: :mri
|
||||||
|
|
||||||
gem 'rmmseg-cpp', require: false
|
gem 'cppjieba_rb', require: false
|
||||||
|
|
||||||
gem 'logster'
|
gem 'logster'
|
||||||
|
|
||||||
|
|
|
@ -71,6 +71,7 @@ GEM
|
||||||
coderay (1.1.1)
|
coderay (1.1.1)
|
||||||
concurrent-ruby (1.0.5)
|
concurrent-ruby (1.0.5)
|
||||||
connection_pool (2.2.1)
|
connection_pool (2.2.1)
|
||||||
|
cppjieba_rb (0.3.0)
|
||||||
crack (0.4.3)
|
crack (0.4.3)
|
||||||
safe_yaml (~> 1.0.0)
|
safe_yaml (~> 1.0.0)
|
||||||
crass (1.0.2)
|
crass (1.0.2)
|
||||||
|
@ -288,7 +289,6 @@ GEM
|
||||||
redis-namespace (1.5.3)
|
redis-namespace (1.5.3)
|
||||||
redis (~> 3.0, >= 3.0.4)
|
redis (~> 3.0, >= 3.0.4)
|
||||||
rinku (2.0.2)
|
rinku (2.0.2)
|
||||||
rmmseg-cpp (0.2.9)
|
|
||||||
rspec (3.6.0)
|
rspec (3.6.0)
|
||||||
rspec-core (~> 3.6.0)
|
rspec-core (~> 3.6.0)
|
||||||
rspec-expectations (~> 3.6.0)
|
rspec-expectations (~> 3.6.0)
|
||||||
|
@ -396,6 +396,7 @@ DEPENDENCIES
|
||||||
bullet
|
bullet
|
||||||
byebug
|
byebug
|
||||||
certified
|
certified
|
||||||
|
cppjieba_rb
|
||||||
discourse-qunit-rails
|
discourse-qunit-rails
|
||||||
discourse_image_optim
|
discourse_image_optim
|
||||||
email_reply_trimmer (= 0.1.7)
|
email_reply_trimmer (= 0.1.7)
|
||||||
|
@ -460,7 +461,6 @@ DEPENDENCIES
|
||||||
redis
|
redis
|
||||||
redis-namespace
|
redis-namespace
|
||||||
rinku
|
rinku
|
||||||
rmmseg-cpp
|
|
||||||
rspec
|
rspec
|
||||||
rspec-html-matchers
|
rspec-html-matchers
|
||||||
rspec-rails
|
rspec-rails
|
||||||
|
|
|
@ -15,7 +15,7 @@ class SearchIndexer
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.update_index(table, id, raw_data)
|
def self.update_index(table, id, raw_data)
|
||||||
raw_data = Search.prepare_data(raw_data)
|
raw_data = Search.prepare_data(raw_data, :index)
|
||||||
|
|
||||||
table_name = "#{table}_search_data"
|
table_name = "#{table}_search_data"
|
||||||
foreign_key = "#{table}_id"
|
foreign_key = "#{table}_id"
|
||||||
|
@ -31,7 +31,7 @@ class SearchIndexer
|
||||||
end
|
end
|
||||||
|
|
||||||
# for user login and name use "simple" lowercase stemmer
|
# for user login and name use "simple" lowercase stemmer
|
||||||
stemmer = table == "user" ? "simple" : Search.long_locale
|
stemmer = table == "user" ? "simple" : Search.ts_config
|
||||||
|
|
||||||
# Would be nice to use AR here but not sure how to execut Postgres functions
|
# Would be nice to use AR here but not sure how to execut Postgres functions
|
||||||
# when inserting data like this.
|
# when inserting data like this.
|
||||||
|
|
|
@ -20,13 +20,15 @@ class Search
|
||||||
%w(topic category user private_messages)
|
%w(topic category user private_messages)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.long_locale
|
def self.ts_config(locale = SiteSetting.default_locale)
|
||||||
# if adding a language see:
|
# if adding a text search configuration, you should check PG beforehand:
|
||||||
# /usr/share/postgresql/9.3/tsearch_data for possible options
|
# SELECT cfgname FROM pg_ts_config;
|
||||||
# Do not add languages that are missing without amending the
|
# As an aside, dictionaries can be listed by `\dFd`, the
|
||||||
|
# physical locations are in /usr/share/postgresql/<version>/tsearch_data.
|
||||||
|
# But it may not appear there based on pg extension configuration.
|
||||||
# base docker config
|
# base docker config
|
||||||
#
|
#
|
||||||
case SiteSetting.default_locale.to_sym
|
case locale.to_sym
|
||||||
when :da then 'danish'
|
when :da then 'danish'
|
||||||
when :de then 'german'
|
when :de then 'german'
|
||||||
when :en then 'english'
|
when :en then 'english'
|
||||||
|
@ -39,7 +41,7 @@ class Search
|
||||||
when :pt_BR then 'portuguese'
|
when :pt_BR then 'portuguese'
|
||||||
when :sv then 'swedish'
|
when :sv then 'swedish'
|
||||||
when :ru then 'russian'
|
when :ru then 'russian'
|
||||||
else 'simple' # use the 'simple' stemmer for other languages
|
else 'simple' # use the 'simple' stemmer for other languages
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -73,21 +75,16 @@ class Search
|
||||||
nil
|
nil
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.prepare_data(search_data)
|
def self.prepare_data(search_data, purpose = :query)
|
||||||
data = search_data.squish
|
data = search_data.squish
|
||||||
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
|
# TODO cppjieba_rb is designed for chinese, we need something else for Korean / Japanese
|
||||||
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
|
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
|
||||||
unless defined? RMMSeg
|
unless defined? CppjiebaRb
|
||||||
require 'rmmseg'
|
require 'cppjieba_rb'
|
||||||
RMMSeg::Dictionary.load_dictionaries
|
|
||||||
end
|
|
||||||
|
|
||||||
algo = RMMSeg::Algorithm.new(search_data)
|
|
||||||
|
|
||||||
data = ""
|
|
||||||
while token = algo.next_token
|
|
||||||
data << token.text << " "
|
|
||||||
end
|
end
|
||||||
|
mode = (purpose == :query ? :query : :mix)
|
||||||
|
data = CppjiebaRb.segment(search_data, mode: mode)
|
||||||
|
data = CppjiebaRb.filter_stop_word(data).join(' ')
|
||||||
end
|
end
|
||||||
|
|
||||||
data.force_encoding("UTF-8")
|
data.force_encoding("UTF-8")
|
||||||
|
@ -477,7 +474,7 @@ class Search
|
||||||
FROM topic_tags tt, tags
|
FROM topic_tags tt, tags
|
||||||
WHERE tt.tag_id = tags.id
|
WHERE tt.tag_id = tags.id
|
||||||
GROUP BY tt.topic_id
|
GROUP BY tt.topic_id
|
||||||
HAVING to_tsvector(#{query_locale}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{query_locale}, ?)
|
HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
|
||||||
)", tags.join('&'))
|
)", tags.join('&'))
|
||||||
else
|
else
|
||||||
tags = match.split(",")
|
tags = match.split(",")
|
||||||
|
@ -678,7 +675,6 @@ class Search
|
||||||
posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%")
|
posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%")
|
||||||
else
|
else
|
||||||
posts = posts.where("post_search_data.search_data @@ #{ts_query}")
|
posts = posts.where("post_search_data.search_data @@ #{ts_query}")
|
||||||
|
|
||||||
exact_terms = @term.scan(/"([^"]+)"/).flatten
|
exact_terms = @term.scan(/"([^"]+)"/).flatten
|
||||||
exact_terms.each do |exact|
|
exact_terms.each do |exact|
|
||||||
posts = posts.where("posts.raw ilike ?", "%#{exact}%")
|
posts = posts.where("posts.raw ilike ?", "%#{exact}%")
|
||||||
|
@ -740,7 +736,7 @@ class Search
|
||||||
posts = posts.order("posts.like_count DESC")
|
posts = posts.order("posts.like_count DESC")
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{query_locale}, topics.title), #{ts_query}) DESC")
|
posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC")
|
||||||
|
|
||||||
data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})"
|
data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})"
|
||||||
if opts[:aggregate_search]
|
if opts[:aggregate_search]
|
||||||
|
@ -760,34 +756,33 @@ class Search
|
||||||
posts.limit(limit)
|
posts.limit(limit)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.query_locale
|
def self.default_ts_config
|
||||||
"'#{Search.long_locale}'"
|
"'#{Search.ts_config}'"
|
||||||
end
|
end
|
||||||
|
|
||||||
def query_locale
|
def default_ts_config
|
||||||
self.class.query_locale
|
self.class.default_ts_config
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.ts_query(term, locale = nil, joiner = "&")
|
def self.ts_query(term, ts_config = nil, joiner = "&")
|
||||||
|
|
||||||
data = Post.exec_sql("SELECT to_tsvector(:locale, :term)",
|
data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)",
|
||||||
locale: 'simple',
|
config: 'simple',
|
||||||
term: term
|
term: term).values[0][0]
|
||||||
).values[0][0]
|
|
||||||
|
|
||||||
locale = Post.sanitize(locale) if locale
|
ts_config = Post.sanitize(ts_config) if ts_config
|
||||||
all_terms = data.scan(/'([^']+)'\:\d+/).flatten
|
all_terms = data.scan(/'([^']+)'\:\d+/).flatten
|
||||||
all_terms.map! do |t|
|
all_terms.map! do |t|
|
||||||
t.split(/[\)\(&']/)[0]
|
t.split(/[\)\(&']/)[0]
|
||||||
end.compact!
|
end.compact!
|
||||||
|
|
||||||
query = Post.sanitize(all_terms.map { |t| "'#{PG::Connection.escape_string(t)}':*" }.join(" #{joiner} "))
|
query = Post.sanitize(all_terms.map { |t| "'#{PG::Connection.escape_string(t)}':*" }.join(" #{joiner} "))
|
||||||
"TO_TSQUERY(#{locale || query_locale}, #{query})"
|
"TO_TSQUERY(#{ts_config || default_ts_config}, #{query})"
|
||||||
end
|
end
|
||||||
|
|
||||||
def ts_query(locale = nil)
|
def ts_query(ts_config = nil)
|
||||||
@ts_query_cache ||= {}
|
@ts_query_cache ||= {}
|
||||||
@ts_query_cache[(locale || query_locale) + " " + @term] ||= Search.ts_query(@term, locale)
|
@ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config)
|
||||||
end
|
end
|
||||||
|
|
||||||
def wrap_rows(query)
|
def wrap_rows(query)
|
||||||
|
|
|
@ -440,10 +440,31 @@ describe Search do
|
||||||
end
|
end
|
||||||
|
|
||||||
describe 'Chinese search' do
|
describe 'Chinese search' do
|
||||||
it 'splits English / Chinese' do
|
let(:sentence) { 'Discourse中国的基础设施网络正在组装' }
|
||||||
|
let(:sentence_t) { 'Discourse太平山森林遊樂區' }
|
||||||
|
|
||||||
|
it 'splits English / Chinese and filter out stop words' do
|
||||||
SiteSetting.default_locale = 'zh_CN'
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
data = Search.prepare_data('Discourse社区指南').split(' ')
|
data = Search.prepare_data(sentence).split(' ')
|
||||||
expect(data).to eq(['Discourse', '社区', '指南'])
|
expect(data).to eq(["Discourse", "中国", "基础", "设施", "基础设施", "网络", "正在", "组装"])
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'splits for indexing and filter out stop words' do
|
||||||
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
|
data = Search.prepare_data(sentence, :index).split(' ')
|
||||||
|
expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"])
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'splits English / Traditional Chinese and filter out stop words' do
|
||||||
|
SiteSetting.default_locale = 'zh_TW'
|
||||||
|
data = Search.prepare_data(sentence_t).split(' ')
|
||||||
|
expect(data).to eq(["Discourse", "太平", "平山", "太平山", "森林", "遊樂區"])
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'splits for indexing and filter out stop words' do
|
||||||
|
SiteSetting.default_locale = 'zh_TW'
|
||||||
|
data = Search.prepare_data(sentence_t, :index).split(' ')
|
||||||
|
expect(data).to eq(["Discourse", "太平山", "森林", "遊樂區"])
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'finds chinese topic based on title' do
|
it 'finds chinese topic based on title' do
|
||||||
|
|
Loading…
Reference in New Issue
Block a user