mirror of
https://github.com/discourse/discourse.git
synced 2025-04-02 08:55:56 +08:00
FIX: Inject extra lexemes for host lexeme.
```
discourse_development=# SELECT alias, lexemes FROM TS_DEBUG('www.discourse.org');
 alias |       lexemes
-------+---------------------
 host  | {www.discourse.org}

discourse_development=# SELECT TO_TSVECTOR('www.discourse.org');
      to_tsvector
-----------------------
 'www.discourse.org':1
```

Given the above lexeme, we will inject additional lexemes by splitting the host on `.`. The actual tsvector stored will look something like

```
               tsvector
---------------------------------------
 'discourse':1 'discourse.org':1 'org':1 'www':1 'www.discourse.org':1
```
This commit is contained in:
parent
5c31216aea
commit
5c230266d3
@ -16,26 +16,9 @@ class SearchIndexer
|
|||||||
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
|
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.inject_extra_terms(raw)
|
|
||||||
return raw if !SiteSetting.search_inject_extra_terms
|
|
||||||
|
|
||||||
# insert some extra words for I.am.a.word so "word" is tokenized
|
|
||||||
# I.am.a.word becomes I.am.a.word am a word
|
|
||||||
raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
|
|
||||||
|
|
||||||
split = with_dot.split(/https?:\/\/|[?:;,.\/]/)
|
|
||||||
|
|
||||||
if split.length > 1
|
|
||||||
with_dot + ((+" ") << split[1..-1].reject { |x| x.blank? }.join(" "))
|
|
||||||
else
|
|
||||||
with_dot
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.update_index(table: , id: , raw_data:)
|
def self.update_index(table: , id: , raw_data:)
|
||||||
search_data = raw_data.map do |data|
|
search_data = raw_data.map do |data|
|
||||||
inject_extra_terms(Search.prepare_data(data || "", :index))
|
Search.prepare_data(data || "", :index)
|
||||||
end
|
end
|
||||||
|
|
||||||
table_name = "#{table}_search_data"
|
table_name = "#{table}_search_data"
|
||||||
@ -53,15 +36,39 @@ class SearchIndexer
|
|||||||
|
|
||||||
indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
|
indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
|
||||||
|
|
||||||
params = {
|
ranked_params = {
|
||||||
a: search_data[0],
|
a: search_data[0],
|
||||||
b: search_data[1],
|
b: search_data[1],
|
||||||
c: search_data[2],
|
c: search_data[2],
|
||||||
d: search_data[3],
|
d: search_data[3],
|
||||||
|
}
|
||||||
|
|
||||||
|
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
|
||||||
|
additional_lexemes = []
|
||||||
|
|
||||||
|
tsvector.scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/).reduce(additional_lexemes) do |array, (lexeme, _, positions)|
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
loop do
|
||||||
|
count += 1
|
||||||
|
break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
|
||||||
|
term, _, remaining = lexeme.partition(".")
|
||||||
|
break if remaining.blank?
|
||||||
|
array << "'#{term}':#{positions} '#{remaining}':#{positions}"
|
||||||
|
lexeme = remaining
|
||||||
|
end
|
||||||
|
|
||||||
|
array
|
||||||
|
end
|
||||||
|
|
||||||
|
tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
|
||||||
|
|
||||||
|
params = {
|
||||||
raw_data: indexed_data,
|
raw_data: indexed_data,
|
||||||
id: id,
|
id: id,
|
||||||
locale: SiteSetting.default_locale,
|
locale: SiteSetting.default_locale,
|
||||||
version: INDEX_VERSION
|
version: INDEX_VERSION,
|
||||||
|
tsvector: tsvector,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Would be nice to use AR here but not sure how to execute Postgres functions
|
# Would be nice to use AR here but not sure how to execute Postgres functions
|
||||||
@ -71,7 +78,7 @@ class SearchIndexer
|
|||||||
SET
|
SET
|
||||||
raw_data = :raw_data,
|
raw_data = :raw_data,
|
||||||
locale = :locale,
|
locale = :locale,
|
||||||
search_data = #{ranked_index},
|
search_data = (:tsvector)::tsvector,
|
||||||
version = :version
|
version = :version
|
||||||
WHERE #{foreign_key} = :id
|
WHERE #{foreign_key} = :id
|
||||||
SQL
|
SQL
|
||||||
@ -80,7 +87,7 @@ class SearchIndexer
|
|||||||
DB.exec(<<~SQL, params)
|
DB.exec(<<~SQL, params)
|
||||||
INSERT INTO #{table_name}
|
INSERT INTO #{table_name}
|
||||||
(#{foreign_key}, search_data, locale, raw_data, version)
|
(#{foreign_key}, search_data, locale, raw_data, version)
|
||||||
VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
|
VALUES (:id, (:tsvector)::tsvector, :locale, :raw_data, :version)
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
rescue
|
rescue
|
||||||
|
@ -1750,9 +1750,6 @@ search:
|
|||||||
search_ranking_normalization:
|
search_ranking_normalization:
|
||||||
default: '1'
|
default: '1'
|
||||||
hidden: true
|
hidden: true
|
||||||
search_inject_extra_terms:
|
|
||||||
default: true
|
|
||||||
hidden: true
|
|
||||||
min_search_term_length:
|
min_search_term_length:
|
||||||
client: true
|
client: true
|
||||||
default: 3
|
default: 3
|
||||||
|
@ -1255,20 +1255,26 @@ describe Search do
|
|||||||
])
|
])
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'can tokenize dots' do
|
it 'can search for terms with dots' do
|
||||||
post = Fabricate(:post, raw: 'Will.2000 Will.Bob.Bill...')
|
post = Fabricate(:post, raw: 'Will.2000 Will.Bob.Bill...')
|
||||||
expect(Search.execute('bill').posts.map(&:id)).to eq([post.id])
|
expect(Search.execute('bill').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('bob').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('2000').posts.map(&:id)).to eq([post.id])
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'can search URLS correctly' do
|
it 'can search URLS correctly' do
|
||||||
post = Fabricate(:post, raw: 'i like http://wb.camra.org.uk/latest#test so yay')
|
post = Fabricate(:post, raw: 'i like http://wb.camra.org.uk/latest#test so yay')
|
||||||
|
|
||||||
expect(Search.execute('http://wb.camra.org.uk/latest#test').posts.map(&:id)).to eq([post.id])
|
expect(Search.execute('http://wb.camra.org.uk/latest#test').posts.map(&:id)).to eq([post.id])
|
||||||
expect(Search.execute('camra').posts.map(&:id)).to eq([post.id])
|
expect(Search.execute('camra').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('http://wb').posts.map(&:id)).to eq([post.id])
|
||||||
complex_url = "https://test.some.site.com/path?some.range_input=74235a"
|
expect(Search.execute('wb.camra').posts.map(&:id)).to eq([post.id])
|
||||||
post2 = Fabricate(:post, raw: "this is a complex url #{complex_url} so complex")
|
expect(Search.execute('wb.camra.org').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('org.uk').posts.map(&:id)).to eq([post.id])
|
||||||
expect(Search.execute(complex_url).posts.map(&:id)).to eq([post2.id])
|
expect(Search.execute('camra.org.uk').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('wb.camra.org.uk').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('wb.camra.org.uk/latest').posts.map(&:id)).to eq([post.id])
|
||||||
|
expect(Search.execute('/latest#test').posts.map(&:id)).to eq([post.id])
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'supports category slug and tags' do
|
it 'supports category slug and tags' do
|
||||||
|
@ -17,16 +17,6 @@ describe SearchIndexer do
|
|||||||
SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
|
SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'can correctly inject if http or https links exist' do
|
|
||||||
|
|
||||||
val = "a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1"
|
|
||||||
result = SearchIndexer.inject_extra_terms(val)
|
|
||||||
|
|
||||||
expected = "a https://cnn.com?bob=1, cnn com bob=1 http://stuff.com.au?bill=1 stuff com au bill=1 b abc.net/xyz=1 net xyz=1"
|
|
||||||
|
|
||||||
expect(result).to eq(expected)
|
|
||||||
end
|
|
||||||
|
|
||||||
it 'correctly indexes chinese' do
|
it 'correctly indexes chinese' do
|
||||||
SiteSetting.default_locale = 'zh_CN'
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
data = "你好世界"
|
data = "你好世界"
|
||||||
@ -151,7 +141,28 @@ describe SearchIndexer do
|
|||||||
topic = post.topic
|
topic = post.topic
|
||||||
|
|
||||||
expect(post.post_search_data.raw_data).to eq(
|
expect(post.post_search_data.raw_data).to eq(
|
||||||
"#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org some png"
|
"#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png"
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'should tokenize host of a URL' do
|
||||||
|
category = Fabricate(:category, name: 'awesome category')
|
||||||
|
topic = Fabricate(:topic, category: category, title: 'this is a test topic')
|
||||||
|
|
||||||
|
post = Fabricate(:post, topic: topic, raw: <<~RAW)
|
||||||
|
a https://cnn.com?bob=1, http://stuff.com.au?bill=1 b abc.net/xyz=1
|
||||||
|
RAW
|
||||||
|
|
||||||
|
post.rebake!
|
||||||
|
post.reload
|
||||||
|
topic = post.topic
|
||||||
|
|
||||||
|
expect(post.post_search_data.raw_data).to eq(
|
||||||
|
"#{topic.title} #{category.name} a https://cnn.com?bob=1 , http://stuff.com.au?bill=1 b http://abc.net/xyz=1 abc.net/xyz=1"
|
||||||
|
)
|
||||||
|
|
||||||
|
expect(post.post_search_data.search_data).to eq(
|
||||||
|
"'/xyz=1':18,21 '1':11,14 'abc':17,20 'abc.net':17,20 'abc.net/xyz=1':16,19 'au':12 'awesom':6B 'b':15 'bill':13 'bob':10 'categori':7B 'cnn':9 'cnn.com':9 'com':9,12 'com.au':12 'net':17,20 'stuff':12 'stuff.com.au':12 'test':4A 'topic':5A"
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user