mirror of
https://github.com/discourse/discourse.git
synced 2025-01-31 09:12:01 +08:00
FEATURE: new 'search_ignore_accents' site setting
This commit is contained in:
parent
52eed73294
commit
4481836de2
|
@ -11,8 +11,8 @@ class SearchIndexer
|
||||||
@disabled = false
|
@disabled = false
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.scrub_html_for_search(html)
|
def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
|
||||||
HtmlScrubber.scrub(html)
|
HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.inject_extra_terms(raw)
|
def self.inject_extra_terms(raw)
|
||||||
|
@ -169,18 +169,10 @@ class SearchIndexer
|
||||||
|
|
||||||
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
||||||
|
|
||||||
def self.strip_diacritics(str)
|
|
||||||
s = str.unicode_normalize(:nfkd)
|
|
||||||
s.gsub!(DIACRITICS, "")
|
|
||||||
s.strip!
|
|
||||||
s
|
|
||||||
end
|
|
||||||
|
|
||||||
attr_reader :scrubbed
|
attr_reader :scrubbed
|
||||||
|
|
||||||
def initialize(strip_diacritics: false)
|
def initialize(strip_diacritics: false)
|
||||||
@scrubbed = +""
|
@scrubbed = +""
|
||||||
# for now we are disabling this per: https://meta.discourse.org/t/discourse-should-ignore-if-a-character-is-accented-when-doing-a-search/90198/16?u=sam
|
|
||||||
@strip_diacritics = strip_diacritics
|
@strip_diacritics = strip_diacritics
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -189,7 +181,7 @@ class SearchIndexer
|
||||||
|
|
||||||
me = new(strip_diacritics: strip_diacritics)
|
me = new(strip_diacritics: strip_diacritics)
|
||||||
Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
|
Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
|
||||||
me.scrubbed
|
me.scrubbed.squish
|
||||||
end
|
end
|
||||||
|
|
||||||
ATTRIBUTES ||= %w{alt title href data-youtube-title}
|
ATTRIBUTES ||= %w{alt title href data-youtube-title}
|
||||||
|
@ -204,8 +196,15 @@ class SearchIndexer
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def strip_diacritics(str)
|
||||||
|
s = str.unicode_normalize(:nfkd)
|
||||||
|
s.gsub!(DIACRITICS, "")
|
||||||
|
s.strip!
|
||||||
|
s
|
||||||
|
end
|
||||||
|
|
||||||
def characters(str)
|
def characters(str)
|
||||||
str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
|
str = strip_diacritics(str) if @strip_diacritics
|
||||||
scrubbed << " #{str} "
|
scrubbed << " #{str} "
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -1144,6 +1144,7 @@ en:
|
||||||
log_search_queries: "Log search queries performed by users"
|
log_search_queries: "Log search queries performed by users"
|
||||||
search_query_log_max_size: "Maximum amount of search queries to keep"
|
search_query_log_max_size: "Maximum amount of search queries to keep"
|
||||||
search_query_log_max_retention_days: "Maximum amount of time to keep search queries, in days."
|
search_query_log_max_retention_days: "Maximum amount of time to keep search queries, in days."
|
||||||
|
search_ignore_accents: "Ignore accents when searching for text."
|
||||||
allow_uncategorized_topics: "Allow topics to be created without a category. WARNING: If there are any uncategorized topics, you must recategorize them before turning this off."
|
allow_uncategorized_topics: "Allow topics to be created without a category. WARNING: If there are any uncategorized topics, you must recategorize them before turning this off."
|
||||||
allow_duplicate_topic_titles: "Allow topics with identical, duplicate titles."
|
allow_duplicate_topic_titles: "Allow topics with identical, duplicate titles."
|
||||||
unique_posts_mins: "How many minutes before a user can make a post with the same content again"
|
unique_posts_mins: "How many minutes before a user can make a post with the same content again"
|
||||||
|
|
|
@ -1431,7 +1431,6 @@ search:
|
||||||
zh_TW: 2
|
zh_TW: 2
|
||||||
ko: 2
|
ko: 2
|
||||||
ja: 2
|
ja: 2
|
||||||
|
|
||||||
search_tokenize_chinese_japanese_korean: false
|
search_tokenize_chinese_japanese_korean: false
|
||||||
search_prefer_recent_posts: false
|
search_prefer_recent_posts: false
|
||||||
search_recent_posts_size:
|
search_recent_posts_size:
|
||||||
|
@ -1446,6 +1445,22 @@ search:
|
||||||
search_query_log_max_retention_days:
|
search_query_log_max_retention_days:
|
||||||
default: 365 # 1 year
|
default: 365 # 1 year
|
||||||
max: 1825 # 5 years
|
max: 1825 # 5 years
|
||||||
|
search_ignore_accents:
|
||||||
|
default: false
|
||||||
|
locale_default:
|
||||||
|
ar: true
|
||||||
|
ca: true
|
||||||
|
cs: true
|
||||||
|
el: true
|
||||||
|
es: true
|
||||||
|
fa_IR: true
|
||||||
|
fr: true
|
||||||
|
hu: true
|
||||||
|
pt: true
|
||||||
|
pt_BR: true
|
||||||
|
ro: true
|
||||||
|
sk: true
|
||||||
|
tr_TR: true
|
||||||
|
|
||||||
uncategorized:
|
uncategorized:
|
||||||
version_checks:
|
version_checks:
|
||||||
|
|
|
@ -63,13 +63,14 @@ class Search
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.blurb_for(cooked, term = nil, blurb_length = 200)
|
def self.blurb_for(cooked, term = nil, blurb_length = 200)
|
||||||
cooked = SearchIndexer::HtmlScrubber.scrub(cooked).squish
|
|
||||||
|
|
||||||
blurb = nil
|
blurb = nil
|
||||||
|
cooked = SearchIndexer.scrub_html_for_search(cooked)
|
||||||
|
|
||||||
if term
|
if term
|
||||||
terms = term.split(/\s+/)
|
terms = term.split(/\s+/)
|
||||||
blurb = TextHelper.excerpt(cooked, terms.first, radius: blurb_length / 2, seperator: " ")
|
blurb = TextHelper.excerpt(cooked, terms.first, radius: blurb_length / 2, seperator: " ")
|
||||||
end
|
end
|
||||||
|
|
||||||
blurb = TextHelper.truncate(cooked, length: blurb_length, seperator: " ") if blurb.blank?
|
blurb = TextHelper.truncate(cooked, length: blurb_length, seperator: " ") if blurb.blank?
|
||||||
Sanitize.clean(blurb)
|
Sanitize.clean(blurb)
|
||||||
end
|
end
|
||||||
|
|
|
@ -3,6 +3,10 @@ require 'rails_helper'
|
||||||
describe SearchIndexer do
|
describe SearchIndexer do
|
||||||
let(:post_id) { 99 }
|
let(:post_id) { 99 }
|
||||||
|
|
||||||
|
def scrub(html, strip_diacritics: false)
|
||||||
|
SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
|
||||||
|
end
|
||||||
|
|
||||||
it 'correctly indexes chinese' do
|
it 'correctly indexes chinese' do
|
||||||
SiteSetting.default_locale = 'zh_CN'
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
data = "你好世界"
|
data = "你好世界"
|
||||||
|
@ -16,26 +20,26 @@ describe SearchIndexer do
|
||||||
|
|
||||||
it 'extract youtube title' do
|
it 'extract youtube title' do
|
||||||
html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"
|
html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"
|
||||||
|
scrubbed = scrub(html)
|
||||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
|
||||||
|
|
||||||
expect(scrubbed).to eq(" Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] ")
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'extract a link' do
|
it 'extract a link' do
|
||||||
html = "<a href='http://meta.discourse.org/'>link</a>"
|
html = "<a href='http://meta.discourse.org/'>link</a>"
|
||||||
|
scrubbed = scrub(html)
|
||||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
expect(scrubbed).to eq("http://meta.discourse.org/ link")
|
||||||
|
|
||||||
expect(scrubbed).to eq(" http://meta.discourse.org/ link ")
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'removes diacritics' do
|
it 'uses ignore_accent setting to strip diacritics' do
|
||||||
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
|
html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
|
||||||
|
|
||||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html, strip_diacritics: true)
|
SiteSetting.search_ignore_accents = true
|
||||||
|
scrubbed = SearchIndexer.scrub_html_for_search(html)
|
||||||
|
expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")
|
||||||
|
|
||||||
expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
|
SiteSetting.search_ignore_accents = false
|
||||||
|
scrubbed = SearchIndexer.scrub_html_for_search(html)
|
||||||
|
expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
|
||||||
end
|
end
|
||||||
|
|
||||||
it "doesn't index local files" do
|
it "doesn't index local files" do
|
||||||
|
@ -54,9 +58,9 @@ describe SearchIndexer do
|
||||||
</div>
|
</div>
|
||||||
HTML
|
HTML
|
||||||
|
|
||||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html).gsub(/\s+/, " ")
|
scrubbed = scrub(html)
|
||||||
|
|
||||||
expect(scrubbed).to eq(" Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB ")
|
expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB")
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'correctly indexes a post according to version' do
|
it 'correctly indexes a post according to version' do
|
||||||
|
|
Loading…
Reference in New Issue
Block a user