mirror of
https://github.com/discourse/discourse.git
synced 2025-02-20 14:39:48 +08:00
FIX: remove diacritics when tokenizing html for search
This commit is contained in:
parent
30722240e4
commit
2fcf2b899e
|
@@ -121,7 +121,8 @@ class SearchIndexer
|
|||
def self.index(obj, force: false)
|
||||
return if @disabled
|
||||
|
||||
category_name, tag_names = nil
|
||||
category_name = nil
|
||||
tag_names = nil
|
||||
topic = nil
|
||||
|
||||
if Topic === obj
|
||||
|
@@ -148,8 +149,7 @@ class SearchIndexer
|
|||
|
||||
if Topic === obj && (obj.saved_change_to_title? || force)
|
||||
if obj.posts
|
||||
post = obj.posts.find_by(post_number: 1)
|
||||
if post
|
||||
if post = obj.posts.find_by(post_number: 1)
|
||||
SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
|
||||
SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
|
||||
end
|
||||
|
@@ -175,43 +175,22 @@ class SearchIndexer
|
|||
def self.scrub(html)
|
||||
me = new
|
||||
parser = Nokogiri::HTML::SAX::Parser.new(me)
|
||||
begin
|
||||
copy = +"<div>"
|
||||
copy << html unless html.nil?
|
||||
copy << "</div>"
|
||||
parser.parse(html) unless html.nil?
|
||||
end
|
||||
parser.parse("<div>#{html}</div>") if html.present?
|
||||
me.scrubbed
|
||||
end
|
||||
|
||||
def start_element(name, attributes = [])
|
||||
ATTRIBUTES ||= %w{alt title href data-youtube-title}
|
||||
|
||||
def start_element(_, attributes = [])
|
||||
attributes = Hash[*attributes.flatten]
|
||||
if attributes["alt"]
|
||||
scrubbed << " "
|
||||
scrubbed << attributes["alt"]
|
||||
scrubbed << " "
|
||||
end
|
||||
if attributes["title"]
|
||||
scrubbed << " "
|
||||
scrubbed << attributes["title"]
|
||||
scrubbed << " "
|
||||
end
|
||||
if attributes["data-youtube-title"]
|
||||
scrubbed << " "
|
||||
scrubbed << attributes["data-youtube-title"]
|
||||
scrubbed << " "
|
||||
end
|
||||
if attributes["href"]
|
||||
scrubbed << " "
|
||||
scrubbed << attributes["href"]
|
||||
scrubbed << " "
|
||||
|
||||
ATTRIBUTES.each do |name|
|
||||
characters(attributes[name]) if attributes[name].present?
|
||||
end
|
||||
end
|
||||
|
||||
def characters(string)
|
||||
scrubbed << " "
|
||||
scrubbed << string
|
||||
scrubbed << " "
|
||||
scrubbed << " #{ActiveSupport::Inflector.transliterate(string).strip} "
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@@ -29,6 +29,14 @@ describe SearchIndexer do
|
|||
expect(scrubbed).to eq(" http://meta.discourse.org/ link ")
|
||||
end
|
||||
|
||||
it 'removes diacritics' do
|
||||
html = "<p>Hétérogénéité</p>"
|
||||
|
||||
scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
|
||||
|
||||
expect(scrubbed).to eq(" Heterogeneite ")
|
||||
end
|
||||
|
||||
it 'correctly indexes a post according to version' do
|
||||
# Preparing so that they can be indexed to right version
|
||||
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
|
||||
|
|
Loading…
Reference in New Issue
Block a user