FIX: remove diacritics when tokenizing html for search

This commit is contained in:
Régis Hanol 2018-08-23 17:13:52 +02:00
parent 30722240e4
commit 2fcf2b899e
2 changed files with 19 additions and 32 deletions

View File

@ -121,7 +121,8 @@ class SearchIndexer
def self.index(obj, force: false)
return if @disabled
category_name, tag_names = nil
category_name = nil
tag_names = nil
topic = nil
if Topic === obj
@ -148,8 +149,7 @@ class SearchIndexer
if Topic === obj && (obj.saved_change_to_title? || force)
if obj.posts
post = obj.posts.find_by(post_number: 1)
if post
if post = obj.posts.find_by(post_number: 1)
SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
end
@ -175,43 +175,22 @@ class SearchIndexer
# Feeds `html` to a Nokogiri SAX parser whose handler is a new instance of the
# enclosing class, then returns that instance's `scrubbed` value.
# NOTE(review): this hunk appears to show removed and added diff lines together
# without +/- markers. The `begin ... end` section looks like the pre-change
# code and the single `parser.parse("<div>...")` line like its replacement;
# confirm against the repository before reading this as one method body.
def self.scrub(html)
me = new
parser = Nokogiri::HTML::SAX::Parser.new(me)
begin
# `copy` wraps the input in a <div>, but it is `html`, not `copy`, that is
# passed to the parser at the end of this section.
copy = +"<div>"
copy << html unless html.nil?
copy << "</div>"
parser.parse(html) unless html.nil?
end
# Parse the <div>-wrapped input; nothing is parsed when `html.present?` is false.
parser.parse("<div>#{html}</div>") if html.present?
me.scrubbed
end
# Handler for an opening element: pushes the values of selected attributes
# (alt, title, data-youtube-title, href) into the scrubbed output.
# NOTE(review): two `start_element` signatures and two ways of collecting the
# attribute values appear back to back here, presumably the old and new sides
# of the diff rendered without +/- markers — confirm against the repository.
def start_element(name, attributes = [])
# Names of the attributes whose values are passed to `characters` below.
ATTRIBUTES ||= %w{alt title href data-youtube-title}
# The element name is not used by this variant, hence the `_` parameter.
def start_element(_, attributes = [])
# Flatten the incoming attribute list and rebuild it as a Hash so values can
# be looked up by name (presumably a list of [name, value] pairs — TODO confirm).
attributes = Hash[*attributes.flatten]
# Each `if` below appends one attribute value, padded with a space on each side.
if attributes["alt"]
scrubbed << " "
scrubbed << attributes["alt"]
scrubbed << " "
end
if attributes["title"]
scrubbed << " "
scrubbed << attributes["title"]
scrubbed << " "
end
if attributes["data-youtube-title"]
scrubbed << " "
scrubbed << attributes["data-youtube-title"]
scrubbed << " "
end
if attributes["href"]
scrubbed << " "
scrubbed << attributes["href"]
scrubbed << " "
# Loop form of the same idea: route every listed attribute that has a
# non-blank value through `characters`.
ATTRIBUTES.each do |name|
characters(attributes[name]) if attributes[name].present?
end
end
# Appends `string` to `scrubbed`, padded with spaces.
# NOTE(review): the three plain `scrubbed << ...` lines and the single
# transliterating line look like the old and new sides of the diff shown
# together; read literally, the text would be appended twice — confirm against
# the repository.
def characters(string)
scrubbed << " "
scrubbed << string
scrubbed << " "
# Transliterate the text (per the commit title and the spec, this is what
# removes diacritics), strip surrounding whitespace, and pad with one space
# on each side.
scrubbed << " #{ActiveSupport::Inflector.transliterate(string).strip} "
end
end
end

View File

@ -29,6 +29,14 @@ describe SearchIndexer do
expect(scrubbed).to eq(" http://meta.discourse.org/ link ")
end
# Accented letters in the markup must come out of the scrubber as their
# unaccented forms, padded with a single space on each side.
it 'removes diacritics' do
  accented_markup = "<p>Hétérogénéité</p>"
  expect(SearchIndexer::HtmlScrubber.scrub(accented_markup)).to eq(" Heterogeneite ")
end
it 'correctly indexes a post according to version' do
# Preparing so that they can be indexed to right version
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)