diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index 04dc32b697e..f76b9037e8d 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -167,6 +167,8 @@ class SearchIndexer class HtmlScrubber < Nokogiri::XML::SAX::Document + DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/ + def self.strip_diacritics(str) s = str.unicode_normalize(:nfkd) s.gsub!(DIACRITICS, "") @@ -196,12 +198,12 @@ class SearchIndexer attributes = Hash[*attributes.flatten] ATTRIBUTES.each do |name| - characters(attributes[name]) if attributes[name].present? + if attributes[name].present? + characters(attributes[name]) unless name == "href" && UrlHelper.is_local(attributes[name]) + end end end - DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/ - def characters(str) str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics scrubbed << " #{str} " diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index e98d52e3657..a9f309a4890 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -2,6 +2,7 @@ require 'rails_helper' describe SearchIndexer do let(:post_id) { 99 } + it 'correctly indexes chinese' do SiteSetting.default_locale = 'zh_CN' data = "你好世界" @@ -37,6 +38,27 @@ describe SearchIndexer do expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ") end + it "doesn't index local files" do + html = <<~HTML +

Discourse

+

51%20PM

+ + HTML + + scrubbed = SearchIndexer::HtmlScrubber.scrub(html).gsub(/\s+/, " ") + + expect(scrubbed).to eq(" Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB ") + end + it 'correctly indexes a post according to version' do # Preparing so that they can be indexed to right version SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)