FEATURE: Use Postgres unaccent to ignore accents (#16100)
The search_ignore_accents site setting can be used to make the search indexer strip accents before indexing content. PostgreSQL's unaccent function handles this better than Ruby's unicode_normalize(:nfkd).
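
For context (illustrative only, not part of this commit): Ruby's NFKD normalization only strips combining marks, so it leaves ligatures such as "œ" alone and mangles some scripts, while PostgreSQL's unaccent() dictionary also expands ligatures. A minimal sketch, assuming a Rails console against a database with the unaccent extension enabled (\p{Mn} stands in for the DIACRITICS regexp removed later in this diff):

    # Old approach: decompose in Ruby, then drop combining marks.
    "Hétérogénéité œuvre Здравствуйте".unicode_normalize(:nfkd).gsub(/\p{Mn}/, "")
    # => "Heterogeneite œuvre Здравствуите"   ("œ" untouched, Cyrillic "й" becomes "и")

    # New approach: let PostgreSQL transliterate at index and query time.
    ActiveRecord::Base.connection.select_value("SELECT unaccent('Hétérogénéité œuvre')")
    # => "Heterogeneite oeuvre"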
This commit is contained in:
parent 6e7cdc5bc3
commit 34b4b53bac
@@ -17,10 +17,6 @@ class SearchIndexer
     @disabled = false
   end
 
-  def self.scrub_html_for_search(html, strip_diacritics: SiteSetting.search_ignore_accents)
-    HtmlScrubber.scrub(html, strip_diacritics: strip_diacritics)
-  end
-
   def self.update_index(table: , id: , a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
     raw_data = [a_weight, b_weight, c_weight, d_weight]
 
@@ -35,10 +31,10 @@ class SearchIndexer
     stemmer = table == "user" ? "simple" : Search.ts_config
 
     ranked_index = <<~SQL
-      setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
-      setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
-      setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
-      setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
+      setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
     SQL
 
     ranked_params = {
@@ -109,7 +105,7 @@ class SearchIndexer
       table: 'topic',
       id: topic_id,
       a_weight: title,
-      b_weight: scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
+      b_weight: HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
     )
   end
 
@@ -124,7 +120,7 @@ class SearchIndexer
       # the original string. Since there is no way to estimate the length of
       # the expected tsvector, we limit the input to ~50% of the maximum
      # length of a tsvector (1_048_576 bytes).
-      d_weight: scrub_html_for_search(cooked)[0..600_000]
+      d_weight: HtmlScrubber.scrub(cooked)[0..600_000]
    ) do |params|
      params["private_message"] = private_message
    end
@@ -294,12 +290,11 @@ class SearchIndexer
 
     attr_reader :scrubbed
 
-    def initialize(strip_diacritics: false)
+    def initialize
       @scrubbed = +""
-      @strip_diacritics = strip_diacritics
     end
 
-    def self.scrub(html, strip_diacritics: false)
+    def self.scrub(html)
       return +"" if html.blank?
 
       begin
@@ -338,9 +333,9 @@ class SearchIndexer
         end
       end
 
-      me = new(strip_diacritics: strip_diacritics)
-      Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
-      me.scrubbed.squish
+      html_scrubber = new
+      Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
+      html_scrubber.scrubbed.squish
     end
 
     MENTION_CLASSES ||= %w{mention mention-group}
@@ -362,7 +357,6 @@ class SearchIndexer
     end
 
     def characters(str)
-      str = Search.strip_diacritics(str) if @strip_diacritics
       scrubbed << " #{str} "
     end
   end
db/migrate/20220304162250_enable_unaccent_extension.rb (new file, +7)
@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+
+class EnableUnaccentExtension < ActiveRecord::Migration[6.1]
+  def change
+    enable_extension 'unaccent'
+  end
+end
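Installing unaccent typically requires a superuser or, on newer PostgreSQL releases where the extension is marked trusted, a role with CREATE on the database. Once the migration has run, a quick sanity check from a Rails console (illustrative sketch, not part of the diff):

    ActiveRecord::Base.connection.extension_enabled?('unaccent')            # => true
    ActiveRecord::Base.connection.select_value("SELECT unaccent('œuvre')")  # => "oeuvre"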
@@ -21,13 +21,6 @@ class Search
     5
   end
 
-  def self.strip_diacritics(str)
-    s = str.unicode_normalize(:nfkd)
-    s.gsub!(DIACRITICS, "")
-    s.strip!
-    s
-  end
-
   def self.per_filter
     50
   end
@@ -64,6 +57,10 @@ class Search
     end
   end
 
+  def self.wrap_unaccent(str)
+    SiteSetting.search_ignore_accents ? "unaccent(#{str})" : str
+  end
+
   def self.segment_chinese?
     ['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese
   end
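The helper simply rewrites SQL fragments as strings; for illustration, the values below follow directly from the ternary above:

    SiteSetting.search_ignore_accents = true
    Search.wrap_unaccent("coalesce(:a,'')")  # => "unaccent(coalesce(:a,''))"
    Search.wrap_unaccent("?")                # => "unaccent(?)"

    SiteSetting.search_ignore_accents = false
    Search.wrap_unaccent("?")                # => "?"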
@@ -115,10 +112,6 @@ class Search
       else
         data.squish!
       end
-
-      if SiteSetting.search_ignore_accents
-        data = strip_diacritics(data)
-      end
     end
 
     data.gsub!(/\S+/) do |str|
@@ -704,7 +697,7 @@ class Search
             FROM topic_tags tt, tags
             WHERE tt.tag_id = tags.id
             GROUP BY tt.topic_id
-            HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(lower(tags.name)), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
+            HAVING to_tsvector(#{default_ts_config}, #{Search.wrap_unaccent("array_to_string(array_agg(lower(tags.name)), ' ')")}) @@ to_tsquery(#{default_ts_config}, #{Search.wrap_unaccent('?')})
           )", tags.join('&'))
       else
         tags = match.split(",")
@@ -1151,7 +1144,8 @@ class Search
 
   def self.to_tsquery(ts_config: nil, term:, joiner: nil)
     ts_config = ActiveRecord::Base.connection.quote(ts_config) if ts_config
-    tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, '#{self.escape_string(term)}')"
+    escaped_term = Search.wrap_unaccent("'#{self.escape_string(term)}'")
+    tsquery = "TO_TSQUERY(#{ts_config || default_ts_config}, #{escaped_term})"
     tsquery = "REPLACE(#{tsquery}::text, '&', '#{self.escape_string(joiner)}')::tsquery" if joiner
     tsquery
   end
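The query side is wrapped the same way as the indexed side, so both normalize before matching. Roughly, assuming the default 'english' configuration and ignoring exact quoting and escaping:

    Search.to_tsquery(term: "œuvre")
    # => "TO_TSQUERY('english', unaccent('œuvre'))"   with search_ignore_accents on
    # => "TO_TSQUERY('english', 'œuvre')"             with it off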
@@ -120,7 +120,7 @@ class Search
       blurb = nil
 
       if scrub
-        cooked = SearchIndexer.scrub_html_for_search(cooked)
+        cooked = SearchIndexer::HtmlScrubber.scrub(cooked)
 
         urls = Set.new
         cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
@@ -77,6 +77,35 @@ describe Search do
         expect(result.tags).to contain_exactly()
       end
     end
+
+    context "accents" do
+      fab!(:post_1) { Fabricate(:post, raw: "Cette ****** d'art n'est pas une œuvre") }
+      fab!(:post_2) { Fabricate(:post, raw: "Cette oeuvre d'art n'est pas une *****") }
+
+      before do
+        SearchIndexer.enable
+      end
+
+      after do
+        SearchIndexer.disable
+      end
+
+      it "removes them if search_ignore_accents" do
+        SiteSetting.search_ignore_accents = true
+        [post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
+
+        expect(Search.execute("oeuvre").posts).to contain_exactly(post_1, post_2)
+        expect(Search.execute("œuvre").posts).to contain_exactly(post_1, post_2)
+      end
+
+      it "does not remove them if not search_ignore_accents" do
+        SiteSetting.search_ignore_accents = false
+        [post_1, post_2].each { |post| SearchIndexer.index(post.topic, force: true) }
+
+        expect(Search.execute("œuvre").posts).to contain_exactly(post_1)
+        expect(Search.execute("oeuvre").posts).to contain_exactly(post_2)
+      end
+    end
   end
 
   context "custom_eager_load" do
@@ -11,10 +11,6 @@ describe SearchIndexer do
     SearchIndexer.disable
   end
 
-  def scrub(html, strip_diacritics: false)
-    SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
-  end
-
   it 'correctly indexes chinese' do
     SiteSetting.default_locale = 'zh_CN'
     data = "你好世界"
@@ -36,48 +32,36 @@ describe SearchIndexer do
 
   it 'extract youtube title' do
     html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"
-    scrubbed = scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
     expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
   end
 
   it 'extract a link' do
     html = "<a href='http://meta.discourse.org/'>link</a>"
-    scrubbed = scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
     expect(scrubbed).to eq("http://meta.discourse.org/ link")
   end
 
   it 'extracts @username from mentions' do
     html = '<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
-    scrubbed = scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
     expect(scrubbed).to eq('@狮子 @foo')
   end
 
   it 'extracts @groupname from group mentions' do
     html = '<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
-    scrubbed = scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
    expect(scrubbed).to eq('@автомобилист')
   end
 
   it 'extracts emoji name from emoji image' do
     emoji = Emoji["wink"]
     html = %Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
-    scrubbed = scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
 
     expect(scrubbed).to eq(':wink:')
   end
 
-  it 'uses ignore_accent setting to strip diacritics' do
-    html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"
-
-    SiteSetting.search_ignore_accents = true
-    scrubbed = SearchIndexer.scrub_html_for_search(html)
-    expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")
-
-    SiteSetting.search_ignore_accents = false
-    scrubbed = SearchIndexer.scrub_html_for_search(html)
-    expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
-  end
-
   it "doesn't index local files" do
     html = <<~HTML
       <p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
@@ -95,7 +79,7 @@ describe SearchIndexer do
       </div>
     HTML
 
-    scrubbed = scrub(html)
+    scrubbed = SearchIndexer::HtmlScrubber.scrub(html)
 
     expect(scrubbed).to eq("Discourse 51%20PM Untitled%20design%20(21)")
   end
@@ -271,6 +255,20 @@ describe SearchIndexer do
        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
       )
     end
+
+    it 'should unaccent indexed content' do
+      SiteSetting.search_ignore_accents = true
+      post.update!(raw: "Cette oeuvre d'art n'est pas une œuvre")
+      post.post_search_data.reload
+      expect(post.post_search_data.search_data).not_to include('œuvr')
+      expect(post.post_search_data.search_data).to include('oeuvr')
+
+      SiteSetting.search_ignore_accents = false
+      SearchIndexer.index(post, force: true)
+      post.post_search_data.reload
+      expect(post.post_search_data.search_data).to include('œuvr')
+      expect(post.post_search_data.search_data).to include('oeuvr')
+    end
   end
 
   describe '.queue_post_reindex' do
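One operational note implied by these specs: the commit does not add an automatic reindex when search_ignore_accents is toggled, so existing rows keep their old tsvectors until they are reindexed, which is why the spec invokes the indexer explicitly. A minimal sketch of the same flow:

    SiteSetting.search_ignore_accents = true
    SearchIndexer.index(post.topic, force: true)   # as in the specs above
    post.post_search_data.reload.search_data       # now contains 'oeuvr' rather than 'œuvr'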