discourse/spec/services/search_indexer_spec.rb

require 'rails_helper'

# Specs for SearchIndexer: HTML scrubbing for the search index and the
# per-post index rows it writes to PostSearchData.
describe SearchIndexer do
  # Fixed post id used as the key for the index rows written by these examples.
  let(:post_id) { 99 }

  # Scrubs +html+ through SearchIndexer with an explicit diacritics flag.
  #
  # html             - HTML string to scrub.
  # strip_diacritics - passed straight through; defaults to false here, so
  #                    examples using this helper do not depend on the
  #                    search_ignore_accents site setting.
  #
  # Returns whatever SearchIndexer.scrub_html_for_search returns.
  def scrub(html, strip_diacritics: false)
    SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)
  end

  it 'correctly indexes chinese' do
    SiteSetting.default_locale = 'zh_CN'
    data = "你好世界"
    # Sanity check: the input itself contains no whitespace separators.
    expect(data.split(" ").length).to eq(1)

    SearchIndexer.update_posts_index(post_id, data, "", "", nil)

    # The stored raw data is expected to come back split into two tokens.
    raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data).first
    expect(raw_data.split(' ').length).to eq(2)
  end

  it 'extract youtube title' do
    html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&amp;wmode=opaque\"></div>"
    scrubbed = scrub(html)
    expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")
  end

  it 'extract a link' do
    html = "<a href='http://meta.discourse.org/'>link</a>"
    scrubbed = scrub(html)
    # Both the href and the link text are expected in the scrubbed output.
    expect(scrubbed).to eq("http://meta.discourse.org/ link")
  end

  it 'uses ignore_accent setting to strip diacritics' do
    html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"

    # Call SearchIndexer directly rather than via `scrub`: the helper always
    # passes an explicit strip_diacritics value, which would bypass the
    # site-setting-driven behaviour this example asserts.
    SiteSetting.search_ignore_accents = true
    scrubbed = SearchIndexer.scrub_html_for_search(html)
    expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")

    SiteSetting.search_ignore_accents = false
    scrubbed = SearchIndexer.scrub_html_for_search(html)
    expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")
  end

  it "doesn't index local files" do
    # One external image plus several upload URLs under the site's own base
    # URL; only alt/title/text content is expected to survive scrubbing.
    html = <<~HTML
      <p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
      <p><img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png" alt="51%20PM" width="289" height="398"></p>
      <div class="lightbox-wrapper">
        <a class="lightbox" href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg" data-download-href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b" title="Untitled design (21).jpg" rel="nofollow noopener">
          <img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg" alt="Untitled%20design%20(21)" width="563" height="500">
          <div class="meta">
            <span class="filename">Untitled design (21).jpg</span>
            <span class="informations">1280x1136 472 KB</span>
            <span class="expand"></span>
          </div>
        </a>
      </div>
    HTML

    scrubbed = scrub(html)

    expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB")
  end

  it 'correctly indexes a post according to version' do
    # Seed an index row, then mark it with a stale version (-1) so the next
    # update has to bring it up to the current index version.
    SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
    # `update!` replaces `update_attributes!`, which is deprecated in Rails 6.0
    # and removed in 6.1; both raise when the record is invalid.
    PostSearchData.find_by(post_id: post_id).update!(version: -1)

    data = "<a>This</a> is a test"
    SearchIndexer.update_posts_index(post_id, "", "", nil, data)

    raw_data, locale, version = PostSearchData.where(post_id: post_id).pluck(:raw_data, :locale, :version).first
    expect(raw_data).to eq("This is a test")
    expect(locale).to eq("en")
    expect(version).to eq(Search::INDEX_VERSION)

    # A subsequent update with different content replaces the stored raw data.
    SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil)

    raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data).first
    expect(raw_data).to eq("tester")
  end
end
Prepare for separation of RSpec helper files Since rspec-rails 3, the default installation creates two helper files: * `spec_helper.rb` * `rails_helper.rb` `spec_helper.rb` is intended as a way of running specs that do not require Rails, whereas `rails_helper.rb` loads Rails (as Discourse's current `spec_helper.rb` does). For more information: https://www.relishapp.com/rspec/rspec-rails/docs/upgrade#default-helper-files In this commit, I've simply replaced all instances of `spec_helper` with `rails_helper`, and renamed the original `spec_helper.rb`. This brings the Discourse project closer to the standard usage of RSpec in a Rails app. At present, every spec relies on loading Rails, but there are likely many that don't need to. In a future pull request, I hope to introduce a separate, minimal `spec_helper.rb` which can be used in tests which don't rely on Rails. 2015-10-11 17:41:23 +08:00			`require 'rails_helper'`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
Remove SearchObserver, aim is to remove all observers rails-observers gem is mostly unmaintained and is a pain to carry forward new implementation contains significantly less magic as a bonus 2016-12-22 10:13:14 +08:00			`describe SearchIndexer do`
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`let(:post_id) { 99 }`
FIX: don't index urls to local files 2018-09-14 00:53:53 +08:00
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`def scrub(html, strip_diacritics: false)`
			`SearchIndexer.scrub_html_for_search(html, strip_diacritics: strip_diacritics)`
			`end`

BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00			`it 'correctly indexes chinese' do`
			`SiteSetting.default_locale = 'zh_CN'`
			`data = "你好世界"`
models with rspec3 syntax 2015-01-06 00:04:23 +08:00			`expect(data.split(" ").length).to eq(1)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`SearchIndexer.update_posts_index(post_id, "你好世界", "", "", nil)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]`
FEATURE: An option to search more recent posts for very large sites. On very large forums searching posts can be slow, so this commit introduces the ability to try and search only the most recent posts first, and then going for a larger breadth search if there aren't enough results. Enable `search_prefer_recent_posts` and you can customize how many recent posts to filter with `search_recent_posts_size` 2016-08-11 03:40:58 +08:00			`expect(raw_data.split(' ').length).to eq(2)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00			`end`

FEATURE: index YouTube titles in search Previously we omitted the titles for videos that YouTube provided 2018-04-26 13:46:52 +08:00			`it 'extract youtube title' do`
			`html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]\" data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&wmode=opaque\"></div>"`
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`scrubbed = scrub(html)`
			`expect(scrubbed).to eq("Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]")`
FEATURE: index YouTube titles in search Previously we omitted the titles for videos that YouTube provided 2018-04-26 13:46:52 +08:00			`end`

FEATURE: Make links indexable. (#6285) 2018-08-20 08:39:19 +08:00			`it 'extract a link' do`
			`html = "<a href='http://meta.discourse.org/'>link</a>"`
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`scrubbed = scrub(html)`
			`expect(scrubbed).to eq("http://meta.discourse.org/ link")`
FEATURE: Make links indexable. (#6285) 2018-08-20 08:39:19 +08:00			`end`

FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`it 'uses ignore_accent setting to strip diacritics' do`
FIX: remove diacritics instead of transliterating 2018-08-24 06:38:44 +08:00			`html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"`
FIX: remove diacritics when tokenizing html for search 2018-08-23 23:13:52 +08:00
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`SiteSetting.search_ignore_accents = true`
			`scrubbed = SearchIndexer.scrub_html_for_search(html)`
			`expect(scrubbed).to eq("HELLO Heterogeneite Здравствуите هتاف للترحيب 你好")`
FIX: remove diacritics when tokenizing html for search 2018-08-23 23:13:52 +08:00
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`SiteSetting.search_ignore_accents = false`
			`scrubbed = SearchIndexer.scrub_html_for_search(html)`
			`expect(scrubbed).to eq("HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好")`
FIX: remove diacritics when tokenizing html for search 2018-08-23 23:13:52 +08:00			`end`

FIX: don't index urls to local files 2018-09-14 00:53:53 +08:00			`it "doesn't index local files" do`
			`html = <<~HTML`
			`<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>`
			`<p><img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png" alt="51%20PM" width="289" height="398"></p>`
			`<div class="lightbox-wrapper">`
			`<a class="lightbox" href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg" data-download-href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b" title="Untitled design (21).jpg" rel="nofollow noopener">`
			`<img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg" alt="Untitled%20design%20(21)" width="563" height="500">`
			`<div class="meta">`
			`<span class="filename">Untitled design (21).jpg</span>`
			`<span class="informations">1280x1136 472 KB</span>`
			`<span class="expand"></span>`
			`</div>`
			`</a>`
			`</div>`
			`HTML`

FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`scrubbed = scrub(html)`
FIX: don't index urls to local files 2018-09-14 00:53:53 +08:00
FEATURE: new 'search_ignore_accents' site setting 2018-09-17 16:31:15 +08:00			`expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB")`
FIX: don't index urls to local files 2018-09-14 00:53:53 +08:00			`end`

FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`it 'correctly indexes a post according to version' do`
			`# Preparing so that they can be indexed to right version`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)`
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`PostSearchData.find_by(post_id: post_id).update_attributes!(version: -1)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`data = "<a>This</a> is a test"`
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`SearchIndexer.update_posts_index(post_id, "", "", nil, data)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`raw_data, locale, version = PostSearchData.where(post_id: post_id).pluck(:raw_data, :locale, :version)[0]`
FEATURE: An option to search more recent posts for very large sites. On very large forums searching posts can be slow, so this commit introduces the ability to try and search only the most recent posts first, and then going for a larger breadth search if there aren't enough results. Enable `search_prefer_recent_posts` and you can customize how many recent posts to filter with `search_recent_posts_size` 2016-08-11 03:40:58 +08:00			`expect(raw_data).to eq("This is a test")`
			`expect(locale).to eq("en")`
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`expect(version).to eq(Search::INDEX_VERSION)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
FEATURE: search within title using in:title Also - Significantly improved search ranking, title is treated most strongly - Adds tag names to the index - Run search re-indexer more aggressively - Re-index topic and all posts on category change 2018-02-20 11:41:00 +08:00			`SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil)`
BUGFIX: Chinese search was broken BUGFIX: User locale was used to index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by truncating post_search_data Note: I removed Japanese-specific config because it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00
FIX: rebuild index when engine replaced (#5021) 2017-08-16 19:38:34 +08:00			`raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]`
FEATURE: An option to search more recent posts for very large sites. On very large forums searching posts can be slow, so this commit introduces the ability to try and search only the most recent posts first, and then going for a larger breadth search if there aren't enough results. Enable `search_prefer_recent_posts` and you can customize how many recent posts to filter with `search_recent_posts_size` 2016-08-11 03:40:58 +08:00			`expect(raw_data).to eq("tester")`
BUGFIX: Chinese search was broken BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it 2014-06-24 15:10:56 +08:00			`end`
			`end`