discourse/spec/services/search_indexer_spec.rb
Guo Xiang Tan cfd507822f
PERF: Improve quality of PostSearchData#raw_data. (#7275)
This commit fixes the following quality issues with `PostSearchData#raw_data`:

1. URLs are being tokenized and links with similar href and characters
are being duplicated in the raw data.

`Post#cooked`:

```
<p><a href=\"https://meta.discourse.org/some.png\" class=\"onebox\" target=\"_blank\" rel=\"nofollow noopener\">https://meta.discourse.org/some.png</a></p>
```

`PostSearchData#raw_data` Before:

```
This is a test topic 0 Uncategorized https://meta.discourse.org/some.png discourse org/some png https://meta.discourse.org/some.png discourse org/some png
```

`PostSearchData#raw_data` After:

```
This is a test topic 0 Uncategorized https://meta.discourse.org/some.png meta discourse org
```

2. Lightbox content being included in search pollutes the
`PostSearchData#raw_data` unnecessarily.

From 28 March 2018 to 28 March 2019, searches for the term `image` on
`meta.discourse.org` had a click-through rate of 2.1%. Non-lightboxed images are not included in indexing for search, yet we were indexing content within a lightbox. Also, searches for terms like `image` were affected because we were using `Pasted image` as the filename for
uploads that were pasted.

`Post#cooked`

```
<p>Let me see how I can fix this image<br>\n<div class=\"lightbox-wrapper\"><a class=\"lightbox\" href=\"https://meta.discourse.org/some.png\" title=\"some.png\" rel=\"nofollow noopener\"><img src=\"https://meta.discourse.org/some.png\" width=\"275\" height=\"299\"><div class=\"meta\">\n<svg class=\"fa d-icon d-icon-far-image svg-icon\" aria-hidden=\"true\"><use xlink:href=\"#far-image\"></use></svg><span class=\"filename\">some.png</span><span class=\"informations\">1750×2000</span><svg class=\"fa d-icon d-icon-discourse-expand svg-icon\" aria-hidden=\"true\"><use xlink:href=\"#discourse-expand\"></use></svg>\n</div></a></div></p>
```

`PostSearchData#raw_data` Before:

```
This is a test topic 0 Uncategorized Let me see how I can fix this image some.png png https://meta.discourse.org/some.png discourse org/some png some.png png 1750×2000
```

`PostSearchData#raw_data` After:

```
This is a test topic 0 Uncategorized Let me see how I can fix this image
```

In terms of indexing performance, we now have to parse the given HTML
through nokogiri twice. However, performance is not a huge worry here, since a string of length 194170 takes only 30ms
to scrub, and the indexing takes place in a background job.
2019-04-01 10:14:29 +08:00

151 lines
5.7 KiB
Ruby

require 'rails_helper'
describe SearchIndexer do
# Fixed post id used by the examples that write to the posts index directly.
let(:post_id) { 99 }

# Scrubs an HTML fragment through SearchIndexer.scrub_html_for_search.
# `strip_diacritics` is always forwarded explicitly and defaults to false.
def scrub(html, strip_diacritics: false)
  options = { strip_diacritics: strip_diacritics }
  SearchIndexer.scrub_html_for_search(html, **options)
end
# Under the zh_CN locale, text without any whitespace is expected to end up
# as two space-separated words in the stored raw_data.
it 'correctly indexes chinese' do
  SiteSetting.default_locale = 'zh_CN'

  text = "你好世界"
  # Sanity check: the input itself is a single whitespace-delimited token.
  expect(text.split(" ").size).to eq(1)

  SearchIndexer.update_posts_index(post_id, text, "", "", nil)

  indexed = PostSearchData.where(post_id: post_id).pluck(:raw_data).first
  expect(indexed.split(' ').size).to eq(2)
end
# The scrubbed text of a lazyYT embed is expected to be exactly the value of
# its data-youtube-title attribute.
it 'extract youtube title' do
  title = "Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive]"
  html = "<div class=\"lazyYT\" data-youtube-id=\"lmFgeFh2nlw\" data-youtube-title=\"#{title}\" " \
    "data-width=\"480\" data-height=\"270\" data-parameters=\"feature=oembed&amp;wmode=opaque\"></div>"

  expect(scrub(html)).to eq(title)
end
# For an anchor, the scrubbed text is expected to be the href followed by the
# link text.
it 'extract a link' do
  anchor = "<a href='http://meta.discourse.org/'>link</a>"

  expect(scrub(anchor)).to eq("http://meta.discourse.org/ link")
end
# Calls SearchIndexer.scrub_html_for_search directly, without a
# strip_diacritics argument, and checks the output for each value of
# SiteSetting.search_ignore_accents.
it 'uses ignore_accent setting to strip diacritics' do
  html = "<p>HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好</p>"

  # Setting value => expected scrubbed text; iterated in insertion order
  # (true first, then false).
  expected_by_setting = {
    true => "HELLO Heterogeneite Здравствуите هتاف للترحيب 你好",
    false => "HELLO Hétérogénéité Здравствуйте هتاف للترحيب 你好"
  }

  expected_by_setting.each do |ignore_accents, expected|
    SiteSetting.search_ignore_accents = ignore_accents
    expect(SearchIndexer.scrub_html_for_search(html)).to eq(expected)
  end
end
# Only the alt text of the two plain <img> tags is expected to survive
# scrubbing; nothing from the lightbox wrapper (hrefs, title, filename,
# dimensions) should appear in the result.
it "doesn't index local files" do
  uploads = "#{Discourse.base_url_no_prefix}/uploads/episodeinteractive"

  html = <<~HTML
    <p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
    <p><img src="#{uploads}/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png" alt="51%20PM" width="289" height="398"></p>
    <div class="lightbox-wrapper">
    <a class="lightbox" href="#{uploads}/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg" data-download-href="#{uploads}/16790095df3baf318fb2eb1d7e5d7860dc45d48b" title="Untitled design (21).jpg" rel="nofollow noopener">
    <img src="#{uploads}/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg" alt="Untitled%20design%20(21)" width="563" height="500">
    <div class="meta">
    <svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg>
    <span class="filename">Untitled design (21).jpg</span>
    <span class="informations">1280x1136 472 KB</span>
    <svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
    </div>
    </a>
    </div>
  HTML

  expect(scrub(html)).to eq("Discourse 51%20PM")
end
# A row stored with an out-of-date version must be rewritten with the current
# Search::INDEX_VERSION on the next update, and a later update must still
# overwrite raw_data.
it 'correctly indexes a post according to version' do
  # Seed a row, then force its version to -1 so it no longer matches
  # Search::INDEX_VERSION.
  SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
  # `update!` replaces the deprecated `update_attributes!` alias.
  PostSearchData.find_by(post_id: post_id).update!(version: -1)

  data = "<a>This</a> is a test"
  SearchIndexer.update_posts_index(post_id, "", "", nil, data)

  raw_data, locale, version = PostSearchData
    .where(post_id: post_id)
    .pluck(:raw_data, :locale, :version)
    .first

  expect(raw_data).to eq("This is a test")
  expect(locale).to eq("en")
  expect(version).to eq(Search::INDEX_VERSION)

  # A subsequent update replaces the previously indexed text.
  SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil)

  raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data).first
  expect(raw_data).to eq("tester")
end
describe '.index' do
# Post under test; as a `let`, it is created on first reference.
let(:post) { Fabricate(:post) }

# SearchIndexer is enabled before each example in this group and disabled
# again afterwards.
before { SearchIndexer.enable }
after { SearchIndexer.disable }
# Creating a post adds one PostSearchData row; changing the post's raw text or
# moving it to another topic changes the stored raw_data.
it 'should index posts correctly' do
  # The first reference to `post` inside the block is what creates it.
  expect { post }.to change { PostSearchData.count }.by(1)

  expect do
    post.update!(raw: "this is new content")
  end.to change { post.reload.post_search_data.raw_data }

  expect do
    other_topic = Fabricate(:topic)
    post.update!(topic_id: other_topic.id)
  end.to change { post.reload.post_search_data.raw_data }
end
# Saving a small-action post whose raw is empty must not add a PostSearchData
# row.
it 'should not index posts with empty raw' do
  expect {
    # Saved with validations skipped — presumably because an empty raw would
    # otherwise be rejected; confirm against the Post model.
    blank_post = Fabricate.build(:post, raw: "", post_type: Post.types[:small_action])
    blank_post.save!(validate: false)
  }.to_not change { PostSearchData.count }
end
# A post whose raw is a bare URL is expected to index as: topic title,
# category name, the URL once, then "meta discourse org".
it "should not tokenize urls and duplicate title and href in <a>" do
  url = "https://meta.discourse.org/some.png"

  post = Fabricate(:post, raw: "#{url}\n")
  post.rebake!
  topic = post.reload.topic

  expected = [topic.title, topic.category.name, url, "meta discourse org"].join(" ")
  expect(post.post_search_data.raw_data).to eq(expected)
end
# A post consisting of a sentence plus an <img> is expected to index only the
# topic title, category name and the sentence.
it 'should not include lightbox in search' do
  sentence = "Let me see how I can fix this image"
  src = "https://meta.discourse.org/some.png"
  raw = <<~RAW
    #{sentence}
    <img src="#{src}" width="275" height="299">
  RAW

  Jobs.run_immediately!
  SiteSetting.max_image_height = 2000
  SiteSetting.crawl_images = true
  # FastImage.size is stubbed to report a 1750x2000 image.
  # NOTE(review): presumably this is what makes the cooked post wrap the image
  # in a lightbox — confirm against the post-processing code.
  FastImage.expects(:size).returns([1750, 2000])

  post = Fabricate(:post, raw: raw)
  post.rebake!
  topic = post.reload.topic

  expect(post.post_search_data.raw_data).to eq(
    "#{topic.title} #{topic.category.name} #{sentence}"
  )
end
end
end