mirror of
https://github.com/discourse/discourse.git
synced 2024-12-15 07:33:51 +08:00
FIX: don't index urls to local files
This commit is contained in:
parent
74eec1849d
commit
39a2d92417
|
@ -167,6 +167,8 @@ class SearchIndexer
|
||||||
|
|
||||||
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
||||||
|
|
||||||
|
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
||||||
|
|
||||||
def self.strip_diacritics(str)
|
def self.strip_diacritics(str)
|
||||||
s = str.unicode_normalize(:nfkd)
|
s = str.unicode_normalize(:nfkd)
|
||||||
s.gsub!(DIACRITICS, "")
|
s.gsub!(DIACRITICS, "")
|
||||||
|
@ -196,12 +198,12 @@ class SearchIndexer
|
||||||
attributes = Hash[*attributes.flatten]
|
attributes = Hash[*attributes.flatten]
|
||||||
|
|
||||||
ATTRIBUTES.each do |name|
|
ATTRIBUTES.each do |name|
|
||||||
characters(attributes[name]) if attributes[name].present?
|
if attributes[name].present?
|
||||||
|
characters(attributes[name]) unless name == "href" && UrlHelper.is_local(attributes[name])
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
DIACRITICS ||= /([\u0300-\u036f]|[\u1AB0-\u1AFF]|[\u1DC0-\u1DFF]|[\u20D0-\u20FF])/
|
|
||||||
|
|
||||||
def characters(str)
|
def characters(str)
|
||||||
str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
|
str = HtmlScrubber.strip_diacritics(str) if @strip_diacritics
|
||||||
scrubbed << " #{str} "
|
scrubbed << " #{str} "
|
||||||
|
|
|
@ -2,6 +2,7 @@ require 'rails_helper'
|
||||||
|
|
||||||
describe SearchIndexer do
|
describe SearchIndexer do
|
||||||
let(:post_id) { 99 }
|
let(:post_id) { 99 }
|
||||||
|
|
||||||
it 'correctly indexes chinese' do
|
it 'correctly indexes chinese' do
|
||||||
SiteSetting.default_locale = 'zh_CN'
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
data = "你好世界"
|
data = "你好世界"
|
||||||
|
@ -37,6 +38,27 @@ describe SearchIndexer do
|
||||||
expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
|
expect(scrubbed).to eq(" HELLO Heterogeneite Здравствуите هتاف للترحيب 你好 ")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "doesn't index local files" do
|
||||||
|
html = <<~HTML
|
||||||
|
<p><img src="https://www.discourse.org/logo.png" alt="Discourse"></p>
|
||||||
|
<p><img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png" alt="51%20PM" width="289" height="398"></p>
|
||||||
|
<div class="lightbox-wrapper">
|
||||||
|
<a class="lightbox" href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg" data-download-href="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b" title="Untitled design (21).jpg" rel="nofollow noopener">
|
||||||
|
<img src="#{Discourse.base_url_no_prefix}/uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg" alt="Untitled%20design%20(21)" width="563" height="500">
|
||||||
|
<div class="meta">
|
||||||
|
<span class="filename">Untitled design (21).jpg</span>
|
||||||
|
<span class="informations">1280x1136 472 KB</span>
|
||||||
|
<span class="expand"></span>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
HTML
|
||||||
|
|
||||||
|
scrubbed = SearchIndexer::HtmlScrubber.scrub(html).gsub(/\s+/, " ")
|
||||||
|
|
||||||
|
expect(scrubbed).to eq(" Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB ")
|
||||||
|
end
|
||||||
|
|
||||||
it 'correctly indexes a post according to version' do
|
it 'correctly indexes a post according to version' do
|
||||||
# Preparing so that they can be indexed to right version
|
# Preparing so that they can be indexed to right version
|
||||||
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
|
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user