FIX: don't extract links from .elided parts

This commit is contained in:
Régis Hanol 2016-12-05 15:19:15 +01:00
parent 59523aef9d
commit 06469ef0ce
2 changed files with 31 additions and 28 deletions

View File

@ -19,7 +19,7 @@ class PostAnalyzer
result = Oneboxer.apply(cooked, topic_id: @topic_id) do |url, _|
@found_oneboxes = true
Oneboxer.invalidate(url) if args.last[:invalidate_oneboxes]
Oneboxer.cached_onebox url
Oneboxer.cached_onebox(url)
end
cooked = result.to_html if result.changed?
@ -30,10 +30,9 @@ class PostAnalyzer
def image_count
return 0 unless @raw.present?
cooked_document.search("img").reject do |t|
dom_class = t["class"]
if dom_class
(Post.white_listed_image_classes & dom_class.split(" ")).count > 0
cooked_stripped.css("img").reject do |t|
if dom_class = t["class"]
(Post.white_listed_image_classes & dom_class.split).count > 0
end
end.count
end
@ -42,8 +41,8 @@ class PostAnalyzer
def attachment_count
return 0 unless @raw.present?
attachments = cooked_document.css("a.attachment[href^=\"#{Discourse.store.absolute_base_url}\"]")
attachments += cooked_document.css("a.attachment[href^=\"#{Discourse.store.relative_base_url}\"]") if Discourse.store.internal?
attachments = cooked_stripped.css("a.attachment[href^=\"#{Discourse.store.absolute_base_url}\"]")
attachments += cooked_stripped.css("a.attachment[href^=\"#{Discourse.store.relative_base_url}\"]") if Discourse.store.internal?
attachments.count
end
@ -51,13 +50,6 @@ class PostAnalyzer
return [] if @raw.blank?
return @raw_mentions if @raw_mentions.present?
# strip quotes, code blocks and oneboxes
cooked_stripped = cooked_document
cooked_stripped.css("aside.quote").remove
cooked_stripped.css("pre").remove
cooked_stripped.css("code").remove
cooked_stripped.css(".onebox").remove
raw_mentions = cooked_stripped.css('.mention, .mention-group').map do |e|
if name = e.inner_text
name = name[1..-1]
@ -105,11 +97,10 @@ class PostAnalyzer
@raw_links = []
cooked_document.search("a").each do |l|
cooked_stripped.css("a[href]").each do |l|
# Don't include @mentions in the link count
next if l.attributes['href'].nil? || link_is_a_mention?(l)
url = l.attributes['href'].to_s
@raw_links << url
next if l['href'].blank? || link_is_a_mention?(l)
@raw_links << l['href'].to_s
end
@raw_links
@ -122,13 +113,18 @@ class PostAnalyzer
private
def cooked_document
@cooked_document ||= Nokogiri::HTML.fragment(cook(@raw, topic_id: @topic_id))
end
def cooked_stripped
@cooked_stripped ||= begin
doc = Nokogiri::HTML.fragment(cook(@raw, topic_id: @topic_id))
doc.css("pre, code, aside.quote, .onebox, .elided").remove
doc
end
end
def link_is_a_mention?(l)
html_class = l['class']
return false if html_class.blank?
html_class.to_s['mention'] && l['href'].to_s[/^\/users\//]
end
def link_is_a_mention?(l)
html_class = l.attributes['class']
return false if html_class.nil?
html_class.to_s == 'mention' && l.attributes['href'].to_s =~ /^\/users\//
end
end

View File

@ -38,8 +38,9 @@ describe PostAnalyzer do
context "links" do
let(:raw_no_links) { "hello world my name is evil trout" }
let(:raw_one_link_md) { "[jlawr](http://www.imdb.com/name/nm2225369)" }
let(:raw_two_links_html) { "<a href='http://disneyland.disney.go.com/'>disney</a> <a href='http://reddit.com'>reddit</a>"}
let(:raw_three_links) { "http://discourse.org and http://discourse.org/another_url and http://www.imdb.com/name/nm2225369"}
let(:raw_two_links_html) { "<a href='http://disneyland.disney.go.com/'>disney</a> <a href='http://reddit.com'>reddit</a>" }
let(:raw_three_links) { "http://discourse.org and http://discourse.org/another_url and http://www.imdb.com/name/nm2225369" }
let(:raw_elided) { "<details class='elided'>\n<summary title='Show trimmed content'>&#183;&#183;&#183;</summary>\nhttp://discourse.org\n</details>" }
describe "raw_links" do
it "returns a blank collection for a post with no links" do
@ -61,6 +62,12 @@ describe PostAnalyzer do
post_analyzer = PostAnalyzer.new(raw_three_links, default_topic_id)
expect(post_analyzer.raw_links).to eq(["http://discourse.org", "http://discourse.org/another_url", "http://www.imdb.com/name/nm2225369"])
end
it "doesn't extract links from elided part" do
post_analyzer = PostAnalyzer.new(raw_elided, default_topic_id)
post_analyzer.expects(:cook).returns("<p><details class='elided'>\n<summary title='Show trimmed content'>&#183;&#183;&#183;</summary>\n<a href='http://discourse.org'>discourse.org</a>\n</details></p>")
expect(post_analyzer.raw_links).to be_blank
end
end
describe "linked_hosts" do