FEATURE: Whitelists for inline oneboxing

This commit is contained in:
Robin Ward 2017-07-21 15:29:04 -04:00
parent 4c7b725e19
commit 2f8f2aa1dd
7 changed files with 185 additions and 76 deletions

View File

@ -1,50 +1,11 @@
require 'open-uri'
require 'nokogiri'
require 'excon'
require 'final_destination'
require_dependency 'retrieve_title'
module Jobs
class CrawlTopicLink < Jobs::Base
class ReadEnough < StandardError; end
def self.max_chunk_size(uri)
# Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
# one host but amazon is a big one.
return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
# Default is 10k
10
end
# Fetch the beginning of a HTML document at a url
def self.fetch_beginning(url)
# Never crawl in test mode
return if Rails.env.test?
fd = FinalDestination.new(url)
uri = fd.resolve
return "" unless uri
result = ""
streamer = lambda do |chunk, _, _|
result << chunk
# Using exceptions for flow control is really bad, but there really seems to
# be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
# that matter!)
raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
end
Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
result
rescue Excon::Errors::SocketError => ex
return result if ex.socket_error.is_a?(ReadEnough)
raise
rescue ReadEnough
result
end
def execute(args)
raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?
@ -72,18 +33,9 @@ module Jobs
unless crawled
# Fetch the beginning of the document to find the title
result = CrawlTopicLink.fetch_beginning(topic_link.url)
doc = Nokogiri::HTML(result)
if doc
title = doc.at('title').try(:inner_text)
if title.present?
title.gsub!(/\n/, ' ')
title.gsub!(/ +/, ' ')
title.strip!
if title.present?
crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
end
end
title = RetrieveTitle.crawl(topic_link.url)
if title.present?
crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
end
end
rescue Exception

View File

@ -979,6 +979,7 @@ en:
show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
onebox_domains_blacklist: "A list of domains that will never be oneboxed."
inline_onebox_domains_whitelist: "A list of domains that will be oneboxed in miniature form if linked without a title"
max_oneboxes_per_post: "Maximum number of oneboxes in a post."
logo_url: "The logo image at the top left of your site, should be a wide rectangle shape. If left blank site title text will be shown."

View File

@ -920,6 +920,9 @@ onebox:
max_oneboxes_per_post:
default: 50
client: true
inline_onebox_domains_whitelist:
default: ''
type: list
spam:
add_rel_nofollow_to_user_content: true

View File

@ -1,38 +1,50 @@
require_dependency 'retrieve_title'
class InlineOneboxer
def initialize(urls)
def initialize(urls, opts=nil)
@urls = urls
@opts = opts || {}
end
def process
@urls.map {|url| InlineOneboxer.lookup(url) }.compact
@urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
end
def self.clear_cache!
def self.purge(url)
Rails.cache.delete(cache_key(url))
end
def self.cache_lookup(url)
Rails.cache.read(cache_key(url))
end
def self.lookup(url)
cached = cache_lookup(url)
return cached if cached.present?
def self.lookup(url, opts=nil)
opts ||= {}
unless opts[:skip_cache]
cached = cache_lookup(url)
return cached if cached.present?
end
if route = Discourse.route_for(url)
if route[:controller] == "topics" &&
route[:action] == "show" &&
topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)
# Only public topics
if Guardian.new.can_see?(topic)
onebox = {
url: url,
title: Emoji.gsub_emoji_to_unicode(topic.title)
}
Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
return onebox
end
return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
end
end
if whitelist = SiteSetting.inline_onebox_domains_whitelist
uri = URI(url) rescue nil
domains = whitelist.split('|')
if uri.present? &&
uri.hostname.present? &&
domains.include?(uri.hostname) &&
title = RetrieveTitle.crawl(url)
return onebox_for(url, title, opts)
end
end
@ -41,6 +53,18 @@ class InlineOneboxer
private
def self.onebox_for(url, title, opts)
onebox = {
url: url,
title: Emoji.gsub_emoji_to_unicode(title)
}
unless opts[:skip_cache]
Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
end
onebox
end
def self.cache_key(url)
"inline_onebox:#{url}"
end

70
lib/retrieve_title.rb Normal file
View File

@ -0,0 +1,70 @@
require_dependency 'final_destination'
module RetrieveTitle
class ReadEnough < StandardError; end
def self.crawl(url)
extract_title(fetch_beginning(url))
rescue Exception
# If there was a connection error, do nothing
end
def self.extract_title(html)
title = nil
if doc = Nokogiri::HTML(html)
if node = doc.at('meta[property="og:title"]')
title = node['content']
end
title ||= doc.at('title')&.inner_text
end
if title.present?
title.gsub!(/\n/, ' ')
title.gsub!(/ +/, ' ')
title.strip!
return title
end
nil
end
private
def self.max_chunk_size(uri)
# Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
# one host but amazon is a big one.
return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
# default is 10k
10
end
# Fetch the beginning of a HTML document at a url
def self.fetch_beginning(url)
# Never crawl in test mode
return if Rails.env.test?
fd = FinalDestination.new(url)
uri = fd.resolve
return "" unless uri
result = ""
streamer = lambda do |chunk, _, _|
result << chunk
# Using exceptions for flow control is really bad, but there really seems to
# be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
# that matter!)
raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
end
Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
result
rescue Excon::Errors::SocketError => ex
return result if ex.socket_error.is_a?(ReadEnough)
raise
rescue ReadEnough
result
end
end

View File

@ -3,17 +3,13 @@ require_dependency 'inline_oneboxer'
describe InlineOneboxer do
before do
InlineOneboxer.clear_cache!
end
it "should return nothing with empty input" do
expect(InlineOneboxer.new([]).process).to be_blank
end
it "can onebox a topic" do
topic = Fabricate(:topic)
results = InlineOneboxer.new([topic.url]).process
results = InlineOneboxer.new([topic.url], skip_cache: true).process
expect(results).to be_present
expect(results[0][:url]).to eq(topic.url)
expect(results[0][:title]).to eq(topic.title)
@ -21,13 +17,18 @@ describe InlineOneboxer do
it "doesn't onebox private messages" do
topic = Fabricate(:private_message_topic)
results = InlineOneboxer.new([topic.url]).process
results = InlineOneboxer.new([topic.url], skip_cache: true).process
expect(results).to be_blank
end
context "caching" do
let(:topic) { Fabricate(:topic) }
before do
InlineOneboxer.purge(topic.url)
end
it "puts an entry in the cache" do
topic = Fabricate(:topic)
expect(InlineOneboxer.cache_lookup(topic.url)).to be_blank
result = InlineOneboxer.lookup(topic.url)
@ -43,7 +44,7 @@ describe InlineOneboxer do
context ".lookup" do
it "can lookup one link at a time" do
topic = Fabricate(:topic)
onebox = InlineOneboxer.lookup(topic.url)
onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
expect(onebox).to be_present
expect(onebox[:url]).to eq(topic.url)
expect(onebox[:title]).to eq(topic.title)
@ -56,12 +57,30 @@ describe InlineOneboxer do
it "will return the fancy title" do
topic = Fabricate(:topic, title: "Hello :pizza: with an emoji")
onebox = InlineOneboxer.lookup(topic.url)
onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
expect(onebox).to be_present
expect(onebox[:url]).to eq(topic.url)
expect(onebox[:title]).to eq("Hello 🍕 with an emoji")
end
it "will not crawl domains that aren't whitelisted" do
onebox = InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true)
expect(onebox).to be_blank
end
it "will lookup whitelisted domains" do
SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"
RetrieveTitle.stubs(:crawl).returns("Evil Trout's Blog")
onebox = InlineOneboxer.lookup(
"https://eviltrout.com/some-path",
skip_cache: true
)
expect(onebox).to be_present
expect(onebox[:url]).to eq("https://eviltrout.com/some-path")
expect(onebox[:title]).to eq("Evil Trout's Blog")
end
end

View File

@ -0,0 +1,40 @@
require 'rails_helper'
require_dependency 'retrieve_title'
describe RetrieveTitle do
context "extract_title" do
it "will extract the value from the title tag" do
title = RetrieveTitle.extract_title(
"<html><title>My Cool Title</title></html>"
)
expect(title).to eq("My Cool Title")
end
it "will strip whitespace" do
title = RetrieveTitle.extract_title(
"<html><title> Another Title\n\n </title></html>"
)
expect(title).to eq("Another Title")
end
it "will prefer the title from an opengraph tag" do
title = RetrieveTitle.extract_title(<<~HTML
<html>
<title>Bad Title</title>
<meta property="og:title" content="Good Title" />
</html>
HTML
)
expect(title).to eq("Good Title")
end
end
end