mirror of https://github.com/discourse/discourse.git
FEATURE: Whitelists for inline oneboxing
commit 2f8f2aa1dd
parent 4c7b725e19
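
In short: the title-crawling code moves out of the CrawlTopicLink job into a new RetrieveTitle module (lib/retrieve_title.rb), InlineOneboxer gains a skip_cache option plus a per-URL purge in place of clear_cache!, and inline oneboxes for external links are only generated for domains listed in the new inline_onebox_domains_whitelist site setting. A rough end-to-end sketch of the resulting behaviour, using the same example domain as the specs (illustrative only, not part of the diff):

# Illustrative sketch only; it mirrors what the specs below exercise.
SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"

# A bare link to a whitelisted domain gets a miniature (inline) onebox whose
# title is crawled via the new RetrieveTitle module:
InlineOneboxer.lookup("https://eviltrout.com/some-path", skip_cache: true)
# => { url: "https://eviltrout.com/some-path", title: "..." }

# Domains that are not whitelisted are never crawled, so no onebox is returned:
InlineOneboxer.lookup("https://example.com/page", skip_cache: true)
# => nil
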
@@ -1,50 +1,11 @@
-require 'open-uri'
-require 'nokogiri'
-require 'excon'
-require 'final_destination'
+require_dependency 'retrieve_title'
 
 module Jobs
   class CrawlTopicLink < Jobs::Base
 
-    class ReadEnough < StandardError; end
-
-    def self.max_chunk_size(uri)
-      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
-      # one host but amazon is a big one.
-      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
-
-      # Default is 10k
-      10
-    end
-
-    # Fetch the beginning of a HTML document at a url
-    def self.fetch_beginning(url)
-      # Never crawl in test mode
-      return if Rails.env.test?
-
-      fd = FinalDestination.new(url)
-      uri = fd.resolve
-      return "" unless uri
-
-      result = ""
-      streamer = lambda do |chunk, _, _|
-        result << chunk
-
-        # Using exceptions for flow control is really bad, but there really seems to
-        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-        # that matter!)
-        raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
-      end
-      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-      result
-
-    rescue Excon::Errors::SocketError => ex
-      return result if ex.socket_error.is_a?(ReadEnough)
-      raise
-    rescue ReadEnough
-      result
-    end
-
     def execute(args)
       raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?
 
@@ -72,18 +33,9 @@ module Jobs
 
       unless crawled
-        # Fetch the beginning of the document to find the title
-        result = CrawlTopicLink.fetch_beginning(topic_link.url)
-        doc = Nokogiri::HTML(result)
-        if doc
-          title = doc.at('title').try(:inner_text)
-          if title.present?
-            title.gsub!(/\n/, ' ')
-            title.gsub!(/ +/, ' ')
-            title.strip!
-            if title.present?
-              crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
-            end
-          end
+        title = RetrieveTitle.crawl(topic_link.url)
+        if title.present?
+          crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
+        end
       end
     rescue Exception
 
@@ -979,6 +979,7 @@ en:
     show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
     post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
     onebox_domains_blacklist: "A list of domains that will never be oneboxed."
+    inline_onebox_domains_whitelist: "A list of domains that will be oneboxed in miniature form if linked without a title"
     max_oneboxes_per_post: "Maximum number of oneboxes in a post."
 
     logo_url: "The logo image at the top left of your site, should be a wide rectangle shape. If left blank site title text will be shown."
 
@@ -920,6 +920,9 @@ onebox:
   max_oneboxes_per_post:
     default: 50
     client: true
+  inline_onebox_domains_whitelist:
+    default: ''
+    type: list
 
 spam:
   add_rel_nofollow_to_user_content: true
 
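For context on how this setting is consumed (see the InlineOneboxer hunk just below): the value is a pipe-delimited list of hostnames, and a URL only qualifies when its hostname is an exact match for one of the entries. A small illustrative sketch; whitelisted? is a hypothetical helper written for this note, not something the commit adds:

# Hypothetical helper, for illustration only. It mirrors the matching logic
# in InlineOneboxer.lookup below.
require 'uri'

def whitelisted?(url, whitelist)
  uri = URI(url) rescue nil
  return false unless uri && uri.hostname

  # The setting is stored as a pipe-delimited list, e.g. "eviltrout.com|example.com"
  domains = whitelist.split('|')

  # Exact hostname match: "www.eviltrout.com" would not match "eviltrout.com"
  domains.include?(uri.hostname)
end

whitelisted?("https://eviltrout.com/some-path", "eviltrout.com")  # => true
whitelisted?("https://www.eviltrout.com/post", "eviltrout.com")   # => false
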
@@ -1,38 +1,50 @@
+require_dependency 'retrieve_title'
+
 class InlineOneboxer
 
-  def initialize(urls)
+  def initialize(urls, opts=nil)
     @urls = urls
+    @opts = opts || {}
   end
 
   def process
-    @urls.map {|url| InlineOneboxer.lookup(url) }.compact
+    @urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
   end
 
-  def self.clear_cache!
+  def self.purge(url)
     Rails.cache.delete(cache_key(url))
   end
 
   def self.cache_lookup(url)
     Rails.cache.read(cache_key(url))
   end
 
-  def self.lookup(url)
-    cached = cache_lookup(url)
-    return cached if cached.present?
+  def self.lookup(url, opts=nil)
+    opts ||= {}
+
+    unless opts[:skip_cache]
+      cached = cache_lookup(url)
+      return cached if cached.present?
+    end
 
     if route = Discourse.route_for(url)
       if route[:controller] == "topics" &&
         route[:action] == "show" &&
         topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)
 
         # Only public topics
-        if Guardian.new.can_see?(topic)
-          onebox = {
-            url: url,
-            title: Emoji.gsub_emoji_to_unicode(topic.title)
-          }
-          Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
-          return onebox
-        end
+        return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
       end
     end
+
+    if whitelist = SiteSetting.inline_onebox_domains_whitelist
+      uri = URI(url) rescue nil
+
+      domains = whitelist.split('|')
+      if uri.present? &&
+        uri.hostname.present? &&
+        domains.include?(uri.hostname) &&
+        title = RetrieveTitle.crawl(url)
+        return onebox_for(url, title, opts)
+      end
+    end

@@ -41,6 +53,18 @@ class InlineOneboxer
 
   private
 
+  def self.onebox_for(url, title, opts)
+    onebox = {
+      url: url,
+      title: Emoji.gsub_emoji_to_unicode(title)
+    }
+    unless opts[:skip_cache]
+      Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
+    end
+
+    onebox
+  end
+
   def self.cache_key(url)
     "inline_onebox:#{url}"
   end
 
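A brief sketch of the caching behaviour implied by the hunks above and the spec changes below (the URL is illustrative, not part of the diff):

# Illustrative usage only.
url = "https://eviltrout.com/some-path"

# skip_cache: true bypasses the Rails cache on both read and write:
InlineOneboxer.lookup(url, skip_cache: true)

# Without it, a successful lookup is cached for one day...
InlineOneboxer.lookup(url)

# ...and can then be read back or invalidated per URL (clear_cache! is gone):
InlineOneboxer.cache_lookup(url)  # => { url: ..., title: ... } or nil
InlineOneboxer.purge(url)
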
lib/retrieve_title.rb (new file, 70 lines)
@@ -0,0 +1,70 @@
+require_dependency 'final_destination'
+
+module RetrieveTitle
+  class ReadEnough < StandardError; end
+
+  def self.crawl(url)
+    extract_title(fetch_beginning(url))
+  rescue Exception
+    # If there was a connection error, do nothing
+  end
+
+  def self.extract_title(html)
+    title = nil
+    if doc = Nokogiri::HTML(html)
+
+      if node = doc.at('meta[property="og:title"]')
+        title = node['content']
+      end
+
+      title ||= doc.at('title')&.inner_text
+    end
+
+    if title.present?
+      title.gsub!(/\n/, ' ')
+      title.gsub!(/ +/, ' ')
+      title.strip!
+      return title
+    end
+    nil
+  end
+
+  private
+
+  def self.max_chunk_size(uri)
+    # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
+    # one host but amazon is a big one.
+    return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
+
+    # default is 10k
+    10
+  end
+
+  # Fetch the beginning of a HTML document at a url
+  def self.fetch_beginning(url)
+    # Never crawl in test mode
+    return if Rails.env.test?
+
+    fd = FinalDestination.new(url)
+    uri = fd.resolve
+    return "" unless uri
+
+    result = ""
+    streamer = lambda do |chunk, _, _|
+      result << chunk
+
+      # Using exceptions for flow control is really bad, but there really seems to
+      # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
+      # that matter!)
+      raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
+    end
+    Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
+    result
+
+  rescue Excon::Errors::SocketError => ex
+    return result if ex.socket_error.is_a?(ReadEnough)
+    raise
+  rescue ReadEnough
+    result
+  end
+end
 
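A short sketch of how the new module behaves, inferred from the code above and the spec added below (the HTML string is illustrative):

# Illustrative only, not part of the diff.
html = <<~HTML
  <html>
    <title>Bad Title</title>
    <meta property="og:title" content="Good Title" />
  </html>
HTML

# og:title is preferred over the <title> tag, and whitespace is normalised:
RetrieveTitle.extract_title(html)  # => "Good Title"

# crawl streams only the beginning of the document (roughly 10KB, 80KB for
# Amazon hosts) and swallows connection errors, returning nil in that case:
RetrieveTitle.crawl("https://example.com")
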
@@ -3,17 +3,13 @@ require_dependency 'inline_oneboxer'
 
 describe InlineOneboxer do
 
-  before do
-    InlineOneboxer.clear_cache!
-  end
-
   it "should return nothing with empty input" do
     expect(InlineOneboxer.new([]).process).to be_blank
   end
 
   it "can onebox a topic" do
     topic = Fabricate(:topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
     expect(results).to be_present
     expect(results[0][:url]).to eq(topic.url)
     expect(results[0][:title]).to eq(topic.title)

@@ -21,13 +17,18 @@ describe InlineOneboxer do
 
   it "doesn't onebox private messages" do
     topic = Fabricate(:private_message_topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
     expect(results).to be_blank
   end
 
+  context "caching" do
+    let(:topic) { Fabricate(:topic) }
+
+    before do
+      InlineOneboxer.purge(topic.url)
+    end
+
     it "puts an entry in the cache" do
-      topic = Fabricate(:topic)
       expect(InlineOneboxer.cache_lookup(topic.url)).to be_blank
 
       result = InlineOneboxer.lookup(topic.url)

@@ -43,7 +44,7 @@ describe InlineOneboxer do
   context ".lookup" do
     it "can lookup one link at a time" do
       topic = Fabricate(:topic)
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
       expect(onebox).to be_present
       expect(onebox[:url]).to eq(topic.url)
       expect(onebox[:title]).to eq(topic.title)

@@ -56,12 +57,30 @@ describe InlineOneboxer do
 
     it "will return the fancy title" do
       topic = Fabricate(:topic, title: "Hello :pizza: with an emoji")
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
       expect(onebox).to be_present
       expect(onebox[:url]).to eq(topic.url)
       expect(onebox[:title]).to eq("Hello 🍕 with an emoji")
     end
 
+    it "will not crawl domains that aren't whitelisted" do
+      onebox = InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true)
+      expect(onebox).to be_blank
+    end
+
+    it "will lookup whitelisted domains" do
+      SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"
+      RetrieveTitle.stubs(:crawl).returns("Evil Trout's Blog")
+
+      onebox = InlineOneboxer.lookup(
+        "https://eviltrout.com/some-path",
+        skip_cache: true
+      )
+      expect(onebox).to be_present
+      expect(onebox[:url]).to eq("https://eviltrout.com/some-path")
+      expect(onebox[:title]).to eq("Evil Trout's Blog")
+    end
+
   end
 
spec/components/retrieve_title_spec.rb (new file, 40 lines)
@@ -0,0 +1,40 @@
+require 'rails_helper'
+require_dependency 'retrieve_title'
+
+describe RetrieveTitle do
+
+  context "extract_title" do
+
+    it "will extract the value from the title tag" do
+      title = RetrieveTitle.extract_title(
+        "<html><title>My Cool Title</title></html>"
+      )
+
+      expect(title).to eq("My Cool Title")
+    end
+
+    it "will strip whitespace" do
+      title = RetrieveTitle.extract_title(
+        "<html><title> Another Title\n\n </title></html>"
+      )
+
+      expect(title).to eq("Another Title")
+    end
+
+    it "will prefer the title from an opengraph tag" do
+      title = RetrieveTitle.extract_title(<<~HTML
+        <html>
+          <title>Bad Title</title>
+          <meta property="og:title" content="Good Title" />
+        </html>
+      HTML
+      )
+
+      expect(title).to eq("Good Title")
+    end
+
+  end
+
+end