FEATURE: Whitelists for inline oneboxing

2024-11-25 09:42:07 +08:00 · 2017-07-21 15:29:04 -04:00 · 2017-07-21 15:29:04 -04:00 · 2f8f2aa1dd
commit 2f8f2aa1dd
parent 4c7b725e19
7 changed files with 185 additions and 76 deletions
--- a/app/jobs/regular/crawl_topic_link.rb
+++ b/app/jobs/regular/crawl_topic_link.rb
@ -1,50 +1,11 @@
 require 'open-uri'
 require 'nokogiri'
 require 'excon'
-require 'final_destination'
+require_dependency 'retrieve_title'

 module Jobs
  class CrawlTopicLink < Jobs::Base

-    class ReadEnough < StandardError; end
-
-    def self.max_chunk_size(uri)
-      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
-      # one host but amazon is a big one.
-      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
-
-      # Default is 10k
-      10
-    end
-
-    # Fetch the beginning of a HTML document at a url
-    def self.fetch_beginning(url)
-      # Never crawl in test mode
-      return if Rails.env.test?
-
-      fd = FinalDestination.new(url)
-      uri = fd.resolve
-      return "" unless uri
-
-      result = ""
-      streamer = lambda do |chunk, _, _|
-        result << chunk
-
-        # Using exceptions for flow control is really bad, but there really seems to
-        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-        # that matter!)
-        raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
-      end
-      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-      result
-
-    rescue Excon::Errors::SocketError => ex
-      return result if ex.socket_error.is_a?(ReadEnough)
-      raise
-    rescue ReadEnough
-      result
-    end
-
    def execute(args)
      raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?

@ -72,18 +33,9 @@ module Jobs

        unless crawled
          # Fetch the beginning of the document to find the title
-          result = CrawlTopicLink.fetch_beginning(topic_link.url)
-          doc = Nokogiri::HTML(result)
-          if doc
-            title = doc.at('title').try(:inner_text)
-            if title.present?
-              title.gsub!(/\n/, ' ')
-              title.gsub!(/ +/, ' ')
-              title.strip!
-              if title.present?
-                crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
-              end
-            end
+          title = RetrieveTitle.crawl(topic_link.url)
+          if title.present?
+            crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..254]]) == 1)
          end
        end
      rescue Exception
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -979,6 +979,7 @@ en:
    show_pinned_excerpt_desktop: "Show excerpt on pinned topics in desktop view."
    post_onebox_maxlength: "Maximum length of a oneboxed Discourse post in characters."
    onebox_domains_blacklist: "A list of domains that will never be oneboxed."
+    inline_onebox_domains_whitelist: "A list of domains that will be oneboxed in miniature form if linked without a title"
    max_oneboxes_per_post: "Maximum number of oneboxes in a post."

    logo_url: "The logo image at the top left of your site, should be a wide rectangle shape. If left blank site title text will be shown."
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@ -920,6 +920,9 @@ onebox:
  max_oneboxes_per_post:
    default: 50
    client: true
+  inline_onebox_domains_whitelist:
+    default: ''
+    type: list

 spam:
  add_rel_nofollow_to_user_content: true
--- a/lib/inline_oneboxer.rb
+++ b/lib/inline_oneboxer.rb
@ -1,38 +1,50 @@
+require_dependency 'retrieve_title'
+
 class InlineOneboxer

-  def initialize(urls)
+  def initialize(urls, opts=nil)
    @urls = urls
+    @opts = opts || {}
  end

  def process
-    @urls.map {|url| InlineOneboxer.lookup(url) }.compact
+    @urls.map {|url| InlineOneboxer.lookup(url, @opts) }.compact
  end

-  def self.clear_cache!
+  def self.purge(url)
+    Rails.cache.delete(cache_key(url))
  end

  def self.cache_lookup(url)
    Rails.cache.read(cache_key(url))
  end

-  def self.lookup(url)
-    cached = cache_lookup(url)
-    return cached if cached.present?
+  def self.lookup(url, opts=nil)
+    opts ||= {}
+
+    unless opts[:skip_cache]
+      cached = cache_lookup(url)
+      return cached if cached.present?
+    end

    if route = Discourse.route_for(url)
      if route[:controller] == "topics" &&
        route[:action] == "show" &&
        topic = (Topic.where(id: route[:topic_id].to_i).first rescue nil)

-        # Only public topics
-        if Guardian.new.can_see?(topic)
-          onebox = {
-            url: url,
-            title: Emoji.gsub_emoji_to_unicode(topic.title)
-          }
-          Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
-          return onebox
-        end
+        return onebox_for(url, topic.title, opts) if Guardian.new.can_see?(topic)
+      end
+    end
+
+    if whitelist = SiteSetting.inline_onebox_domains_whitelist
+      uri = URI(url) rescue nil
+
+      domains = whitelist.split('|')
+      if uri.present? &&
+        uri.hostname.present? &&
+        domains.include?(uri.hostname) &&
+        title = RetrieveTitle.crawl(url)
+          return onebox_for(url, title, opts)
      end
    end

@ -41,6 +53,18 @@ class InlineOneboxer

  private

+    def self.onebox_for(url, title, opts)
+      onebox = {
+        url: url,
+        title: Emoji.gsub_emoji_to_unicode(title)
+      }
+      unless opts[:skip_cache]
+        Rails.cache.write(cache_key(url), onebox, expires_in: 1.day)
+      end
+
+      onebox
+    end
+
    def self.cache_key(url)
      "inline_onebox:#{url}"
    end
--- a/lib/retrieve_title.rb
+++ b/lib/retrieve_title.rb
@ -0,0 +1,70 @@
+require_dependency 'final_destination'
+
+module RetrieveTitle
+  class ReadEnough < StandardError; end
+
+  def self.crawl(url)
+    extract_title(fetch_beginning(url))
+  rescue Exception
+    # If there was a connection error, do nothing
+  end
+
+  def self.extract_title(html)
+    title = nil
+    if doc = Nokogiri::HTML(html)
+
+      if node = doc.at('meta[property="og:title"]')
+        title = node['content']
+      end
+
+      title ||= doc.at('title')&.inner_text
+    end
+
+    if title.present?
+      title.gsub!(/\n/, ' ')
+      title.gsub!(/ +/, ' ')
+      title.strip!
+      return title
+    end
+    nil
+  end
+
+  private
+
+    def self.max_chunk_size(uri)
+      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
+      # one host but amazon is a big one.
+      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
+
+      # default is 10k
+      10
+    end
+
+    # Fetch the beginning of a HTML document at a url
+    def self.fetch_beginning(url)
+      # Never crawl in test mode
+      return if Rails.env.test?
+
+      fd = FinalDestination.new(url)
+      uri = fd.resolve
+      return "" unless uri
+
+      result = ""
+      streamer = lambda do |chunk, _, _|
+        result << chunk
+
+        # Using exceptions for flow control is really bad, but there really seems to
+        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
+        # that matter!)
+        raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
+      end
+      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
+      result
+
+    rescue Excon::Errors::SocketError => ex
+      return result if ex.socket_error.is_a?(ReadEnough)
+      raise
+    rescue ReadEnough
+      result
+    end
+end
--- a/spec/components/inline_oneboxer_spec.rb
+++ b/spec/components/inline_oneboxer_spec.rb
@ -3,17 +3,13 @@ require_dependency 'inline_oneboxer'

 describe InlineOneboxer do

-  before do
-    InlineOneboxer.clear_cache!
-  end
-
  it "should return nothing with empty input" do
    expect(InlineOneboxer.new([]).process).to be_blank
  end

  it "can onebox a topic" do
    topic = Fabricate(:topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
    expect(results).to be_present
    expect(results[0][:url]).to eq(topic.url)
    expect(results[0][:title]).to eq(topic.title)
@ -21,13 +17,18 @@ describe InlineOneboxer do

  it "doesn't onebox private messages" do
    topic = Fabricate(:private_message_topic)
-    results = InlineOneboxer.new([topic.url]).process
+    results = InlineOneboxer.new([topic.url], skip_cache: true).process
    expect(results).to be_blank
  end

  context "caching" do
+    let(:topic) { Fabricate(:topic) }
+
+    before do
+      InlineOneboxer.purge(topic.url)
+    end
+
    it "puts an entry in the cache" do
-      topic = Fabricate(:topic)
      expect(InlineOneboxer.cache_lookup(topic.url)).to be_blank

      result = InlineOneboxer.lookup(topic.url)
@ -43,7 +44,7 @@ describe InlineOneboxer do
  context ".lookup" do
    it "can lookup one link at a time" do
      topic = Fabricate(:topic)
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
      expect(onebox).to be_present
      expect(onebox[:url]).to eq(topic.url)
      expect(onebox[:title]).to eq(topic.title)
@ -56,12 +57,30 @@ describe InlineOneboxer do

    it "will return the fancy title" do
      topic = Fabricate(:topic, title: "Hello :pizza: with an emoji")
-      onebox = InlineOneboxer.lookup(topic.url)
+      onebox = InlineOneboxer.lookup(topic.url, skip_cache: true)
      expect(onebox).to be_present
      expect(onebox[:url]).to eq(topic.url)
      expect(onebox[:title]).to eq("Hello 🍕 with an emoji")
    end

+    it "will not crawl domains that aren't whitelisted" do
+      onebox = InlineOneboxer.lookup("https://eviltrout.com", skip_cache: true)
+      expect(onebox).to be_blank
+    end
+
+    it "will lookup whitelisted domains" do
+      SiteSetting.inline_onebox_domains_whitelist = "eviltrout.com"
+      RetrieveTitle.stubs(:crawl).returns("Evil Trout's Blog")
+
+      onebox = InlineOneboxer.lookup(
+        "https://eviltrout.com/some-path",
+        skip_cache: true
+      )
+      expect(onebox).to be_present
+      expect(onebox[:url]).to eq("https://eviltrout.com/some-path")
+      expect(onebox[:title]).to eq("Evil Trout's Blog")
+    end
+
  end


--- a/spec/components/retrieve_title_spec.rb
+++ b/spec/components/retrieve_title_spec.rb
@ -0,0 +1,40 @@
+require 'rails_helper'
+require_dependency 'retrieve_title'
+
+describe RetrieveTitle do
+
+  context "extract_title" do
+
+    it "will extract the value from the title tag" do
+      title = RetrieveTitle.extract_title(
+        "<html><title>My Cool Title</title></html>"
+      )
+
+      expect(title).to eq("My Cool Title")
+    end
+
+    it "will strip whitespace" do
+      title = RetrieveTitle.extract_title(
+        "<html><title>   Another Title\n\n </title></html>"
+      )
+
+      expect(title).to eq("Another Title")
+    end
+
+    it "will prefer the title from an opengraph tag" do
+      title = RetrieveTitle.extract_title(<<~HTML
+        <html>
+          <title>Bad Title</title>
+          <meta property="og:title" content="Good Title" />
+        </html>
+        HTML
+      )
+
+      expect(title).to eq("Good Title")
+    end
+
+  end
+
+end
+
+