# discourse/app/models/topic_embed.rb

require 'erb'
require_dependency 'nokogiri'
class TopicEmbed < ActiveRecord::Base
# NOTE(review): the `with_deleted` scope used in the callback below presumably
# comes from Trashable — confirm against that concern.
include Trashable

belongs_to :topic
belongs_to :post

validates_presence_of :embed_url
validates_uniqueness_of :embed_url

# When a new embed is about to be validated, look for a soft-deleted row
# (deleted_at IS NOT NULL) with the same embed_url and call destroy! on it.
# Presumably this keeps the uniqueness validation from rejecting the new row
# because of a trashed one — confirm.
before_validation(on: :create) do
  stale_embed = TopicEmbed.with_deleted.where('deleted_at IS NOT NULL AND embed_url = ?', embed_url).first
  stale_embed.destroy! if stale_embed
end
# Mutable holder for the pieces extracted from a fetched remote page:
# its title, its cleaned-up body HTML and the matched author (if any).
class FetchResponse
  attr_accessor :title
  attr_accessor :body
  attr_accessor :author
end
# Canonical form of an embed URL used for lookups: lower-cased, with one
# trailing slash removed, the first run of dashes collapsed to a single dash,
# and surrounding whitespace stripped (in that order).
def self.normalize_url(url)
  lowered = url.downcase
  without_trailing_slash = lowered.sub(/\/$/, '')
  dash_collapsed = without_trailing_slash.sub(/\-+/, '-')
  dash_collapsed.strip
end
# Builds the "imported from <link>" HTML footer for an embedded article.
#
# url - the source URL of the article (String).
#
# The URL is HTML-escaped before being placed in the single-quoted href
# attribute and in the link text, so characters such as ' < > & in a feed
# supplied URL cannot break out of the attribute or inject markup.
#
# Returns the footer as an HTML String.
def self.imported_from_html(url)
  safe_url = ERB::Util.html_escape(url)
  link = "<a href='#{safe_url}'>#{safe_url}</a>"
  "\n<hr>\n<small>#{I18n.t('embed.imported_from', link: link)}</small>\n"
end
# Matches a percent sign that was itself percent-encoded ("%25" followed by
# two hex digits), i.e. the signature of a double-encoded escape sequence.
DOUBLE_ESCAPED_EXPR = /%25([0-9a-f]{2})/i

# Parser used for percent-encoding. URI.encode / URI.escape were removed in
# Ruby 3.0; they delegated to this parser's #escape, so behavior is unchanged.
URI_ESCAPER = URI::RFC2396_Parser.new

# Percent-encodes unsafe characters in +uri+ without double-encoding
# sequences that were already escaped.
# https://stackoverflow.com/a/37599235
#
# uri     - the URI String to encode.
# pattern - Regexp (or String of characters) describing what must be escaped;
#           defaults to URI::UNSAFE.
#
# Returns a new encoded String.
def self.escape_uri(uri, pattern = URI::UNSAFE)
  encoded = URI_ESCAPER.escape(uri, pattern)
  # "%20" in the input was just turned into "%2520"; fold it back.
  encoded.gsub(DOUBLE_ESCAPED_EXPR, '%\1')
end
# Import an article from a source (RSS/Atom/Other).
#
# user     - the user the created or revised post is attributed to.
# url      - source URL of the article; must start with http:// or https://.
# title    - title for the topic created on first import.
# contents - article HTML (may be nil).
#
# On first import a topic, a raw-HTML post and a TopicEmbed row are created in
# one transaction. On later imports the existing post is revised only when the
# SHA1 of the contents differs from the stored one.
#
# Returns the post (new or existing), or nil when the URL is not http(s).
def self.import(user, url, title, contents)
  # \A instead of ^: ^ matches at the start of any line, so a multi-line
  # string with an http(s) URL on a later line would have been accepted.
  return unless url =~ /\Ahttps?\:\/\//

  contents = first_paragraph_from(contents) if SiteSetting.embed_truncate

  # Build a fresh String rather than appending with <<: this leaves the
  # caller's string untouched and copes with nil or a non-String value coming
  # back from first_paragraph_from (interpolation calls #to_s).
  contents = "#{contents}#{imported_from_html(url)}"

  url = normalize_url(url)
  embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  if embed.blank?
    # No embed yet: create the topic, the post and the embed together.
    Topic.transaction do
      eh = EmbeddableHost.record_for_url(url)

      creator = PostCreator.new(user,
                                title: title,
                                raw: absolutize_urls(url, contents),
                                skip_validations: true,
                                cook_method: Post.cook_methods[:raw_html],
                                category: eh.try(:category_id))
      post = creator.create
      if post.present?
        TopicEmbed.create!(topic_id: post.topic_id,
                           embed_url: url,
                           content_sha1: content_sha1,
                           post_id: post.id)
      end
    end
  else
    post = embed.post
    # Update the topic if it changed
    if post && post.topic && content_sha1 != embed.content_sha1
      post.revise(user, { raw: absolutize_urls(url, contents) }, skip_validations: true, bypass_rate_limiter: true)
      embed.update_column(:content_sha1, content_sha1)
    end
  end

  post
end
# Fetches a remote page and extracts its readable content.
#
# url - the page URL (String); it is percent-encoded before fetching.
#
# Returns a FetchResponse with:
#   title  - the document title, stripped and passed through the
#            embed_title_scrubber site setting when that is set
#   body   - the Readability-extracted HTML with host-less link/image URLs
#            made absolute and class attributes filtered by the
#            embed_classname_whitelist site setting
#   author - the User whose username_lower matches <meta name="author">,
#            when there is one
# Returns nil when the fetch raises OpenURI::HTTPError or Net::OpenTimeout.
def self.find_remote(url)
  # Loaded lazily so the gem is only required when a page is actually fetched.
  require 'ruby-readability'

  url = escape_uri(url)
  original_uri = URI.parse(url)
  opts = {
    tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
    attributes: %w[href src class],
    remove_empty_nodes: false
  }

  opts[:whitelist] = SiteSetting.embed_whitelist_selector if SiteSetting.embed_whitelist_selector.present?
  opts[:blacklist] = SiteSetting.embed_blacklist_selector if SiteSetting.embed_blacklist_selector.present?
  embed_classname_whitelist = SiteSetting.embed_classname_whitelist if SiteSetting.embed_classname_whitelist.present?

  response = FetchResponse.new
  begin
    # NOTE(review): Kernel#open on a caller-supplied string; this relies on
    # open-uri being loaded and on something providing the allow_redirections
    # option — confirm url is always http(s) by the time it gets here.
    html = open(url, allow_redirections: :safe).read
  rescue OpenURI::HTTPError, Net::OpenTimeout
    return
  end

  raw_doc = Nokogiri::HTML(html)
  auth_element = raw_doc.at('meta[@name="author"]')
  if auth_element.present?
    # The content attribute may be missing (nil), and username_lower holds
    # lower-cased names, so normalise before looking the user up.
    author_name = auth_element[:content].to_s.strip.downcase
    response.author = User.where(username_lower: author_name).first if author_name.present?
  end

  read_doc = Readability::Document.new(html, opts)

  title = raw_doc.title || ''
  title.strip!

  if SiteSetting.embed_title_scrubber.present?
    title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), '')
    title.strip!
  end
  response.title = title

  doc = Nokogiri::HTML(read_doc.content)

  # Give host-less URLs the scheme and host of the page they came from.
  tags = { 'img' => 'src', 'script' => 'src', 'a' => 'href' }
  doc.search(tags.keys.join(',')).each do |node|
    url_param = tags[node.name]
    src = node[url_param]
    next if src.nil? || src.empty?

    begin
      uri = URI.parse(escape_uri(src))
      unless uri.host
        uri.scheme = original_uri.scheme
        uri.host = original_uri.host
        node[url_param] = uri.to_s
      end
    rescue URI::InvalidURIError, URI::InvalidComponentError
      # If there is a mistyped URL, just do nothing
    end
  end

  # Only allow classes in the whitelist. This runs once, after the URL pass:
  # nested inside the loop above it was repeated for every link and was
  # skipped entirely for documents without any a/img/script node.
  allowed_classes = embed_classname_whitelist.blank? ? [] : embed_classname_whitelist.split(/[ ,]+/i)
  doc.search('[class]:not([class=""])').each do |classnode|
    classes = classnode[:class].split(' ').select { |classname| allowed_classes.include?(classname) }
    if classes.empty?
      classnode.delete('class')
    else
      classnode[:class] = classes.join(' ')
    end
  end

  response.body = doc.to_html
  response
end
# Fetches +url+ and imports the result as a topic embed.
#
# import_user - user to attribute the post to; replaced by the fetched page's
#               author when find_remote matched one.
# url         - the page URL (String).
# opts        - optional Hash; opts[:title], when present, overrides the
#               fetched title.
#
# Returns whatever TopicEmbed.import returns, or nil when find_remote
# returned nil.
def self.import_remote(import_user, url, opts = nil)
  opts ||= {}
  response = find_remote(url)
  return if response.nil?

  response.title = opts[:title] if opts[:title].present?
  import_user = response.author if response.author.present?

  TopicEmbed.import(import_user, url, response.title, response.body)
end
# Convert any relative URLs to absolute. RSS is annoying for this.
#
# url      - URL of the page the contents came from; supplies scheme, host
#            and (when not 80/443) port for the rewritten links.
# contents - HTML String whose <a href> and <img src> values are rewritten.
#
# Root-relative values ("/path") are prefixed with scheme://host[:port].
# Protocol-relative values ("//host/path") already name their own host, so
# they only receive the page's scheme instead of being glued onto its host.
#
# Returns the rewritten HTML String.
def self.absolutize_urls(url, contents)
  url = normalize_url(url)
  uri = URI(escape_uri(url))
  prefix = "#{uri.scheme}://#{uri.host}"
  prefix << ":#{uri.port}" if uri.port != 80 && uri.port != 443

  # Wrap in a div so the fragment has a single root to read inner_html from.
  fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>")

  { 'a' => 'href', 'img' => 'src' }.each do |tag, attribute|
    fragment.css(tag).each do |node|
      target = node[attribute]
      next unless target.present?

      if target =~ %r{\A//[^/]}
        node[attribute] = "#{uri.scheme}:#{target}"
      elsif target.start_with?('/')
        node[attribute] = "#{prefix}/#{target.sub(/^\/+/, '')}"
      end
    end
  end

  fragment.at('div').inner_html
end
# Looks up the topic id recorded for an embed URL. The URL is normalized first
# and compared against the lower-cased embed_url column.
#
# Returns the first matching topic_id, or nil when there is none.
def self.topic_id_for_embed(embed_url)
  normalized = normalize_url(embed_url)
  matching_topic_ids = TopicEmbed.where("lower(embed_url) = ?", normalized).pluck(:topic_id)
  matching_topic_ids.first
end
# Extracts a short excerpt from an HTML document.
#
# html - the HTML to excerpt.
#
# Non-empty <p> elements are concatenated in document order until the
# accumulated markup is at least 100 characters long. When the document has
# no paragraph with text, the first <div> is used instead (onebox).
#
# Returns the excerpt as an HTML String, or nil when there is neither a
# non-empty paragraph nor a div.
def self.first_paragraph_from(html)
  doc = Nokogiri::HTML(html)

  result = ""
  doc.css('p').each do |p|
    next unless p.text.present?

    result << p.to_s
    return result if result.size >= 100
  end
  return result unless result.blank?

  # If there is no first paragraph, return the first div (onebox).
  # Serialize it so every path hands back a String; returning the node itself
  # breaks callers that append to or hash the result.
  first_div = doc.css('div').first
  first_div && first_div.to_s
end
# Returns the full remote article body for a post's topic, followed by the
# "imported from" footer. The result is cached per topic for 10 minutes.
#
# NOTE(review): find_remote can return nil, in which case the block raises
# NoMethodError on `.body` — confirm whether callers rely on that.
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"
  Rails.cache.fetch(cache_key, expires_in: 10.minutes) do
    embed_url = TopicEmbed.where(topic_id: post.topic_id).pluck(:embed_url).first
    remote = TopicEmbed.find_remote(embed_url)
    # String#<< returns the receiver, so the block yields the extended body.
    remote.body << TopicEmbed.imported_from_html(embed_url)
  end
end
end
# == Schema Information
#
# Table name: topic_embeds
#
#  id            :integer          not null, primary key
#  topic_id      :integer          not null
#  post_id       :integer          not null
#  embed_url     :string(1000)     not null
#  content_sha1  :string(40)
#  created_at    :datetime         not null
#  updated_at    :datetime         not null
#  deleted_at    :datetime
#  deleted_by_id :integer
#
# Indexes
#
#  index_topic_embeds_on_embed_url  (embed_url) UNIQUE
#