discourse/app/models/topic_embed.rb
Gerhard Schlager f3b2ee8e1b
FIX: Use default locale for footer of embedded topics (#17760)
The content from the remote site and the footer get cached for 10 minutes, so Discourse should use the default locale instead of the user locale for the footer. Otherwise Discourse might cache the message in a different language.
2022-08-02 20:49:28 +02:00

285 lines
8.2 KiB
Ruby

# frozen_string_literal: true
class TopicEmbed < ActiveRecord::Base
# Soft-delete support — presumably the source of the `with_deleted` scope and
# the `deleted_at` column handling used by the hook below; confirm in Trashable.
include Trashable
# Each embed row references the topic and the first post created for the URL.
belongs_to :topic
belongs_to :post
# `embed_url` is required and must be unique across embeds.
validates_presence_of :embed_url
validates_uniqueness_of :embed_url
# On create, permanently remove a previously soft-deleted embed that has the
# same URL (presumably so it cannot collide with the uniqueness validation).
before_validation(on: :create) do
  trashed = TopicEmbed
    .with_deleted
    .where('deleted_at IS NOT NULL AND embed_url = ?', embed_url)
    .first
  trashed.destroy! unless trashed.nil?
end
# Plain value holder for what `find_remote` extracts from a fetched page.
class FetchResponse
  # Page title (after optional scrubbing).
  attr_accessor :title
  # Extracted article HTML.
  attr_accessor :body
  # User record matched from the page's author meta tag, if any.
  attr_accessor :author
end
# Canonicalises an embed URL for storage and lookup.
#
# Steps, in order: trim surrounding whitespace, lowercase, drop one trailing
# slash, and collapse the first run of consecutive hyphens into a single one.
# Whitespace is trimmed *first* so a trailing slash that is followed by stray
# whitespace is still removed (previously "http://a/ " kept its slash).
#
# @param url [String]
# @return [String] a new normalised string; the argument is not modified
def self.normalize_url(url)
  url.strip.downcase.sub(%r{/\z}, '').sub(/-+/, '-')
end
# Builds the "imported from <link>" footer appended to embedded content.
#
# The text is rendered in the site's default locale rather than the current
# user's locale. The URL is HTML-escaped before being placed in the
# single-quoted href attribute and in the link text, so a URL containing
# quotes or angle brackets cannot break out of the markup.
#
# @param url [String, nil] source URL of the embedded article
# @return [String] HTML snippet
def self.imported_from_html(url)
  # ERB is normally already loaded in a Rails process; the require is then a no-op.
  require 'erb'

  safe_url = ERB::Util.html_escape(url)
  I18n.with_locale(SiteSetting.default_locale) do
    "\n<hr>\n<small>#{I18n.t('embed.imported_from', link: "<a href='#{safe_url}'>#{safe_url}</a>")}</small>\n"
  end
end
# Import an article from a source (RSS/Atom/Other)
#
# The first time a URL is seen this creates a topic, its first post and the
# TopicEmbed row. On later calls it re-assigns the post to `user` if the
# owner differs, and revises the post when the content hash or title changed.
#
# @param user [User] user the post is created as / re-assigned to
# @param url [String] source URL; must start with http:// or https://
# @param title [String, nil] topic title
# @param contents [String, nil] article HTML
# @param category_id [Integer, nil] takes precedence over the embeddable host's category
# @param cook_method [Integer, nil] a value from Post.cook_methods; when nil it
#   is chosen from SiteSetting.embed_support_markdown
# @param tags [Array, nil] only passed on when tagging is enabled
# @return [Post, nil] the created or existing post; nil when the URL is not
#   http(s) (and possibly nil when creation fails or the embed has no post)
def self.import(user, url, title, contents, category_id: nil, cook_method: nil, tags: nil)
  # Anchor with \A, not ^: ^ also matches after any newline, which would let a
  # multi-line value whose first line is not an http(s) URL through.
  return unless url =~ %r{\Ahttps?://}

  if SiteSetting.embed_truncate && cook_method.nil?
    contents = first_paragraph_from(contents)
  end
  contents ||= ''
  contents = contents.dup << imported_from_html(url)

  url = normalize_url(url)
  embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
  # Hash of the un-absolutized content (footer included); used to detect changes.
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  # If there is no embed, create a topic, post and the embed.
  if embed.blank?
    Topic.transaction do
      eh = EmbeddableHost.record_for_url(url)

      cook_method ||= if SiteSetting.embed_support_markdown
        Post.cook_methods[:regular]
      else
        Post.cook_methods[:raw_html]
      end

      create_args = {
        title: title,
        raw: absolutize_urls(url, contents),
        skip_validations: true,
        cook_method: cook_method,
        category: category_id || eh.try(:category_id),
        tags: SiteSetting.tagging_enabled ? tags : nil,
      }
      create_args[:visible] = false if SiteSetting.embed_unlisted?

      post = PostCreator.new(user, create_args).create
      if post.present?
        TopicEmbed.create!(
          topic_id: post.topic_id,
          embed_url: url,
          content_sha1: content_sha1,
          post_id: post.id
        )
      end
    end
  else
    post = embed.post

    # Update the topic if it changed
    if post&.topic
      if post.user != user
        PostOwnerChanger.new(
          post_ids: [post.id],
          topic_id: post.topic_id,
          new_owner: user,
          acting_user: Discourse.system_user
        ).change_owner!

        # make sure the post returned has the right author
        post.reload
      end

      if (content_sha1 != embed.content_sha1) || (title && title != post&.topic&.title)
        changes = { raw: absolutize_urls(url, contents) }
        changes[:title] = title if title.present?
        post.revise(user, changes, skip_validations: true, bypass_rate_limiter: true)
        embed.update!(content_sha1: content_sha1)
      end
    end
  end

  post
end
# Fetches a remote page and extracts its title, readable body and author.
#
# Links, images and scripts in the extracted body get absolute URLs, and
# `class` attributes are reduced to the names listed in
# SiteSetting.allowed_embed_classnames (all classes are removed when that
# setting is blank).
#
# @param url [String] address of the page
# @return [FetchResponse, nil] nil when the destination cannot be resolved or
#   reading it raises OpenURI::HTTPError / Net::OpenTimeout
# @raise [URI::InvalidURIError] when the escaped URL still does not parse
def self.find_remote(url)
  require 'ruby-readability'

  url = UrlHelper.escape_uri(url)
  # Parsed only as a sanity check: raises for a malformed URL before any
  # network work is attempted. The parsed value itself is not needed.
  URI.parse(url)

  fd = FinalDestination.new(
    url,
    validate_uri: true,
    max_redirects: 5,
    follow_canonical: true,
  )
  uri = fd.resolve
  return if uri.blank?

  opts = {
    tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
    attributes: %w[href src class],
    remove_empty_nodes: false
  }
  opts[:whitelist] = SiteSetting.allowed_embed_selectors if SiteSetting.allowed_embed_selectors.present?
  opts[:blacklist] = SiteSetting.blocked_embed_selectors if SiteSetting.blocked_embed_selectors.present?
  allowed_embed_classnames = SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?

  response = FetchResponse.new

  begin
    html = uri.read
  rescue OpenURI::HTTPError, Net::OpenTimeout
    return
  end

  raw_doc = Nokogiri::HTML5(html)
  auth_element = raw_doc.at('meta[@name="author"]')
  if auth_element.present?
    # A <meta name="author"> tag without a content attribute yields nil here.
    author_name = auth_element[:content].to_s.strip
    # NOTE(review): the value is compared to username_lower without being
    # downcased — confirm whether mixed-case author names should match.
    response.author = User.where(username_lower: author_name).first if author_name.present?
  end

  read_doc = Readability::Document.new(html, opts)

  title = +(raw_doc.title || '')
  title.strip!
  if SiteSetting.embed_title_scrubber.present?
    title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), '')
    title.strip!
  end
  response.title = title

  doc = Nokogiri::HTML5(read_doc.content)

  tags = { 'img' => 'src', 'script' => 'src', 'a' => 'href' }
  doc.search(tags.keys.join(',')).each do |node|
    url_param = tags[node.name]
    src = node[url_param]
    next if src.nil? || src.empty?

    begin
      # convert URL to absolute form
      node[url_param] = URI.join(url, UrlHelper.escape_uri(src)).to_s
    rescue URI::Error, Addressable::URI::InvalidURIError
      # If there is a mistyped URL, just do nothing
    end
  end

  # only allow classes in the allowlist
  # This pass runs once over the whole document, independent of the link loop
  # above, so it is also applied to documents without any img/script/a node.
  allowed_classes = allowed_embed_classnames.blank? ? [] : allowed_embed_classnames.split(/[ ,]+/i)
  doc.search('[class]:not([class=""])').each do |classnode|
    classes = classnode[:class].split(' ').select { |classname| allowed_classes.include?(classname) }
    if classes.empty?
      classnode.delete('class')
    else
      classnode[:class] = classes.join(' ')
    end
  end

  response.body = doc.to_html
  response
end
# Fetches `url` with `find_remote` and hands the result to `TopicEmbed.import`.
#
# @param import_user [User] fallback author, used when the page yields none
# @param url [String] page to fetch
# @param opts [Hash, nil] `:title` overrides the fetched title when present
# @return the result of `TopicEmbed.import`, or nil when nothing was fetched
def self.import_remote(import_user, url, opts = nil)
  fetched = find_remote(url)
  return if fetched.nil?

  title_override = (opts || {})[:title]
  fetched.title = title_override if title_override.present?

  page_author = fetched.author
  owner = page_author.present? ? page_author : import_user

  TopicEmbed.import(owner, url, fetched.title, fetched.body)
end
# Convert any relative URLs to absolute. RSS is annoying for this.
#
# Rewrites the `href` of every <a> and the `src` of every <img> in `contents`
# against the scheme/host(/port) of `url`. Malformed attribute values are left
# untouched; if `url` itself cannot be parsed, `contents` is returned as given.
#
# @param url [String] URL of the page the content came from
# @param contents [String] HTML fragment
# @return [String] HTML fragment with absolute link/image URLs
def self.absolutize_urls(url, contents)
  url = normalize_url(url)

  base =
    begin
      URI(UrlHelper.escape_uri(url))
    rescue URI::Error
      return contents
    end

  origin = "#{base.scheme}://#{base.host}"
  # Only spell out the port when it is not one of the two default web ports.
  origin = "#{origin}:#{base.port}" unless [80, 443].include?(base.port)

  # Wrap in a <div> so the processed markup can be read back via inner_html.
  wrapper = Nokogiri::HTML5.fragment("<div>#{contents}</div>")

  { 'a' => 'href', 'img' => 'src' }.each do |selector, attribute|
    wrapper.css(selector).each do |node|
      next unless node[attribute].present?

      begin
        node[attribute] = URI.join(origin, node[attribute]).to_s
      rescue URI::InvalidURIError
        # NOOP, URL is malformed
      end
    end
  end

  wrapper.at('div').inner_html
end
# Looks up the topic id for an embed URL, treating http and https as the same.
#
# The URL is normalised, its scheme is removed, and the remainder is matched
# with a case-insensitive regex that accepts either scheme.
#
# @param embed_url [String]
# @return the `topic_id` of the first matching embed (via `pluck_first`)
def self.topic_id_for_embed(embed_url)
  schemeless = normalize_url(embed_url).sub(/^https?\:\/\//, '')
  pattern = "^https?://#{Regexp.escape(schemeless)}$"

  TopicEmbed.where("embed_url ~* ?", pattern).pluck_first(:topic_id)
end
# Builds a short excerpt from the leading paragraphs of an HTML document.
#
# Non-blank <p> elements are appended in document order until the collected
# markup is at least 100 characters long. When no paragraph has any text, the
# first <div> is returned instead (an empty string if there is none).
#
# @param html [String]
# @return [String] HTML excerpt
def self.first_paragraph_from(html)
  parsed = Nokogiri::HTML5(html)
  excerpt = +""

  parsed.css('p').each do |paragraph|
    next unless paragraph.text.present?

    excerpt << paragraph.to_s
    break if excerpt.size >= 100
  end

  # If there is no first paragraph, return the first div (onebox)
  excerpt.blank? ? parsed.css('div').first.to_s : excerpt
end
# Returns the full remote article HTML (plus the "imported from" footer) for
# the topic that `post` belongs to. The result is cached per topic for ten
# minutes.
#
# NOTE(review): `find_remote` returns nil when the page cannot be fetched, in
# which case calling `body` on it raises here — confirm how callers expect
# that failure to surface.
#
# @param post [Post]
# @return [String] HTML
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"

  Discourse.cache.fetch(cache_key, expires_in: 10.minutes) do
    source_url = TopicEmbed.where(topic_id: post.topic_id).pluck_first(:embed_url)
    remote = TopicEmbed.find_remote(source_url)
    remote.body << TopicEmbed.imported_from_html(source_url)
  end
end
end
# == Schema Information
#
# Table name: topic_embeds
#
# id :integer not null, primary key
# topic_id :integer not null
# post_id :integer not null
# embed_url :string(1000) not null
# content_sha1 :string(40)
# created_at :datetime not null
# updated_at :datetime not null
# deleted_at :datetime
# deleted_by_id :integer
#
# Indexes
#
# index_topic_embeds_on_embed_url (embed_url) UNIQUE
#