discourse/app/models/topic_embed.rb
Blake Erickson 5e3106387f
SECURITY: Use canonical url for topic embeddings (#22088)
This prevents duplicate topics from being created when using embed_urls
that only differ on query params.
2023-06-13 11:09:23 -06:00

333 lines
9.1 KiB
Ruby

# frozen_string_literal: true
class TopicEmbed < ActiveRecord::Base
# NOTE(review): presumably Trashable supplies the `with_deleted` scope and the
# `deleted_at` handling that the before_validation callback relies on — confirm in Trashable.
include Trashable
# The topic and post that were created for the embedded page (both ids are set in `import`).
belongs_to :topic
belongs_to :post
# An embed must have a URL, and a URL may belong to only one embed; the schema
# notes at the end of this file also list a unique index on embed_url.
validates_presence_of :embed_url
validates_uniqueness_of :embed_url
# On create only: look (including soft-deleted rows) for a record with the same
# embed_url whose deleted_at is set, and destroy it with `destroy!`.
# NOTE(review): presumably this frees the URL so the uniqueness validation / unique
# index accepts the new row — confirm.
before_validation(on: :create) do
  stale_embed =
    TopicEmbed.with_deleted.where("deleted_at IS NOT NULL AND embed_url = ?", embed_url).first
  stale_embed.destroy! unless stale_embed.nil?
end
# Plain mutable holder for the outcome of fetching and parsing a remote page.
class FetchResponse
  # Page title.
  attr_accessor :title
  # Extracted article HTML.
  attr_accessor :body
  # Author resolved from the page's meta tags, if any.
  attr_accessor :author
  # URL the page was read from.
  attr_accessor :url
end
# Canonicalize an embed URL for storage and lookup. Steps, in this order:
#   1. downcase the whole string
#   2. drop one trailing "/" (only when it is the very last character)
#   3. collapse the FIRST run of hyphens into a single "-" (later runs are kept)
#   4. strip leading and trailing whitespace
#
# Because whitespace is stripped last, a trailing "/" followed by whitespace is
# not removed.
# NOTE(review): steps 3 and 4 look narrower than intended, but the result is
# persisted as embed_url, so changing them would alter lookups of existing rows.
def self.normalize_url(url)
  lowered = url.downcase
  without_trailing_slash = lowered.sub(%r{/\z}, "")
  hyphen_collapsed = without_trailing_slash.sub(/\-+/, "-")
  hyphen_collapsed.strip
end
# Build the HTML footer appended to imported content: a rule followed by a
# <small> line holding the "embed.imported_from" translation (rendered in the
# site's default locale) with a link back to the source URL.
# NOTE(review): the encoded URL is interpolated into a single-quoted href as
# returned by UrlHelper.normalized_encode — confirm that output is attribute-safe.
def self.imported_from_html(url)
  encoded = UrlHelper.normalized_encode(url)
  source_link = "<a href='#{encoded}'>#{encoded}</a>"
  I18n.with_locale(SiteSetting.default_locale) do
    "\n<hr>\n<small>#{I18n.t("embed.imported_from", link: source_link)}</small>\n"
  end
end
# Import an article from a source (RSS/Atom/Other).
#
# When no embed exists for the URL, creates a topic + post and the TopicEmbed
# record inside one transaction. Otherwise reuses the embed's post: reassigns
# ownership when the author differs and revises the post when the content hash
# or the title changed.
#
# @param user author of the created/revised post
# @param url [String] absolute http(s) URL of the article; anything else returns nil
# @param title [String, nil] topic title
# @param contents [String, nil] article HTML (nil is treated as "")
# @param category_id category for a new topic; falls back to the EmbeddableHost's category_id
# @param cook_method cook method for a new post; when nil it is chosen from
#   SiteSetting.embed_support_markdown (and truncation via embed_truncate applies)
# @param tags tags for a new topic, used only when SiteSetting.tagging_enabled
# @return the created or existing post (may be nil), or nil when url is rejected
def self.import(user, url, title, contents, category_id: nil, cook_method: nil, tags: nil)
  # \A rather than ^: ^ also matches after an embedded newline, which would let
  # a value such as "javascript:...\nhttp://x" pass the scheme check.
  return unless %r{\Ahttps?://}.match?(url)

  contents = first_paragraph_from(contents) if SiteSetting.embed_truncate && cook_method.nil?
  contents ||= ""
  # dup so the caller's string is not mutated by <<
  contents = contents.dup << imported_from_html(url)

  url = normalize_url(url)
  embed = topic_embed_by_url(url)
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  # If there is no embed, create a topic, post and the embed.
  if embed.blank?
    Topic.transaction do
      eh = EmbeddableHost.record_for_url(url)

      cook_method ||=
        if SiteSetting.embed_support_markdown
          Post.cook_methods[:regular]
        else
          Post.cook_methods[:raw_html]
        end

      create_args = {
        title: title,
        raw: absolutize_urls(url, contents),
        skip_validations: true,
        cook_method: cook_method,
        category: category_id || eh.try(:category_id),
        tags: SiteSetting.tagging_enabled ? tags : nil,
      }
      create_args[:visible] = false if SiteSetting.embed_unlisted?

      post = PostCreator.new(user, create_args).create
      if post.present?
        TopicEmbed.create!(
          topic_id: post.topic_id,
          embed_url: url,
          content_sha1: content_sha1,
          post_id: post.id,
        )
      end
    end
  else
    post = embed.post

    # Update the topic if it changed
    if post&.topic
      if post.user != user
        PostOwnerChanger.new(
          post_ids: [post.id],
          topic_id: post.topic_id,
          new_owner: user,
          acting_user: Discourse.system_user,
        ).change_owner!
        # make sure the post returned has the right author
        post.reload
      end

      if content_sha1 != embed.content_sha1 || (title && title != post.topic&.title)
        # URLs are absolutized only here, where the result is actually used.
        changes = { raw: absolutize_urls(url, contents) }
        changes[:title] = title if title.present?
        post.revise(user, changes, skip_validations: true, bypass_rate_limiter: true)
        embed.update!(content_sha1: content_sha1)
      end
    end
  end

  post
end
# Fetch a remote page and parse it into a FetchResponse.
#
# The URL is encoded, resolved through FinalDestination (validate_uri, at most
# 5 redirects, follow_canonical enabled) and the resolved location is read.
#
# @return [FetchResponse, nil] nil when resolution yields nothing or reading
#   raises OpenURI::HTTPError / Net::OpenTimeout
def self.find_remote(url)
  require "ruby-readability"

  encoded = UrlHelper.normalized_encode(url)
  # NOTE(review): the parsed value is never used; the call is kept because it
  # still raises for a URL that URI cannot parse — confirm whether that is wanted.
  URI.parse(encoded)

  resolved =
    FinalDestination.new(
      encoded,
      validate_uri: true,
      max_redirects: 5,
      follow_canonical: true,
    ).resolve
  return if resolved.blank?

  page =
    begin
      resolved.read
    rescue OpenURI::HTTPError, Net::OpenTimeout
      return
    end

  parse_html(page, resolved.to_s)
end
# Extract title, author and a cleaned article body from a fetched HTML page.
#
# The body is produced by Readability (restricted to a fixed tag/attribute list
# plus the allowed/blocked embed selector site settings), then post-processed:
# img/script/a URLs are made absolute against `url`, and class attributes are
# reduced to the names listed in SiteSetting.allowed_embed_classnames.
#
# @param html [String] raw page markup
# @param url [String] URL the page came from; base for absolutizing links
# @return [FetchResponse] url, title, author (a User record or nil) and body HTML
def self.parse_html(html, url)
  require "ruby-readability"

  opts = {
    tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
    attributes: %w[href src class],
    remove_empty_nodes: false,
  }
  if SiteSetting.allowed_embed_selectors.present?
    opts[:whitelist] = SiteSetting.allowed_embed_selectors
  end
  if SiteSetting.blocked_embed_selectors.present?
    opts[:blacklist] = SiteSetting.blocked_embed_selectors
  end
  allowed_embed_classnames =
    SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?

  response = FetchResponse.new
  raw_doc = Nokogiri.HTML5(html)
  response.url = url

  auth_element =
    raw_doc.at('meta[@name="discourse-username"]') || raw_doc.at('meta[@name="author"]')
  if auth_element.present?
    # to_s: a <meta> tag without a content attribute yields nil, and nil.strip would raise.
    response.author = User.where(username_lower: auth_element[:content].to_s.strip).first
  end

  read_doc = Readability::Document.new(html, opts)

  title = +(raw_doc.title || "")
  title.strip!
  if SiteSetting.embed_title_scrubber.present?
    title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), "")
    title.strip!
  end
  response.title = title

  doc = Nokogiri.HTML5(read_doc.content)

  # Convert img/script/a URLs to absolute form.
  tags = { "img" => "src", "script" => "src", "a" => "href" }
  doc
    .search(tags.keys.join(","))
    .each do |node|
      url_param = tags[node.name]
      src = node[url_param]
      next if src.nil? || src.empty?

      begin
        node[url_param] = URI.join(url, UrlHelper.normalized_encode(src)).to_s
      rescue URI::Error, Addressable::URI::InvalidURIError
        # If there is a mistyped URL, just do nothing
      end
    end

  # Only allow classes in the allowlist. This runs exactly once and independently
  # of the link loop above, so it also applies to documents without any
  # img/script/a elements.
  allowed_classes =
    if allowed_embed_classnames.blank?
      []
    else
      allowed_embed_classnames.split(/[ ,]+/)
    end
  doc
    .search('[class]:not([class=""])')
    .each do |classnode|
      kept = classnode[:class].split(" ").select { |classname| allowed_classes.include?(classname) }
      if kept.empty?
        classnode.delete("class")
      else
        classnode[:class] = kept.join(" ")
      end
    end

  response.body = doc.to_html
  response
end
# Fetch a remote page and import it as an embed.
#
# @param url [String] page to fetch
# @param opts [Hash, nil] optional :title (overrides the fetched title) and
#   :user (used as author only when the page does not name one)
# @return the result of TopicEmbed.import, or nil when the page could not be fetched
def self.import_remote(url, opts = nil)
  opts ||= {}

  fetched = find_remote(url)
  return if fetched.nil?

  fetched.title = opts[:title] if opts[:title].present?

  # An author found on the page takes precedence over the caller-supplied user.
  import_user =
    if fetched.author.present?
      fetched.author
    elsif opts[:user].present?
      opts[:user]
    end

  url = normalize_url(fetched.url) if fetched.url.present?

  TopicEmbed.import(import_user, url, fetched.title, fetched.body)
end
# Rewrite relative <a href> and <img src> values in `contents` as absolute URLs.
# RSS feeds frequently carry relative links.
#
# The base is only scheme://host (plus the port unless it is 80 or 443) taken
# from the normalized `url`; its path is not used.
#
# @param url [String] URL of the source article
# @param contents [String] HTML fragment
# @return [String] the rewritten fragment, or `contents` untouched when the URL
#   cannot be parsed
def self.absolutize_urls(url, contents)
  normalized = normalize_url(url)

  begin
    base = URI(UrlHelper.normalized_encode(normalized))
  rescue URI::Error
    return contents
  end

  origin = "#{base.scheme}://#{base.host}"
  origin += ":#{base.port}" unless [80, 443].include?(base.port)

  # Wrapped in a <div> so the processed markup can be read back via inner_html.
  fragment = Nokogiri::HTML5.fragment("<div>#{contents}</div>")

  # Anchors are processed first, then images.
  { "a" => "href", "img" => "src" }.each do |tag, attribute|
    fragment
      .css(tag)
      .each do |node|
        next if node[attribute].blank?

        begin
          node[attribute] = URI.join(origin, node[attribute]).to_s
        rescue URI::InvalidURIError
          # NOOP, URL is malformed
        end
      end
  end

  fragment.at("div").inner_html
end
# Find the embed for a URL, ignoring whether it is http or https.
#
# The URL is normalized, its scheme removed, and the remainder regex-escaped
# and matched against embed_url with the `~*` operator, anchored at both ends.
#
# @return [TopicEmbed, nil]
def self.topic_embed_by_url(embed_url)
  schemeless = normalize_url(embed_url).sub(%r{\Ahttps?\://}, "")
  pattern = "^https?://#{Regexp.escape(schemeless)}$"
  TopicEmbed.where("embed_url ~* ?", pattern).first
end
# Topic id of the embed matching `embed_url`, or nil when there is none.
def self.topic_id_for_embed(embed_url)
  topic_embed_by_url(embed_url)&.topic_id
end
# Produce a short excerpt of `html`.
#
# Appends the markup of each <p> that has text until at least 100 characters
# have been collected (or the paragraphs run out). When no paragraph has any
# text, the first <div> is returned instead (onebox case); with no <div>
# either, the result is an empty string.
#
# @return [String]
def self.first_paragraph_from(html)
  doc = Nokogiri.HTML5(html)
  excerpt = +""

  doc
    .css("p")
    .each do |paragraph|
      next if paragraph.text.blank?

      excerpt << paragraph.to_s
      break if excerpt.size >= 100
    end

  return excerpt if excerpt.present?

  # If there is no first paragraph, return the first div (onebox)
  doc.css("div").first.to_s
end
# Full remote article body for the topic of `post`, with the "imported from"
# footer appended. The result is stored in Discourse.cache under
# "embed-topic:<topic_id>" with a 10 minute expiry.
# NOTE(review): find_remote can return nil, in which case `.body` raises here —
# confirm what callers expect when the remote page is unavailable.
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"

  Discourse
    .cache
    .fetch(cache_key, expires_in: 10.minutes) do
      embed_url = TopicEmbed.where(topic_id: post.topic_id).pluck_first(:embed_url)
      remote = TopicEmbed.find_remote(embed_url)
      expanded = remote.body
      expanded << TopicEmbed.imported_from_html(embed_url)
      expanded
    end
end
end
# == Schema Information
#
# Table name: topic_embeds
#
# id :integer not null, primary key
# topic_id :integer not null
# post_id :integer not null
# embed_url :string(1000) not null
# content_sha1 :string(40)
# created_at :datetime not null
# updated_at :datetime not null
# deleted_at :datetime
# deleted_by_id :integer
#
# Indexes
#
# index_topic_embeds_on_embed_url (embed_url) UNIQUE
#