discourse/lib/oneboxer.rb
Martin Brennan 560e8aff75
FEATURE: Allow oneboxing private GitHub URLs (#27705)
This commit adds the ability to onebox private GitHub
commits, pull requests, issues, blobs, and actions using
a new `github_onebox_access_token` site setting. The token
must be set up in correctly to have access to the repos needed.

To do this successfully with the Oneboxer, we need to skip
redirects on the github.com host, otherwise we get a 404
on the URL before it is translated into a GitHub API URL
and has the appropriate headers added.
2024-07-10 09:39:31 +10:00

709 lines
20 KiB
Ruby

# frozen_string_literal: true
require "uri"
Dir["#{Rails.root}/lib/onebox/engine/*_onebox.rb"].sort.each { |f| require f }
module Oneboxer
ONEBOX_CSS_CLASS = "onebox"
AUDIO_REGEX = /\A\.(mp3|og[ga]|opus|wav|m4[abpr]|aac|flac)\z/i
VIDEO_REGEX = /\A\.(mov|mp4|webm|m4v|3gp|ogv|avi|mpeg|ogv)\z/i
# keep reloaders happy
unless defined?(Oneboxer::Result)
Result =
Struct.new(:doc, :changed) do
def to_html
doc.to_html
end
def changed?
changed
end
end
end
def self.ignore_redirects
@ignore_redirects ||= [
"http://www.dropbox.com",
"http://store.steampowered.com",
"http://vimeo.com",
"https://www.youtube.com",
"https://twitter.com",
Discourse.base_url,
]
end
def self.amazon_domains
amazon_suffixes = %w[
com
com.br
ca
cn
fr
de
in
it
co.jp
com.mx
nl
pl
sa
sg
es
se
com.tr
ae
co.uk
]
amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" }
end
def self.force_get_hosts
hosts = []
hosts += SiteSetting.force_get_hosts.split("|").collect { |domain| "https://#{domain}" }
hosts +=
SiteSetting
.cache_onebox_response_body_domains
.split("|")
.collect { |domain| "https://www.#{domain}" }
hosts += amazon_domains
hosts.uniq
end
def self.force_custom_user_agent_hosts
SiteSetting.force_custom_user_agent_hosts.split("|")
end
def self.allowed_post_types
@allowed_post_types ||= [Post.types[:regular], Post.types[:moderator_action]]
end
def self.local_handlers
@local_handlers ||= {}
end
def self.register_local_handler(controller, &handler)
local_handlers[controller] = handler
end
def self.preview(url, options = nil)
options ||= {}
invalidate(url) if options[:invalidate_oneboxes]
onebox_raw(url, options)[:preview]
end
def self.onebox(url, options = nil)
options ||= {}
invalidate(url) if options[:invalidate_oneboxes]
onebox_raw(url, options)[:onebox]
end
def self.cached_onebox(url)
if c = Discourse.cache.read(onebox_cache_key(url))
c[:onebox]
end
rescue => e
invalidate(url)
Rails.logger.warn("invalid cached onebox for #{url} #{e}")
""
end
def self.cached_preview(url)
if c = Discourse.cache.read(onebox_cache_key(url))
c[:preview]
end
rescue => e
invalidate(url)
Rails.logger.warn("invalid cached preview for #{url} #{e}")
""
end
def self.invalidate(url)
Discourse.cache.delete(onebox_cache_key(url))
Discourse.cache.delete(onebox_failed_cache_key(url))
end
def self.cache_response_body?(uri)
uri = URI.parse(uri) if uri.is_a?(String)
if SiteSetting.cache_onebox_response_body?
SiteSetting
.cache_onebox_response_body_domains
.split("|")
.any? { |domain| uri.hostname.ends_with?(domain) }
end
end
def self.cache_response_body(uri, response)
key = redis_cached_response_body_key(uri)
Discourse.redis.without_namespace.setex(key, 1.minutes.to_i, response)
end
def self.cached_response_body_exists?(uri)
key = redis_cached_response_body_key(uri)
Discourse.redis.without_namespace.exists(key).to_i > 0
end
def self.fetch_cached_response_body(uri)
key = redis_cached_response_body_key(uri)
Discourse.redis.without_namespace.get(key)
end
def self.redis_cached_response_body_key(uri)
"CACHED_RESPONSE_#{uri}"
end
# Parse URLs out of HTML, returning the document when finished.
def self.each_onebox_link(doc, extra_paths: [])
onebox_links = doc.css("a.#{ONEBOX_CSS_CLASS}", *extra_paths)
if onebox_links.present?
onebox_links.each { |link| yield(link["href"], link) if link["href"].present? }
end
doc
end
HTML5_BLOCK_ELEMENTS ||= %w[
address
article
aside
blockquote
canvas
center
dd
div
dl
dt
fieldset
figcaption
figure
footer
form
h1
h2
h3
h4
h5
h6
header
hgroup
hr
li
main
nav
noscript
ol
output
p
pre
section
table
tfoot
ul
video
]
def self.apply(string_or_doc, extra_paths: nil)
doc = string_or_doc
doc = Loofah.html5_fragment(doc) if doc.is_a?(String)
changed = false
each_onebox_link(doc, extra_paths: extra_paths) do |url, element|
onebox, _ = yield(url, element)
next if onebox.blank?
parsed_onebox = Loofah.html5_fragment(onebox)
next if parsed_onebox.children.blank?
changed = true
parent = element.parent
if parent&.node_name&.downcase == "p" &&
parsed_onebox.children.any? { |child|
HTML5_BLOCK_ELEMENTS.include?(child.node_name.downcase)
}
siblings = parent.children
element_idx = siblings.find_index(element)
before_idx = first_significant_element_index(siblings, element_idx - 1, -1)
after_idx = first_significant_element_index(siblings, element_idx + 1, +1)
if before_idx < 0 && after_idx >= siblings.size
parent.replace parsed_onebox
elsif before_idx < 0
parent.children = siblings[after_idx..siblings.size]
parent.add_previous_sibling(parsed_onebox)
elsif after_idx >= siblings.size
parent.children = siblings[0..before_idx]
parent.add_next_sibling(parsed_onebox)
else
parent_rest = parent.dup
parent.children = siblings[0..before_idx]
parent_rest.children = siblings[after_idx..siblings.size]
parent.add_next_sibling(parent_rest)
parent.add_next_sibling(parsed_onebox)
end
else
element.replace parsed_onebox
end
end
Result.new(doc, changed)
end
def self.first_significant_element_index(elements, index, step)
while index >= 0 && index < elements.size &&
(
elements[index].node_name.downcase == "br" ||
(
elements[index].node_name.downcase == "text" &&
elements[index].to_html.strip.blank?
)
)
index = index + step
end
index
end
def self.is_previewing?(user_id)
Discourse.redis.get(preview_key(user_id)) == "1"
end
def self.preview_onebox!(user_id)
Discourse.redis.setex(preview_key(user_id), 1.minute, "1")
end
def self.onebox_previewed!(user_id)
Discourse.redis.del(preview_key(user_id))
end
def self.engine(url)
Onebox::Matcher.new(
url,
{ allowed_iframe_regexes: Onebox::Engine.origins_to_regexes(allowed_iframe_origins) },
).oneboxed
end
def self.recently_failed?(url)
Discourse.cache.read(onebox_failed_cache_key(url)).present?
end
def self.cache_failed!(url)
Discourse.cache.write(onebox_failed_cache_key(url), true, expires_in: 1.hour)
end
private
def self.preview_key(user_id)
"onebox:preview:#{user_id}"
end
def self.blank_onebox
{ preview: "", onebox: "" }
end
def self.onebox_cache_key(url)
"onebox__#{url}"
end
def self.onebox_failed_cache_key(url)
"onebox_failed__#{url}"
end
def self.onebox_raw(url, opts = {})
url = UrlHelper.normalized_encode(url).to_s
local_onebox(url, opts) || external_onebox(url)
rescue => e
# no point warning here, just cause we have an issue oneboxing a url
# we can later hunt for failed oneboxes by searching logs if needed
Rails.logger.info("Failed to onebox #{url} #{e} #{e.backtrace}")
# return a blank hash, so rest of the code works
blank_onebox
end
def self.local_onebox(url, opts = {})
return unless route = Discourse.route_for(url)
html =
case route[:controller]
when "uploads"
local_upload_html(url)
when "topics"
local_topic_html(url, route, opts)
when "users"
local_user_html(url, route)
when "list"
local_category_html(url, route)
else
if handler = local_handlers[route[:controller]]
handler.call(url, route)
end
end
normalized_url = ::Onebox::Helpers.normalize_url_for_output(URI(url).to_s)
html = html.presence || "<a href='#{normalized_url}'>#{normalized_url}</a>"
{ onebox: html, preview: html }
end
def self.local_upload_html(url)
additional_controls =
if SiteSetting.disable_onebox_media_download_controls
"controlslist='nodownload'"
else
""
end
normalized_url = ::Onebox::Helpers.normalize_url_for_output(url)
case File.extname(URI(url).path || "")
when VIDEO_REGEX
<<~HTML
<div class="onebox video-onebox">
<video #{additional_controls} width="100%" height="100%" controls="">
<source src='#{normalized_url}'>
<a href='#{normalized_url}'>
#{normalized_url}
</a>
</video>
</div>
HTML
when AUDIO_REGEX
<<~HTML
<audio #{additional_controls} controls>
<source src='#{normalized_url}'>
<a href='#{normalized_url}'>
#{normalized_url}
</a>
</audio>
HTML
end
end
def self.local_topic(url, route, opts)
if current_user = User.find_by(id: opts[:user_id])
if current_category = Category.find_by(id: opts[:category_id])
return unless Guardian.new(current_user).can_see_category?(current_category)
end
if current_topic = Topic.find_by(id: opts[:topic_id])
return unless Guardian.new(current_user).can_see_topic?(current_topic)
end
end
return unless topic = Topic.find_by(id: route[:id] || route[:topic_id])
return if topic.private_message?
if current_category.blank? || current_category.id != topic.category_id
return unless Guardian.new.can_see_topic?(topic)
end
topic
end
def self.local_topic_html(url, route, opts)
return unless topic = local_topic(url, route, opts)
post_number = route[:post_number].to_i
post =
(
if post_number > 1
topic.posts.where(post_number: post_number).first
else
topic.ordered_posts.first
end
)
return if !post || post.hidden || !allowed_post_types.include?(post.post_type)
if post_number > 1 && opts[:topic_id] == topic.id
excerpt = post.excerpt(SiteSetting.post_onebox_maxlength, keep_svg: true)
excerpt.gsub!(/[\r\n]+/, " ")
excerpt.gsub!("[/quote]", "[quote]") # don't break my quote
quote =
"[quote=\"#{post.user.username}, topic:#{topic.id}, post:#{post.post_number}\"]\n#{excerpt}\n[/quote]"
PrettyText.cook(quote)
else
args = {
topic_id: topic.id,
post_number: post.post_number,
avatar: PrettyText.avatar_img(post.user.avatar_template_url, "tiny"),
original_url: url,
title: PrettyText.unescape_emoji(CGI.escapeHTML(topic.title)),
category_html: CategoryBadge.html_for(topic.category),
quote:
PrettyText.unescape_emoji(
post.excerpt(SiteSetting.post_onebox_maxlength, keep_svg: true),
),
}
template = template("discourse_topic_onebox")
Mustache.render(template, args)
end
end
def self.local_user_html(url, route)
username = route[:username] || ""
if user = User.find_by(username_lower: username.downcase)
name = user.name if SiteSetting.enable_names
args = {
user_id: user.id,
username: user.username,
avatar: PrettyText.avatar_img(user.avatar_template, "huge"),
name: name,
bio: user.user_profile.bio_excerpt(230),
location: Onebox::Helpers.sanitize(user.user_profile.location),
joined:
I18n.t(
"onebox.discourse.user_joined_community",
date: user.created_at.strftime(I18n.t("datetime_formats.formats.date_only")),
),
website: user.user_profile.website,
website_name: UserSerializer.new(user).website_name,
original_url: url,
}
Mustache.render(template("discourse_user_onebox"), args)
else
nil
end
end
def self.local_category_html(url, route)
return unless route[:category_slug_path_with_id]
category = Category.find_by_slug_path_with_id(route[:category_slug_path_with_id])
if Guardian.new.can_see_category?(category)
args = {
url: category.url,
name: category.name,
color: category.color,
logo_url: category.uploaded_logo&.url,
description: Onebox::Helpers.sanitize(category.description),
has_subcategories: category.subcategories.present?,
subcategories:
category.subcategories.collect { |sc| { name: sc.name, color: sc.color, url: sc.url } },
}
Mustache.render(template("discourse_category_onebox"), args)
end
end
def self.preserve_fragment_url_hosts
@preserve_fragment_url_hosts ||= ["http://github.com"]
end
def self.allowed_iframe_origins
allowed = SiteSetting.allowed_onebox_iframes.split("|")
allowed = Onebox::Engine.all_iframe_origins if allowed.include?("*")
allowed += SiteSetting.allowed_iframes.split("|")
end
def self.external_onebox(url, available_strategies = nil)
Discourse
.cache
.fetch(onebox_cache_key(url), expires_in: 1.day) do
uri = URI(url)
available_strategies ||= Oneboxer.ordered_strategies(uri.hostname)
strategy = available_strategies.shift
max_redirects = 0 if SiteSetting.block_onebox_on_redirect
fd =
FinalDestination.new(
url,
get_final_destination_options(url, strategy).merge(
stop_at_blocked_pages: true,
max_redirects: max_redirects,
initial_https_redirect_ignore_limit: SiteSetting.block_onebox_on_redirect,
),
)
uri = fd.resolve
return blank_onebox if fd.status == :blocked_page
if fd.status != :resolved
args = { link: url }
if fd.status == :invalid_address
args[:error_message] = I18n.t("errors.onebox.invalid_address", hostname: fd.hostname)
elsif (fd.status_code || uri.nil?) && available_strategies.present?
# Try a different oneboxing strategy, if we have any options left:
return external_onebox(url, available_strategies)
elsif fd.status_code
args[:error_message] = I18n.t(
"errors.onebox.error_response",
status_code: fd.status_code,
)
end
error_box = blank_onebox
error_box[:preview] = preview_error_onebox(args)
return error_box
end
return blank_onebox if uri.blank?
onebox_options = {
max_width: 695,
sanitize_config: Onebox::SanitizeConfig::DISCOURSE_ONEBOX,
allowed_iframe_origins: allowed_iframe_origins,
hostname: GlobalSetting.hostname,
facebook_app_access_token: SiteSetting.facebook_app_access_token,
disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls,
body_cacher: self,
content_type: fd.content_type,
}
onebox_options[:cookie] = fd.cookie if fd.cookie
user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(
url,
) && SiteSetting.cache_onebox_user_agent.present?
onebox_options[:user_agent] = user_agent_override if user_agent_override
preview_result = Onebox.preview(uri.to_s, onebox_options)
result = {
onebox: WordWatcher.censor(preview_result.to_s),
preview: WordWatcher.censor(preview_result.placeholder_html.to_s),
}
# NOTE: Call preview_result.errors after calling placeholder_html
if preview_result.errors.any?
error_keys = preview_result.errors.keys
skip_if_only_error = [:image]
unless error_keys.length == 1 && skip_if_only_error.include?(error_keys.first)
missing_attributes = error_keys.map(&:to_s).sort.join(I18n.t("word_connector.comma"))
error_message =
I18n.t(
"errors.onebox.missing_data",
missing_attributes: missing_attributes,
count: error_keys.size,
)
args = preview_result.verified_data.merge(error_message: error_message)
if result[:preview].blank?
result[:preview] = preview_error_onebox(args)
else
doc = Nokogiri::HTML5.fragment(result[:preview])
aside = doc.at("aside")
if aside
# Add an error message to the preview that was returned
error_fragment = preview_error_onebox_fragment(args)
aside.add_child(error_fragment)
result[:preview] = doc.to_html
end
end
end
end
Oneboxer.cache_preferred_strategy(uri.hostname, strategy)
result
end
end
def self.preview_error_onebox(args, is_fragment = false)
args[:title] ||= args[:link] if args[:link]
args[:error_message] = PrettyText.unescape_emoji(args[:error_message]) if args[:error_message]
template_name = is_fragment ? "preview_error_fragment_onebox" : "preview_error_onebox"
Mustache.render(template(template_name), args)
end
def self.preview_error_onebox_fragment(args)
preview_error_onebox(args, true)
end
def self.template(template_name)
@template_cache ||= {}
@template_cache[template_name] ||= begin
full_path = "#{Rails.root}/lib/onebox/templates/#{template_name}.mustache"
File.read(full_path)
end
end
def self.ordered_strategies(hostname)
all = strategies.keys
preferred = Oneboxer.preferred_strategy(hostname)
all.insert(0, all.delete(preferred)) if all.include?(preferred)
all
end
def self.strategies
{
default: {
}, # don't override anything by default
force_get_and_ua: {
force_get_host: true,
force_custom_user_agent_host: true,
},
}
end
def self.cache_preferred_strategy(hostname, strategy)
return if strategy == :default
key = redis_oneboxer_strategy_key(hostname)
Discourse.redis.without_namespace.setex(key, 2.weeks.to_i, strategy.to_s)
end
def self.clear_preferred_strategy!(hostname)
key = redis_oneboxer_strategy_key(hostname)
Discourse.redis.without_namespace.del(key)
end
def self.preferred_strategy(hostname)
key = redis_oneboxer_strategy_key(hostname)
Discourse.redis.without_namespace.get(key)&.to_sym
end
def self.redis_oneboxer_strategy_key(hostname)
"ONEBOXER_STRATEGY_#{hostname}"
end
def self.get_final_destination_options(url, strategy = nil)
fd_options = {
ignore_redirects: ignore_redirects,
force_get_hosts: force_get_hosts,
force_custom_user_agent_hosts: force_custom_user_agent_hosts,
preserve_fragment_url_hosts: preserve_fragment_url_hosts,
timeout: 5,
}
uri = URI(url)
# For private GitHub repos, we get a 404 when trying to use
# FinalDestination to request the final URL because no auth headers
# are sent. In this case we can ignore redirects and go straight to
# using Onebox.preview
if SiteSetting.github_onebox_access_token.present? && uri.hostname == "github.com"
fd_options[:ignore_redirects] << "https://github.com"
end
strategy = Oneboxer.ordered_strategies(uri.hostname).shift if strategy.blank?
if strategy && Oneboxer.strategies[strategy][:force_get_host]
fd_options[:force_get_hosts] = ["https://#{uri.hostname}"]
end
if strategy && Oneboxer.strategies[strategy][:force_custom_user_agent_host]
fd_options[:force_custom_user_agent_hosts] = ["https://#{uri.hostname}"]
end
user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(
url,
) && SiteSetting.cache_onebox_user_agent.present?
fd_options[:default_user_agent] = user_agent_override if user_agent_override
fd_options
end
end