
raw_html posts (i.e. those pulled in as part of our comments integration) don't go through our markdown pipeline, so `upload://` URLs are not supported in them. Running pull_hotlinked_images therefore breaks any images in the post. In the future we may add support for pulling hotlinked images in these posts, but for now, disabling the job for them stops it from breaking images.
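To illustrate why (a minimal sketch with invented values; `upload://abc123.png` is a hypothetical short URL, not one produced by this change): the job rewrites hotlinked URLs in a post's raw into `upload://` short URLs, and only the markdown cook step knows how to expand those back into real URLs.

  # Sketch only - hypothetical values.
  hotlinked = '<img src="https://example.com/kitten.png">'  # original hotlinked image
  rewritten = '<img src="upload://abc123.png">'             # what the job would write back into raw

  # A markdown-cooked post expands upload://abc123.png into a local /uploads/... URL.
  # A raw_html post is served verbatim, so the browser would request the upload://
  # URL literally and the image breaks - hence the early return on
  # post.cook_method == Post.cook_methods[:raw_html] in execute below.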
# frozen_string_literal: true

module Jobs

  class PullHotlinkedImages < ::Jobs::Base
    sidekiq_options queue: 'low'

    def initialize
      @max_size = SiteSetting.max_image_size_kb.kilobytes
    end

    def execute(args)
      @post_id = args[:post_id]
      raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?

      post = Post.find_by(id: @post_id)
      return if post.blank?
      return if post.topic.blank?
      return if post.cook_method == Post.cook_methods[:raw_html]

      raw = post.raw.dup
      start_raw = raw.dup

      large_image_urls = post.custom_fields[Post::LARGE_IMAGES] || []
      broken_image_urls = post.custom_fields[Post::BROKEN_IMAGES] || []
      downloaded_image_ids = post.custom_fields[Post::DOWNLOADED_IMAGES] || {}

      upload_records = Upload.where(id: downloaded_image_ids.values)
      upload_records = Hash[upload_records.map { |u| [u.id, u] }]

      downloaded_images = {}
      downloaded_image_ids.each { |url, id| downloaded_images[url] = upload_records[id] }

      extract_images_from(post.cooked).each do |node|
        download_src = original_src = node['src'] || node['href']
        download_src = "#{SiteSetting.force_https ? "https" : "http"}:#{original_src}" if original_src.start_with?("//")
        normalized_src = normalize_src(download_src)

        next if !should_download_image?(download_src, post)

        begin
          already_attempted_download = downloaded_images.include?(normalized_src) || large_image_urls.include?(normalized_src) || broken_image_urls.include?(normalized_src)
          if !already_attempted_download
            downloaded_images[normalized_src] = attempt_download(download_src, post.user_id)
          end
        rescue ImageTooLargeError
          large_image_urls << normalized_src
        rescue ImageBrokenError
          broken_image_urls << normalized_src
        end

        # have we successfully downloaded that file?
        if upload = downloaded_images[normalized_src]
          raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
        end
      rescue => e
        raise e if Rails.env.test?
        log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
      end

      large_image_urls.uniq!
      broken_image_urls.uniq!
      downloaded_images.compact!

      post.custom_fields[Post::LARGE_IMAGES] = large_image_urls
      post.custom_fields[Post::BROKEN_IMAGES] = broken_image_urls

      downloaded_image_ids = {}
      downloaded_images.each { |url, upload| downloaded_image_ids[url] = upload.id }
      post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_image_ids

      [Post::LARGE_IMAGES, Post::BROKEN_IMAGES, Post::DOWNLOADED_IMAGES].each do |key|
        post.custom_fields.delete(key) if !post.custom_fields[key].present?
      end

      custom_fields_updated = !post.custom_fields_clean?

      # only save custom fields if they changed
      post.save_custom_fields if custom_fields_updated

      # If post changed while we were downloading images, never apply edits
      post.reload
      post_changed_elsewhere = (start_raw != post.raw)
      raw_changed_here = (raw != post.raw)

      if !post_changed_elsewhere && raw_changed_here
        changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
        post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
      elsif custom_fields_updated
        post.trigger_post_process(
          bypass_bump: true,
          skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
        )
      end
    end

    def download(src)
      downloaded = nil

      begin
        retries ||= 3

        if SiteSetting.verbose_upload_logging
          Rails.logger.warn("Verbose Upload Logging: Downloading hotlinked image from #{src}")
        end

        downloaded = FileHelper.download(
          src,
          max_file_size: @max_size,
          retain_on_max_file_size_exceeded: true,
          tmp_file_name: "discourse-hotlinked",
          follow_redirect: true
        )
      rescue => e
        if SiteSetting.verbose_upload_logging
          Rails.logger.warn("Verbose Upload Logging: Error '#{e.message}' while downloading #{src}")
        end

        if (retries -= 1) > 0 && !Rails.env.test?
          sleep 1
          retry
        end
      end

      downloaded
    end

    class ImageTooLargeError < StandardError; end
    class ImageBrokenError < StandardError; end

    def attempt_download(src, user_id)
      # secure-media-uploads endpoint prevents anonymous downloads, so we
      # need the presigned S3 URL here
      src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)

      hotlinked = download(src)
      raise ImageBrokenError if !hotlinked
      raise ImageTooLargeError if File.size(hotlinked.path) > @max_size

      filename = File.basename(URI.parse(src).path)
      filename << File.extname(hotlinked.path) unless filename["."]
      upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)

      if upload.persisted?
        upload
      else
        log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
        nil
      end
    end

    def replace_in_raw(original_src:, raw:, upload:)
      raw = raw.dup
      escaped_src = Regexp.escape(original_src)

      replace_raw = ->(match, match_src, replacement, _index) {
        if normalize_src(original_src) == normalize_src(match_src)
          replacement =
            if replacement.include?(InlineUploads::PLACEHOLDER)
              replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
            elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
              replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
            end

          raw = raw.gsub(
            match,
            replacement
          )
        end
      }

      # there are 6 ways to insert an image in a post
      # HTML tag - <img src="http://...">
      InlineUploads.match_img(raw, external_src: true, &replace_raw)

      # BBCode tag - [img]http://...[/img]
      InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)

      # Markdown linked image - [![alt](http://...)](http://...)
      # Markdown inline - ![alt](http://...)
      # Markdown inline - ![](http://... "image title")
      # Markdown inline - ![alt](http://... "image title")
      InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)

      # Direct link
      raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }

      raw
    end

    def extract_images_from(html)
      doc = Nokogiri::HTML5::fragment(html)

      doc.css("img[src], a.lightbox[href]") -
        doc.css("img.avatar") -
        doc.css(".lightbox img[src]")
    end

    def should_download_image?(src, post = nil)
      # make sure we actually have a url
      return false unless src.present?

      local_bases = [
        Discourse.base_url,
        Discourse.asset_host,
        SiteSetting.external_emoji_url.presence
      ].compact.map { |s| normalize_src(s) }

      if Discourse.store.has_been_uploaded?(src) || normalize_src(src).start_with?(*local_bases) || src =~ /\A\/[^\/]/i
        return false if !(src =~ /\/uploads\// || Upload.secure_media_url?(src))

        # Someone could hotlink a file from a different site on the same CDN,
        # so check whether we have it in this database.
        #
        # If the upload already exists and is attached to a different post,
        # or the original_sha1 is missing (meaning it was created before secure
        # media was enabled), then we definitely want to redownload, otherwise
        # we end up reusing existing uploads which may already be linked to
        # many posts.
        upload = Upload.consider_for_reuse(Upload.get_from_url(src), post)

        return !upload.present?
      end

      # Don't download non-local images unless site setting enabled
      return false unless SiteSetting.download_remote_images_to_local?

      # parse the src
      begin
        uri = URI.parse(src)
      rescue URI::Error
        return false
      end

      hostname = uri.hostname
      return false unless hostname

      # check the domains blocklist
      SiteSetting.should_download_images?(src)
    end

    def log(log_level, message)
      Rails.logger.public_send(
        log_level,
        "#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
      )
    end

    protected

    def normalize_src(src)
      uri = Addressable::URI.heuristic_parse(src)
      uri.normalize!
      uri.scheme = nil
      uri.to_s
    rescue URI::Error, Addressable::URI::InvalidURIError
      src
    end
  end

end