discourse/app/jobs/regular/pull_hotlinked_images.rb
Sam 9e241e82e9
DEV: use HTML5 version of loofah (#21522)
https://meta.discourse.org/t/markdown-preview-and-result-differ/263878

The result of this markdown had different results in the composer preview and the post. This is solved by updating Loofah to the latest version and using html5 fragments like our user had reported. While the change was only needed in cooked_post_processor.rb for this fix, other areas also had to be updated due to various side effects.
2023-06-20 09:49:22 +08:00

243 lines
7.4 KiB
Ruby

# frozen_string_literal: true
module Jobs
class PullHotlinkedImages < ::Jobs::Base
sidekiq_options queue: "low"
def initialize
@max_size = SiteSetting.max_image_size_kb.kilobytes
end
def execute(args)
disable_if_low_on_disk_space
@post_id = args[:post_id]
raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
post = Post.find_by(id: @post_id)
return if post.nil? || post.topic.nil?
hotlinked_map = post.post_hotlinked_media.map { |r| [r.url, r] }.to_h
changed_hotlink_records = false
extract_images_from(post.cooked).each do |node|
download_src =
original_src = node["src"] || node[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] || node["href"]
download_src = replace_encoded_src(download_src)
download_src =
"#{SiteSetting.force_https ? "https" : "http"}:#{original_src}" if original_src.start_with?(
"//",
)
normalized_src = normalize_src(download_src)
next if !should_download_image?(download_src, post)
hotlink_record = hotlinked_map[normalized_src]
if hotlink_record.nil?
hotlinked_map[normalized_src] = hotlink_record =
PostHotlinkedMedia.new(post: post, url: normalized_src)
begin
hotlink_record.upload = attempt_download(download_src, post.user_id)
hotlink_record.status = :downloaded
rescue ImageTooLargeError
hotlink_record.status = :too_large
rescue ImageBrokenError
hotlink_record.status = :download_failed
rescue UploadCreateError
hotlink_record.status = :upload_create_failed
end
end
if hotlink_record.changed?
changed_hotlink_records = true
hotlink_record.save!
end
rescue => e
raise e if Rails.env.test?
log(
:error,
"Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message +
"\n" + e.backtrace.join("\n"),
)
end
if changed_hotlink_records
post.trigger_post_process(
bypass_bump: true,
skip_pull_hotlinked_images: true, # Avoid an infinite loop of job scheduling
)
end
if hotlinked_map.size > 0
Jobs.cancel_scheduled_job(:update_hotlinked_raw, post_id: post.id)
update_raw_delay = SiteSetting.editing_grace_period + 1
Jobs.enqueue_in(update_raw_delay, :update_hotlinked_raw, post_id: post.id)
end
end
def download(src)
downloaded = nil
begin
retries ||= 3
if SiteSetting.verbose_upload_logging
Rails.logger.warn("Verbose Upload Logging: Downloading hotlinked image from #{src}")
end
downloaded =
FileHelper.download(
src,
max_file_size: @max_size,
retain_on_max_file_size_exceeded: true,
tmp_file_name: "discourse-hotlinked",
follow_redirect: true,
read_timeout: 15,
)
rescue => e
if SiteSetting.verbose_upload_logging
Rails.logger.warn("Verbose Upload Logging: Error '#{e.message}' while downloading #{src}")
end
if (retries -= 1) > 0 && !Rails.env.test?
sleep 1
retry
end
end
downloaded
end
class ImageTooLargeError < StandardError
end
class ImageBrokenError < StandardError
end
class UploadCreateError < StandardError
end
def attempt_download(src, user_id)
# secure-uploads endpoint prevents anonymous downloads, so we
# need the presigned S3 URL here
src = Upload.signed_url_from_secure_uploads_url(src) if Upload.secure_uploads_url?(src)
hotlinked = download(src)
raise ImageBrokenError if !hotlinked
raise ImageTooLargeError if File.size(hotlinked.path) > @max_size
filename = File.basename(URI.parse(src).path)
filename << File.extname(hotlinked.path) unless filename["."]
upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)
if upload.persisted?
upload
else
log(
:info,
"Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}",
)
raise UploadCreateError
end
end
def extract_images_from(html)
doc = Nokogiri::HTML5.fragment(html)
doc.css("img[src], [#{PrettyText::BLOCKED_HOTLINKED_SRC_ATTR}], a.lightbox[href]") -
doc.css("img.avatar") - doc.css(".lightbox img[src]")
end
def should_download_image?(src, post = nil)
# make sure we actually have a url
return false unless src.present?
local_bases =
[Discourse.base_url, Discourse.asset_host, SiteSetting.external_emoji_url.presence].compact
.map { |s| normalize_src(s) }
if Discourse.store.has_been_uploaded?(src) || normalize_src(src).start_with?(*local_bases) ||
src =~ %r{\A/[^/]}i
return false if !(src =~ %r{/uploads/} || Upload.secure_uploads_url?(src))
# Someone could hotlink a file from a different site on the same CDN,
# so check whether we have it in this database
#
# if the upload already exists and is attached to a different post,
# or the original_sha1 is missing meaning it was created before secure
# media was enabled, then we definitely want to redownload again otherwise
# we end up reusing existing uploads which may be linked to many posts
# already.
upload = Upload.consider_for_reuse(Upload.get_from_url(src), post)
return !upload.present?
end
# Don't download non-local images unless site setting enabled
return false unless SiteSetting.download_remote_images_to_local?
# parse the src
begin
uri = URI.parse(src)
rescue URI::Error
return false
end
hostname = uri.hostname
return false unless hostname
# check the domains blocklist
SiteSetting.should_download_images?(src)
end
def log(log_level, message)
Rails.logger.public_send(
log_level,
"#{RailsMultisite::ConnectionManagement.current_db}: #{message}",
)
end
protected
def replace_encoded_src(src)
PostHotlinkedMedia.normalize_src(src, reset_scheme: false)
end
def normalize_src(src)
PostHotlinkedMedia.normalize_src(src)
end
def disable_if_low_on_disk_space
return if Discourse.store.external?
return if !SiteSetting.download_remote_images_to_local
return if available_disk_space >= SiteSetting.download_remote_images_threshold
SiteSetting.download_remote_images_to_local = false
# log the site setting change
reason = I18n.t("disable_remote_images_download_reason")
staff_action_logger = StaffActionLogger.new(Discourse.system_user)
staff_action_logger.log_site_setting_change(
"download_remote_images_to_local",
true,
false,
details: reason,
)
# also send a private message to the site contact user notify_about_low_disk_space
notify_about_low_disk_space
end
def notify_about_low_disk_space
SystemMessage.create_from_system_user(
Discourse.site_contact_user,
:download_remote_images_disabled,
)
end
def available_disk_space
100 - DiskSpace.percent_free("#{Rails.root}/public/uploads")
end
end
end