discourse/app/jobs/regular/pull_hotlinked_images.rb
Martin Brennan 8ebd5edd1e
DEV: Rename secure_media to secure_uploads (#18376)
This commit renames all secure_media related settings to secure_uploads_* along with the associated functionality.

This is being done because "media" does not really cover it: we aren't just doing this for images, videos, etc., but for all uploads on the site.

Additionally, in future we want to secure more types of uploads, and enable a kind of "mixed mode" where some uploads are secure and some are not, so keeping media in the name is just confusing.

This also keeps compatibility with the `secure-media-uploads` path, and changes new
secure URLs to be `secure-uploads`.

Deprecated settings:

* secure_media -> secure_uploads
* secure_media_allow_embed_images_in_emails -> secure_uploads_allow_embed_images_in_emails
* secure_media_max_email_embed_image_size_kb -> secure_uploads_max_email_embed_image_size_kb
2022-09-29 09:24:33 +10:00

221 lines
7.1 KiB
Ruby

# frozen_string_literal: true
module Jobs
class PullHotlinkedImages < ::Jobs::Base
sidekiq_options queue: 'low'
# Captures the maximum accepted image size from the +max_image_size_kb+
# site setting. The value is stored in bytes so it can be compared with
# file sizes and passed to the downloader.
def initialize
  size_limit_kb = SiteSetting.max_image_size_kb
  @max_size = size_limit_kb.kilobytes
end
# Job entry point. Walks the image nodes of a post's cooked HTML, tries to
# download every image that should be pulled locally and stores the outcome
# of each attempt in a PostHotlinkedMedia record keyed by normalized URL.
#
# @param args [Hash] job arguments; +:post_id+ is required
# @raise [Discourse::InvalidParameters] when +:post_id+ is blank
def execute(args)
  disable_if_low_on_disk_space

  @post_id = args[:post_id]
  raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?

  post = Post.find_by(id: @post_id)
  return if post.nil? || post.topic.nil?

  # Records that already exist for this post, keyed by URL, so a URL that
  # has been handled before is not downloaded again.
  records_by_url = post.post_hotlinked_media.map { |record| [record.url, record] }.to_h
  any_record_changed = false

  extract_images_from(post.cooked).each do |node|
    original_src = node['src'] || node[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] || node['href']
    download_src = original_src

    # Protocol-relative URLs need an explicit scheme before downloading.
    if original_src.start_with?("//")
      download_src = "#{SiteSetting.force_https ? "https" : "http"}:#{original_src}"
    end

    normalized_src = normalize_src(download_src)
    next unless should_download_image?(download_src, post)

    record = records_by_url[normalized_src]

    if record.nil?
      record = PostHotlinkedMedia.new(post: post, url: normalized_src)
      records_by_url[normalized_src] = record

      # Each known failure mode maps onto a dedicated status value.
      record.status =
        begin
          record.upload = attempt_download(download_src, post.user_id)
          :downloaded
        rescue ImageTooLargeError
          :too_large
        rescue ImageBrokenError
          :download_failed
        rescue UploadCreateError
          :upload_create_failed
        end
    end

    if record.changed?
      any_record_changed = true
      record.save!
    end
  rescue => e
    # A failure on one image must not abort the remaining ones, except in
    # the test environment where the error is surfaced.
    raise e if Rails.env.test?

    headline = "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n"
    log(:error, headline + e.message + "\n" + e.backtrace.join("\n"))
  end

  if any_record_changed
    post.trigger_post_process(
      bypass_bump: true,
      skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
    )
  end

  unless records_by_url.empty?
    # Replace any pending raw update with one scheduled just after the
    # editing grace period.
    Jobs.cancel_scheduled_job(:update_hotlinked_raw, post_id: post.id)
    Jobs.enqueue_in(SiteSetting.editing_grace_period + 1, :update_hotlinked_raw, post_id: post.id)
  end
end
# Downloads +src+ through FileHelper, making up to three attempts with a
# one second pause between them (no retries in the test environment).
#
# @param src [String] URL to fetch
# @return the value returned by FileHelper.download, or nil when every
#   attempt raised
def download(src)
  attempts_left ||= 3 # survives `retry`, so only initialised on the first pass

  if SiteSetting.verbose_upload_logging
    Rails.logger.warn("Verbose Upload Logging: Downloading hotlinked image from #{src}")
  end

  FileHelper.download(
    src,
    max_file_size: @max_size,
    retain_on_max_file_size_exceeded: true,
    tmp_file_name: "discourse-hotlinked",
    follow_redirect: true,
    read_timeout: 15
  )
rescue => e
  if SiteSetting.verbose_upload_logging
    Rails.logger.warn("Verbose Upload Logging: Error '#{e.message}' while downloading #{src}")
  end

  attempts_left -= 1
  if attempts_left > 0 && !Rails.env.test?
    sleep 1
    retry
  end

  # Download errors are swallowed on purpose; callers treat nil as failure.
  nil
end
# Raised by attempt_download when the downloaded file is larger than the
# configured maximum size.
class ImageTooLargeError < StandardError
end

# Raised by attempt_download when the download produced no file.
class ImageBrokenError < StandardError
end

# Raised by attempt_download when the created upload was not persisted.
class UploadCreateError < StandardError
end
# Downloads +src+ and turns the result into an upload owned by +user_id+.
#
# @param src [String] image URL
# @param user_id the user the upload is created for
# @return the persisted upload
# @raise [ImageBrokenError] when nothing could be downloaded
# @raise [ImageTooLargeError] when the file exceeds the size limit
# @raise [UploadCreateError] when the upload could not be persisted
def attempt_download(src, user_id)
  # secure-uploads endpoint prevents anonymous downloads, so we
  # need the presigned S3 URL here
  if Upload.secure_uploads_url?(src)
    src = Upload.signed_url_from_secure_uploads_url(src)
  end

  tempfile = download(src)
  raise ImageBrokenError unless tempfile
  raise ImageTooLargeError if File.size(tempfile.path) > @max_size

  # Borrow the downloaded file's extension when the URL's basename has none.
  basename = File.basename(URI.parse(src).path)
  filename = basename.include?(".") ? basename : basename + File.extname(tempfile.path)

  upload = UploadCreator.new(tempfile, filename, origin: src).create_for(user_id)
  return upload if upload.persisted?

  log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
  raise UploadCreateError
end
# Collects the nodes of +html+ that may reference a hotlinked image.
#
# Selected: images with a src, any element carrying the blocked-hotlink
# attribute, and lightbox anchors with an href. Avatars and images nested
# inside a lightbox are then removed from that set.
#
# @param html [String] cooked post HTML
# @return the remaining node set
def extract_images_from(html)
  fragment = Nokogiri::HTML5.fragment(html)

  candidates = fragment.css("img[src], [#{PrettyText::BLOCKED_HOTLINKED_SRC_ATTR}], a.lightbox[href]")
  avatars = fragment.css("img.avatar")
  lightboxed_images = fragment.css(".lightbox img[src]")

  candidates - avatars - lightboxed_images
end
# Decides whether the image at +src+ should be downloaded for +post+.
#
# @param src [String, nil] image URL (absolute, protocol-relative or a
#   site-relative path)
# @param post the post being processed, forwarded to the reuse check
# @return [Boolean]
def should_download_image?(src, post = nil)
  # make sure we actually have a url
  return false if src.blank?

  local_prefixes = [
    Discourse.base_url,
    Discourse.asset_host,
    SiteSetting.external_emoji_url.presence
  ].compact.map { |base| normalize_src(base) }

  looks_local =
    Discourse.store.has_been_uploaded?(src) ||
    normalize_src(src).start_with?(*local_prefixes) ||
    src.match?(/\A\/[^\/]/i)

  if looks_local
    # Only upload URLs are candidates; other local assets are left alone.
    upload_url = src.match?(/\/uploads\//) || Upload.secure_uploads_url?(src)
    return false unless upload_url

    # Someone could hotlink a file from a different site on the same CDN,
    # so check whether this database already has a reusable upload for it.
    # With no reusable upload (it does not exist, or the reuse check
    # rejects it for this post) the file is downloaded again rather than
    # sharing an upload that may already be linked to many posts.
    reusable = Upload.consider_for_reuse(Upload.get_from_url(src), post)
    return reusable.blank?
  end

  # Don't download non-local images unless site setting enabled
  return false unless SiteSetting.download_remote_images_to_local?

  parsed =
    begin
      URI.parse(src)
    rescue URI::Error
      nil
    end
  return false if parsed.nil?
  return false unless parsed.hostname

  # check the domains blocklist
  SiteSetting.should_download_images?(src)
end
# Writes +message+ to the Rails logger at +log_level+, prefixed with the
# name of the current multisite database.
#
# @param log_level [Symbol] logger method to call (e.g. +:info+, +:error+)
# @param message [String] text to log
def log(log_level, message)
  logger = Rails.logger
  prefixed = "#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
  logger.public_send(log_level, prefixed)
end
protected
# Normalizes +src+ by delegating to PostHotlinkedMedia.normalize_src, so
# the URLs compared and stored by this job share a single form.
#
# @param src [String] URL to normalize
# @return the normalized URL
def normalize_src(src)
  PostHotlinkedMedia.normalize_src(src)
end
# Turns the +download_remote_images_to_local+ site setting off when the
# value from available_disk_space falls below the configured threshold.
# Does nothing for external stores or when the setting is already off.
# The change is written to the staff action log and the site contact user
# is notified.
def disable_if_low_on_disk_space
  return if Discourse.store.external?
  return unless SiteSetting.download_remote_images_to_local
  return if available_disk_space >= SiteSetting.download_remote_images_threshold

  SiteSetting.download_remote_images_to_local = false

  # log the site setting change
  reason = I18n.t("disable_remote_images_download_reason")
  StaffActionLogger
    .new(Discourse.system_user)
    .log_site_setting_change("download_remote_images_to_local", true, false, details: reason)

  # also send a private message to the site contact user
  notify_about_low_disk_space
end
# Sends the +:download_remote_images_disabled+ system message to the site
# contact user.
def notify_about_low_disk_space
  SystemMessage.create_from_system_user(
    Discourse.site_contact_user,
    :download_remote_images_disabled
  )
end
# Returns 100 minus the figure DiskSpace.percent_free reports for the
# local uploads directory.
#
# NOTE(review): the helper is named "percent_free" yet its result is
# subtracted from 100 to obtain the "available" figure - confirm which
# quantity DiskSpace.percent_free actually reports.
def available_disk_space
  uploads_dir = "#{Rails.root}/public/uploads"
  100 - DiskSpace.percent_free(uploads_dir)
end
end
end