FEATURE: Pull hotlinked images immediately after posting

Previously, with the default `editing_grace_period`, hotlinked images were pulled 5 minutes after a post is created. This delay was added to reduce the chance of automated edits clashing with user edits.

This commit refactors things so that we can pull hotlinked images immediately. URLs are immediately updated in the post's `cooked` HTML. The post's raw markdown is updated later, after the `editing_grace_period`.

This involves a number of behind-the-scenes changes including:

- Schedule Jobs::PullHotlinkedImages immediately after Jobs::ProcessPost. Move scheduling to after the `update_column` call to avoid race conditions

- Move raw changes into a separate job, which is delayed until after the ninja-edit window

- Move disable_if_low_on_disk_space logic into the `pull_hotlinked_images` job

- Move raw-parsing/replacing logic into `InlineUpload` so it can be easily be shared between `UpdateHotlinkedRaw` and `PullUserProfileHotlinkedImages`
This commit is contained in:
David Taylor 2022-05-16 17:56:00 +01:00
parent 0baabafa9d
commit bf6f8299a7
9 changed files with 193 additions and 199 deletions

View File

@ -40,6 +40,8 @@ module Jobs
end
end
enqueue_pull_hotlinked_images(post) unless args[:skip_pull_hotlinked_images]
if !post.user&.staff? && !post.user&.staged?
s = post.raw
s << " #{post.topic.title}" if post.post_number == 1
@ -60,6 +62,11 @@ module Jobs
TopicLink.extract_from(post)
QuotedPost.extract_from(post)
end
def enqueue_pull_hotlinked_images(post)
Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: post.id)
Jobs.enqueue(:pull_hotlinked_images, post_id: post.id)
end
end
end

View File

@ -10,16 +10,13 @@ module Jobs
end
def execute(args)
disable_if_low_on_disk_space
@post_id = args[:post_id]
raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
post = Post.find_by(id: @post_id)
return if post.blank?
return if post.topic.blank?
return if post.cook_method == Post.cook_methods[:raw_html]
raw = post.raw.dup
start_raw = raw.dup
return if post.nil? || post.topic.nil?
hotlinked_map = post.post_hotlinked_media.map { |r| [r.url, r] }.to_h
@ -55,30 +52,23 @@ module Jobs
changed_hotlink_records = true
hotlink_record.save!
end
# have we successfully downloaded that file?
if upload = hotlink_record&.upload
raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
end
rescue => e
raise e if Rails.env.test?
log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end
# If post changed while we were downloading images, never apply edits
post.reload
post_changed_elsewhere = (start_raw != post.raw)
raw_changed_here = (raw != post.raw)
if !post_changed_elsewhere && raw_changed_here
changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
elsif changed_hotlink_records
if changed_hotlink_records
post.trigger_post_process(
bypass_bump: true,
skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
)
end
if hotlinked_map.size > 0
Jobs.cancel_scheduled_job(:update_hotlinked_raw, post_id: post.id)
update_raw_delay = SiteSetting.editing_grace_period + 1
Jobs.enqueue_in(update_raw_delay, :update_hotlinked_raw, post_id: post.id)
end
end
def download(src)
@ -137,45 +127,6 @@ module Jobs
end
end
def replace_in_raw(original_src:, raw:, upload:)
raw = raw.dup
escaped_src = Regexp.escape(original_src)
replace_raw = ->(match, match_src, replacement, _index) {
if normalize_src(original_src) == normalize_src(match_src)
replacement =
if replacement.include?(InlineUploads::PLACEHOLDER)
replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
end
raw = raw.gsub(
match,
replacement
)
end
}
# there are 6 ways to insert an image in a post
# HTML tag - <img src="http://...">
InlineUploads.match_img(raw, external_src: true, &replace_raw)
# BBCode tag - [img]http://...[/img]
InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
# Markdown linked image - [![alt](http://...)](http://...)
# Markdown inline - ![alt](http://...)
# Markdown inline - ![](http://... "image title")
# Markdown inline - ![alt](http://... "image title")
InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
# Direct link
raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
raw
end
def extract_images_from(html)
doc = Nokogiri::HTML5::fragment(html)
@ -239,6 +190,30 @@ module Jobs
def normalize_src(src)
PostHotlinkedMedia.normalize_src(src)
end
def disable_if_low_on_disk_space
return if Discourse.store.external?
return if !SiteSetting.download_remote_images_to_local
return if available_disk_space >= SiteSetting.download_remote_images_threshold
SiteSetting.download_remote_images_to_local = false
# log the site setting change
reason = I18n.t("disable_remote_images_download_reason")
staff_action_logger = StaffActionLogger.new(Discourse.system_user)
staff_action_logger.log_site_setting_change("download_remote_images_to_local", true, false, details: reason)
# also send a private message to the site contact user notify_about_low_disk_space
notify_about_low_disk_space
end
def notify_about_low_disk_space
SystemMessage.create_from_system_user(Discourse.site_contact_user, :download_remote_images_disabled)
end
def available_disk_space
100 - DiskSpace.percent_free("#{Rails.root}/public/uploads")
end
end
end

View File

@ -30,16 +30,16 @@ module Jobs
rescue ImageBrokenError
broken_image_urls << normalized_src
end
# have we successfully downloaded that file?
if upload = downloaded_images[normalized_src]
user_profile.bio_raw = replace_in_raw(original_src: original_src, upload: upload, raw: user_profile.bio_raw)
end
rescue => e
raise e if Rails.env.test?
log(:error, "Failed to pull hotlinked image (#{download_src}) user: #{@user_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end
user_profile.bio_raw = InlineUploads.replace_hotlinked_image_urls(raw: user_profile.bio_raw) do |match_src|
normalized_match_src = PostHotlinkedMedia.normalize_src(match_src)
downloaded_images[normalized_match_src]
end
user_profile.skip_pull_hotlinked_image = true
user_profile.save!
end

View File

@ -0,0 +1,28 @@
# frozen_string_literal: true
module Jobs
class UpdateHotlinkedRaw < ::Jobs::Base
sidekiq_options queue: 'low'
def execute(args)
@post_id = args[:post_id]
raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
post = Post.find_by(id: @post_id)
return if post.nil?
return if post.cook_method == Post.cook_methods[:raw_html]
hotlinked_map = post.post_hotlinked_media.preload(:upload).map { |r| [r.url, r] }.to_h
raw = InlineUploads.replace_hotlinked_image_urls(raw: post.raw) do |match_src|
normalized_match_src = PostHotlinkedMedia.normalize_src(match_src)
hotlinked_map[normalized_match_src]&.upload
end
if post.raw != raw
changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
end
end
end
end

View File

@ -214,6 +214,40 @@ class InlineUploads
end
end
def self.replace_hotlinked_image_urls(raw:, &blk)
replace = Proc.new do |match, match_src, replacement, _index|
upload = blk.call(match_src)
next if !upload
replacement =
if replacement.include?(InlineUploads::PLACEHOLDER)
replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
end
raw = raw.gsub(
match,
replacement
)
end
# there are 6 ways to insert an image in a post
# HTML tag - <img src="http://...">
InlineUploads.match_img(raw, external_src: true, &replace)
# BBCode tag - [img]http://...[/img]
InlineUploads.match_bbcode_img(raw, external_src: true, &replace)
# Markdown linked image - [![alt](http://...)](http://...)
# Markdown inline - ![alt](http://...)
# Markdown inline - ![](http://... "image title")
# Markdown inline - ![alt](http://... "image title")
InlineUploads.match_md_inline_img(raw, external_src: true, &replace)
raw
end
def self.matched_uploads(node)
upload_path = Discourse.store.upload_path
base_url = Discourse.base_url.sub(/https?:\/\//, "(https?://)")

View File

@ -47,7 +47,6 @@ class CookedPostProcessor
remove_user_ids
update_post_image
enforce_nofollow
pull_hotlinked_images
grant_badges
@post.link_post_uploads(fragments: @doc)
DiscourseEvent.trigger(:post_process_cooked, @doc, @post)
@ -361,41 +360,6 @@ class CookedPostProcessor
PrettyText.add_rel_attributes_to_user_content(@doc, add_nofollow)
end
def pull_hotlinked_images
return if @opts[:skip_pull_hotlinked_images]
# have we enough disk space?
disable_if_low_on_disk_space # But still enqueue the job
# make sure no other job is scheduled
Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id)
# schedule the job
delay = SiteSetting.editing_grace_period + 1
Jobs.enqueue_in(delay.seconds.to_i, :pull_hotlinked_images, post_id: @post.id)
end
def disable_if_low_on_disk_space
return if Discourse.store.external?
return if !SiteSetting.download_remote_images_to_local
return if available_disk_space >= SiteSetting.download_remote_images_threshold
SiteSetting.download_remote_images_to_local = false
# log the site setting change
reason = I18n.t("disable_remote_images_download_reason")
staff_action_logger = StaffActionLogger.new(Discourse.system_user)
staff_action_logger.log_site_setting_change("download_remote_images_to_local", true, false, details: reason)
# also send a private message to the site contact user notify_about_low_disk_space
notify_about_low_disk_space
end
def notify_about_low_disk_space
SystemMessage.create_from_system_user(Discourse.site_contact_user, :download_remote_images_disabled)
end
def available_disk_space
100 - DiskSpace.percent_free("#{Rails.root}/public/uploads")
end
private
def post_process_images

View File

@ -88,4 +88,35 @@ describe Jobs::ProcessPost do
expect(post.topic.reload.excerpt).to eq("Some OP content")
end
end
context "#enqueue_pull_hotlinked_images" do
fab!(:post) { Fabricate(:post, created_at: 20.days.ago) }
let(:job) { Jobs::ProcessPost.new }
it "runs even when download_remote_images_to_local is disabled" do
# We want to run it to pull hotlinked optimized images
SiteSetting.download_remote_images_to_local = false
expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }) do
job.execute({ post_id: post.id })
end
end
context "when download_remote_images_to_local? is enabled" do
before do
SiteSetting.download_remote_images_to_local = true
end
it "enqueues" do
expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }) do
job.execute({ post_id: post.id })
end
end
it "does not run when requested to skip" do
job.execute({ post_id: post.id, skip_pull_hotlinked_images: true })
expect(Jobs::PullHotlinkedImages.jobs.size).to eq(0)
end
end
end
end

View File

@ -67,6 +67,21 @@ describe Jobs::PullHotlinkedImages do
expect(post.reload.raw).to eq("<img src=\"#{Upload.last.short_url}\">")
end
it 'enqueues raw replacement job with a delay' do
Jobs.run_later!
post = Fabricate(:post, raw: "<img src='#{image_url}'>")
stub_image_size
freeze_time
Jobs.expects(:cancel_scheduled_job).with(:update_hotlinked_raw, post_id: post.id).once
delay = SiteSetting.editing_grace_period + 1
expect_enqueued_with(job: :update_hotlinked_raw, args: { post_id: post.id }, at: Time.zone.now + delay.seconds) do
Jobs::PullHotlinkedImages.new.execute(post_id: post.id)
end
end
it 'removes downloaded images when they are no longer needed' do
post = Fabricate(:post, raw: "<img src='#{image_url}'>")
stub_image_size
@ -196,14 +211,14 @@ describe Jobs::PullHotlinkedImages do
expect(post.uploads).to contain_exactly(upload)
end
it "skips raw_html posts" do
it "skips editing raw for raw_html posts" do
raw = "<img src=\"#{image_url}\">"
post = Fabricate(:post, raw: raw, cook_method: Post.cook_methods[:raw_html])
stub_image_size
expect do
post.rebake!
post.reload
end.not_to change { Upload.count }
end.to change { Upload.count }.by(1)
expect(post.raw).to eq(raw)
end
@ -556,6 +571,43 @@ describe Jobs::PullHotlinkedImages do
end
end
context "#disable_if_low_on_disk_space" do
fab!(:post) { Fabricate(:post, created_at: 20.days.ago) }
let(:job) { Jobs::PullHotlinkedImages.new }
before do
SiteSetting.download_remote_images_to_local = true
SiteSetting.download_remote_images_threshold = 20
job.stubs(:available_disk_space).returns(50)
end
it "does nothing when there's enough disk space" do
SiteSetting.expects(:download_remote_images_to_local=).never
job.execute({ post_id: post.id })
end
context "when there's not enough disk space" do
before { SiteSetting.download_remote_images_threshold = 75 }
it "disables download_remote_images_threshold and send a notification to the admin" do
StaffActionLogger.any_instance.expects(:log_site_setting_change).once
SystemMessage.expects(:create_from_system_user).with(Discourse.site_contact_user, :download_remote_images_disabled).once
job.execute({ post_id: post.id })
expect(SiteSetting.download_remote_images_to_local).to eq(false)
end
it "doesn't disable download_remote_images_to_local if site uses S3" do
setup_s3
job.execute({ post_id: post.id })
expect(SiteSetting.download_remote_images_to_local).to eq(true)
end
end
end
def stub_s3(upload)
stub_upload(upload)
stub_request(:get, "https:" + upload.url).to_return(status: 200, body: file_from_fixtures("smallest.png"))

View File

@ -21,7 +21,6 @@ describe CookedPostProcessor do
cpp.expects(:post_process_oneboxes).in_sequence(post_process)
cpp.expects(:post_process_images).in_sequence(post_process)
cpp.expects(:optimize_urls).in_sequence(post_process)
cpp.expects(:pull_hotlinked_images).in_sequence(post_process)
cpp.post_process
expect(PostUpload.exists?(post: post, upload: upload)).to eq(true)
@ -1543,102 +1542,6 @@ describe CookedPostProcessor do
end
end
context "#pull_hotlinked_images" do
let(:post) { build(:post, created_at: 20.days.ago) }
let(:cpp) { CookedPostProcessor.new(post) }
before { cpp.stubs(:available_disk_space).returns(90) }
it "runs even when download_remote_images_to_local is disabled" do
# We want to run it to pull hotlinked optimized images
SiteSetting.download_remote_images_to_local = false
expect { cpp.pull_hotlinked_images }.
to change { Jobs::PullHotlinkedImages.jobs.count }.by 1
end
context "when download_remote_images_to_local? is enabled" do
before do
SiteSetting.download_remote_images_to_local = true
end
it "disables download_remote_images if there is not enough disk space" do
cpp.expects(:available_disk_space).returns(5)
cpp.pull_hotlinked_images
expect(SiteSetting.download_remote_images_to_local).to eq(false)
end
it "does not run when requested to skip" do
CookedPostProcessor.new(post, skip_pull_hotlinked_images: true).pull_hotlinked_images
expect(Jobs::PullHotlinkedImages.jobs.size).to eq(0)
end
context "and there is enough disk space" do
before { cpp.expects(:disable_if_low_on_disk_space).at_least_once }
context "and the post has been updated by an actual user" do
before { post.id = 42 }
it "ensures only one job is scheduled right after the editing_grace_period" do
freeze_time
Jobs.expects(:cancel_scheduled_job).with(:pull_hotlinked_images, post_id: post.id).once
delay = SiteSetting.editing_grace_period + 1
expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }, at: Time.zone.now + delay.seconds) do
cpp.pull_hotlinked_images
end
end
end
end
end
end
context "#disable_if_low_on_disk_space" do
let(:post) { build(:post, created_at: 20.days.ago) }
let(:cpp) { CookedPostProcessor.new(post) }
before do
SiteSetting.download_remote_images_to_local = true
SiteSetting.download_remote_images_threshold = 20
cpp.stubs(:available_disk_space).returns(50)
end
it "does nothing when there's enough disk space" do
SiteSetting.expects(:download_remote_images_to_local=).never
cpp.disable_if_low_on_disk_space
end
context "when there's not enough disk space" do
before { SiteSetting.download_remote_images_threshold = 75 }
it "disables download_remote_images_threshold and send a notification to the admin" do
StaffActionLogger.any_instance.expects(:log_site_setting_change).once
SystemMessage.expects(:create_from_system_user).with(Discourse.site_contact_user, :download_remote_images_disabled).once
cpp.disable_if_low_on_disk_space
expect(SiteSetting.download_remote_images_to_local).to eq(false)
end
it "doesn't disable download_remote_images_to_local if site uses S3" do
setup_s3
cpp.disable_if_low_on_disk_space
expect(SiteSetting.download_remote_images_to_local).to eq(true)
end
end
end
context "#is_a_hyperlink?" do
let(:post) { build(:post) }