From 1a3b9a73520296338d2571bf19b5d92a618d1369 Mon Sep 17 00:00:00 2001 From: Martin Brennan Date: Thu, 31 Oct 2024 13:33:11 +1000 Subject: [PATCH] DEV: Secure upload rake task improvements (#29484) This commit changes the uploads:secure_upload_analyse_and_update and uploads:disable_secure_uploads to no longer rebake affected posts inline. This just took way too long, and if the task stalled you couldn't be sure if the rest of it completed. Instead, we can update the baked_version of affected posts and utilize our PeriodicalUpdates job to gradually rebake them. I added warnings about increasing the site setting rebake_old_posts_count and the global setting max_old_rebakes_per_15_minutes before doing this as well. For good measure, the affected post IDs are written to a JSON file too. --- lib/tasks/uploads.rake | 61 +++++++---- spec/tasks/uploads_spec.rb | 200 +++++++++++++++++++++---------------- 2 files changed, 153 insertions(+), 108 deletions(-) diff --git a/lib/tasks/uploads.rake b/lib/tasks/uploads.rake index 97973557254..79a966b1c54 100644 --- a/lib/tasks/uploads.rake +++ b/lib/tasks/uploads.rake @@ -545,6 +545,10 @@ task "uploads:sync_s3_acls" => :environment do end end +def secure_upload_rebake_warning + puts "This task may mark a lot of posts for rebaking. To get through these quicker, the max_old_rebakes_per_15_minutes global setting (current value #{GlobalSetting.max_old_rebakes_per_15_minutes}) should be changed and the rebake_old_posts_count site setting (current value #{SiteSetting.rebake_old_posts_count}) increased as well. Do you want to proceed? (y/n)" +end + # NOTE: This needs to be updated to use the _first_ UploadReference # record for each upload to determine security, and do not mark things # as secure if the first record is something public e.g. a site setting. @@ -558,6 +562,9 @@ task "uploads:disable_secure_uploads" => :environment do exit 1 end + secure_upload_rebake_warning + exit 1 if STDIN.gets.chomp.downcase != "y" + puts "Disabling secure upload and resetting uploads to not secure in #{db}...", "" SiteSetting.secure_uploads = false @@ -583,8 +590,7 @@ task "uploads:disable_secure_uploads" => :environment do secure_upload_ids, ) adjust_acls(secure_upload_ids) - post_rebake_errors = rebake_upload_posts(post_ids_to_rebake) - log_rebake_errors(post_rebake_errors) + mark_upload_posts_for_rebake(post_ids_to_rebake) puts "", "Rebaking and uploading complete!", "" end @@ -612,6 +618,9 @@ task "uploads:secure_upload_analyse_and_update" => :environment do exit 1 end + secure_upload_rebake_warning + exit 1 if STDIN.gets.chomp.downcase != "y" + puts "Analyzing security for uploads in #{db}...", "" all_upload_ids_changed, post_ids_to_rebake = nil Upload.transaction do @@ -627,11 +636,17 @@ task "uploads:secure_upload_analyse_and_update" => :environment do # Simply mark all uploads linked to posts secure if login_required because # no anons will be able to access them; however if secure_uploads_pm_only is # true then login_required will not mark other uploads secure. + puts "", + "Site is login_required, and secure_uploads_pm_only is false. Continuing with strategy to mark all post uploads as secure.", + "" post_ids_to_rebake, all_upload_ids_changed = mark_all_as_secure_login_required else # Otherwise only mark uploads linked to posts either: # * In secure categories or PMs if !SiteSetting.secure_uploads_pm_only # * In PMs if SiteSetting.secure_uploads_pm_only + puts "", + "Site is not login_required. Continuing with normal strategy to mark uploads in secure contexts as secure.", + "" post_ids_to_rebake, all_upload_ids_changed = update_specific_upload_security_no_login_required end @@ -639,8 +654,14 @@ task "uploads:secure_upload_analyse_and_update" => :environment do # Enqueue rebakes AFTER upload transaction complete, so there is no race condition # between updating the DB and the rebakes occurring. - post_rebake_errors = rebake_upload_posts(post_ids_to_rebake) - log_rebake_errors(post_rebake_errors) + # + # This is done asynchronously by changing the baked_version to NULL on + # affected posts and relying on Post.rebake_old in the PeriodicalUpdates + # job. To speed this up, these levers can be adjusted: + # + # * SiteSetting.rebake_old_posts_count + # * GlobalSetting.max_old_rebakes_per_15_minutes + mark_upload_posts_for_rebake(post_ids_to_rebake) # Also do this AFTER upload transaction complete so we don't end up with any # errors leaving ACLs in a bad state (the ACL sync task can be run to fix any @@ -677,7 +698,7 @@ def mark_all_as_secure_login_required security_last_changed_reason = 'upload security rake task mark as secure', security_last_changed_at = NOW() FROM upl - WHERE uploads.id = upl.upload_id AND NOT uploads.secure + WHERE uploads.id = upl.upload_id RETURNING uploads.id SQL puts "Marked #{post_upload_ids_marked_secure.count} upload(s) as secure because login_required is true.", @@ -687,7 +708,7 @@ def mark_all_as_secure_login_required SET secure = false, security_last_changed_reason = 'upload security rake task mark as not secure', security_last_changed_at = NOW() - WHERE id NOT IN (?) AND uploads.secure + WHERE id NOT IN (?) RETURNING uploads.id SQL puts "Marked #{upload_ids_marked_not_secure.count} upload(s) as not secure because they are not linked to posts.", @@ -797,24 +818,20 @@ def update_uploads_access_control_post SQL end -def rebake_upload_posts(post_ids_to_rebake) +def mark_upload_posts_for_rebake(post_ids_to_rebake) posts_to_rebake = Post.where(id: post_ids_to_rebake) post_rebake_errors = [] - puts "", "Rebaking #{posts_to_rebake.length} posts with affected uploads.", "" - begin - i = 0 - posts_to_rebake.each do |post| - RakeHelpers.print_status_with_label("Rebaking posts.....", i, posts_to_rebake.length) - post.rebake! - i += 1 - end - - RakeHelpers.print_status_with_label("Rebaking complete! ", i, posts_to_rebake.length) - puts "" - rescue => e - post_rebake_errors << e.message - end - post_rebake_errors + puts "", + "Marking #{posts_to_rebake.length} posts with affected uploads for rebake. Every 15 minutes, a batch of these will be enqueued for rebaking.", + "" + posts_to_rebake.update_all(baked_version: nil) + File.write( + "secure_upload_analyse_and_update_posts_for_rebake.json", + MultiJson.dump({ post_ids: post_ids_to_rebake }), + ) + puts "", + "Post IDs written to secure_upload_analyse_and_update_posts_for_rebake.json for reference", + "" end def inline_uploads(post) diff --git a/spec/tasks/uploads_spec.rb b/spec/tasks/uploads_spec.rb index 2b251b22ae6..72325a06699 100644 --- a/spec/tasks/uploads_spec.rb +++ b/spec/tasks/uploads_spec.rb @@ -5,25 +5,26 @@ RSpec.describe "tasks/uploads" do Rake::Task.clear Discourse::Application.load_tasks SiteSetting.authorized_extensions += "|pdf" + STDIN.stubs(:gets).returns("y\n") end describe "uploads:secure_upload_analyse_and_update" do - let!(:uploads) { [multi_post_upload1, upload1, upload2, upload3] } - let(:multi_post_upload1) { Fabricate(:upload_s3) } - let(:upload1) { Fabricate(:upload_s3) } - let(:upload2) { Fabricate(:upload_s3) } - let(:upload3) { Fabricate(:upload_s3, original_filename: "test.pdf", extension: "pdf") } + let!(:uploads) { [multi_post_upload_1, upload_1, upload_2, upload_3] } + let(:multi_post_upload_1) { Fabricate(:upload_s3) } + let(:upload_1) { Fabricate(:upload_s3) } + let(:upload_2) { Fabricate(:upload_s3) } + let(:upload_3) { Fabricate(:upload_s3, original_filename: "test.pdf", extension: "pdf") } - let!(:post1) { Fabricate(:post) } - let!(:post2) { Fabricate(:post) } - let!(:post3) { Fabricate(:post) } + let!(:post_1) { Fabricate(:post) } + let!(:post_2) { Fabricate(:post) } + let!(:post_3) { Fabricate(:post) } before do - UploadReference.create(target: post1, upload: multi_post_upload1) - UploadReference.create(target: post2, upload: multi_post_upload1) - UploadReference.create(target: post2, upload: upload1) - UploadReference.create(target: post3, upload: upload2) - UploadReference.create(target: post3, upload: upload3) + UploadReference.create(target: post_1, upload: multi_post_upload_1) + UploadReference.create(target: post_2, upload: multi_post_upload_1) + UploadReference.create(target: post_2, upload: upload_1) + UploadReference.create(target: post_3, upload: upload_2) + UploadReference.create(target: post_3, upload: upload_3) end def invoke_task @@ -48,105 +49,121 @@ RSpec.describe "tasks/uploads" do it "sets an access_control_post for each post upload, using the first linked post in the case of multiple links" do invoke_task - expect(multi_post_upload1.reload.access_control_post).to eq(post1) - expect(upload1.reload.access_control_post).to eq(post2) - expect(upload2.reload.access_control_post).to eq(post3) - expect(upload3.reload.access_control_post).to eq(post3) + expect(multi_post_upload_1.reload.access_control_post).to eq(post_1) + expect(upload_1.reload.access_control_post).to eq(post_2) + expect(upload_2.reload.access_control_post).to eq(post_3) + expect(upload_3.reload.access_control_post).to eq(post_3) end context "when login_required" do before { SiteSetting.login_required = true } - it "sets everything attached to a post as secure and rebakes all those posts" do - freeze_time - - post1.update_columns(baked_at: 1.week.ago) - post2.update_columns(baked_at: 1.week.ago) - post3.update_columns(baked_at: 1.week.ago) + after do + if File.exist?("secure_upload_analyse_and_update_posts_for_rebake.json") + File.delete("secure_upload_analyse_and_update_posts_for_rebake.json") + end + end + it "sets everything attached to a post as secure" do invoke_task - expect(post1.reload.baked_at).not_to eq_time(1.week.ago) - expect(post2.reload.baked_at).not_to eq_time(1.week.ago) - expect(post3.reload.baked_at).not_to eq_time(1.week.ago) - expect(upload2.reload.secure).to eq(true) - expect(upload1.reload.secure).to eq(true) - expect(upload3.reload.secure).to eq(true) + expect(upload_2.reload.secure).to eq(true) + expect(upload_1.reload.secure).to eq(true) + expect(upload_3.reload.secure).to eq(true) + end + + it "writes a file with the post IDs to rebake" do + invoke_task + + expect(File.exist?("secure_upload_analyse_and_update_posts_for_rebake.json")).to eq( + true, + ) + expect( + JSON.parse(File.read("secure_upload_analyse_and_update_posts_for_rebake.json")), + ).to eq({ "post_ids" => [post_1.id, post_2.id, post_3.id] }) + end + + it "sets the baked_version to NULL for affected posts" do + invoke_task + + expect(post_1.reload.baked_version).to eq(nil) + expect(post_2.reload.baked_version).to eq(nil) + expect(post_3.reload.baked_version).to eq(nil) end context "when secure_uploads_pm_only" do before { SiteSetting.secure_uploads_pm_only = true } it "only sets everything attached to a private message post as secure and rebakes all those posts" do - freeze_time - - post1.update_columns(baked_at: 1.week.ago) - post2.update_columns(baked_at: 1.week.ago) - post3.update_columns(baked_at: 1.week.ago) - post3.topic.update(archetype: "private_message", category: nil) + post_3.topic.update(archetype: "private_message", category: nil) invoke_task - expect(post1.reload.baked_at).to eq_time(1.week.ago) - expect(post2.reload.baked_at).to eq_time(1.week.ago) - expect(post3.reload.baked_at).not_to eq_time(1.week.ago) - expect(upload1.reload.secure).to eq(false) - expect(upload2.reload.secure).to eq(true) - expect(upload3.reload.secure).to eq(true) + expect(post_1.reload.baked_version).not_to eq(nil) + expect(post_2.reload.baked_version).not_to eq(nil) + expect(post_3.reload.baked_version).to eq(nil) + expect(upload_1.reload.secure).to eq(false) + expect(upload_2.reload.secure).to eq(true) + expect(upload_3.reload.secure).to eq(true) end end end + context "when secure_uploads_pm_only" do + before { SiteSetting.secure_uploads_pm_only = true } + + it "sets a non-PM post upload to not secure" do + upload_2.update!(secure: true) + invoke_task + expect(upload_2.reload.secure).to eq(false) + end + end + it "sets the uploads that are media and attachments in the read restricted topic category to secure" do - post3.topic.update(category: Fabricate(:private_category, group: Fabricate(:group))) + post_3.topic.update(category: Fabricate(:private_category, group: Fabricate(:group))) invoke_task - expect(upload2.reload.secure).to eq(true) - expect(upload1.reload.secure).to eq(false) - expect(upload3.reload.secure).to eq(true) + expect(upload_2.reload.secure).to eq(true) + expect(upload_1.reload.secure).to eq(false) + expect(upload_3.reload.secure).to eq(true) end it "sets the upload in the PM topic to secure" do - post3.topic.update(archetype: "private_message", category: nil) + post_3.topic.update(archetype: "private_message", category: nil) invoke_task - expect(upload2.reload.secure).to eq(true) - expect(upload1.reload.secure).to eq(false) + expect(upload_2.reload.secure).to eq(true) + expect(upload_1.reload.secure).to eq(false) end - it "rebakes the posts attached for uploads that change secure status" do - post3.topic.update(category: Fabricate(:private_category, group: Fabricate(:group))) - freeze_time - - post1.update_columns(baked_at: 1.week.ago) - post2.update_columns(baked_at: 1.week.ago) - post3.update_columns(baked_at: 1.week.ago) + it "sets the baked_version version to NULL for the posts attached for uploads that change secure status" do + post_3.topic.update(category: Fabricate(:private_category, group: Fabricate(:group))) invoke_task - expect(post1.reload.baked_at).to eq_time(1.week.ago) - expect(post2.reload.baked_at).to eq_time(1.week.ago) - expect(post3.reload.baked_at).not_to eq_time(1.week.ago) + expect(post_1.reload.baked_version).not_to eq(nil) + expect(post_2.reload.baked_version).not_to eq(nil) + expect(post_3.reload.baked_version).to eq(nil) end context "for an upload that is already secure and does not need to change" do before do - post3.topic.update(archetype: "private_message", category: nil) - upload2.update(access_control_post: post3) - upload2.update_secure_status - upload3.update(access_control_post: post3) - upload3.update_secure_status + post_3.topic.update(archetype: "private_message", category: nil) + upload_2.update(access_control_post: post_3) + upload_2.update_secure_status + upload_3.update(access_control_post: post_3) + upload_3.update_secure_status end it "does not rebake the associated post" do freeze_time - post3.update_columns(baked_at: 1.week.ago) + post_3.update_columns(baked_at: 1.week.ago) invoke_task - expect(post3.reload.baked_at).to eq_time(1.week.ago) + expect(post_3.reload.baked_at).to eq_time(1.week.ago) end it "does not attempt to update the acl" do - FileStore::S3Store.any_instance.expects(:update_upload_ACL).with(upload2).never + FileStore::S3Store.any_instance.expects(:update_upload_ACL).with(upload_2).never invoke_task end end @@ -179,19 +196,25 @@ RSpec.describe "tasks/uploads" do uploads.each { |upload| stub_upload(upload) } SiteSetting.secure_uploads = true - UploadReference.create(target: post1, upload: upload1) - UploadReference.create(target: post1, upload: upload2) - UploadReference.create(target: post2, upload: upload3) - UploadReference.create(target: post2, upload: upload4) + UploadReference.create(target: post_1, upload: upload_1) + UploadReference.create(target: post_1, upload: upload_2) + UploadReference.create(target: post_2, upload: upload_3) + UploadReference.create(target: post_2, upload: upload4) end - let!(:uploads) { [upload1, upload2, upload3, upload4, upload5] } - let(:post1) { Fabricate(:post) } - let(:post2) { Fabricate(:post) } - let(:upload1) { Fabricate(:upload_s3, secure: true, access_control_post: post1) } - let(:upload2) { Fabricate(:upload_s3, secure: true, access_control_post: post1) } - let(:upload3) { Fabricate(:upload_s3, secure: true, access_control_post: post2) } - let(:upload4) { Fabricate(:upload_s3, secure: true, access_control_post: post2) } + after do + if File.exist?("secure_upload_analyse_and_update_posts_for_rebake.json") + File.delete("secure_upload_analyse_and_update_posts_for_rebake.json") + end + end + + let!(:uploads) { [upload_1, upload_2, upload_3, upload4, upload5] } + let(:post_1) { Fabricate(:post) } + let(:post_2) { Fabricate(:post) } + let(:upload_1) { Fabricate(:upload_s3, secure: true, access_control_post: post_1) } + let(:upload_2) { Fabricate(:upload_s3, secure: true, access_control_post: post_1) } + let(:upload_3) { Fabricate(:upload_s3, secure: true, access_control_post: post_2) } + let(:upload4) { Fabricate(:upload_s3, secure: true, access_control_post: post_2) } let(:upload5) { Fabricate(:upload_s3, secure: false) } it "disables the secure upload setting" do @@ -201,24 +224,29 @@ RSpec.describe "tasks/uploads" do it "updates all secure uploads to secure: false" do invoke_task - [upload1, upload2, upload3, upload4].each { |upl| expect(upl.reload.secure).to eq(false) } + [upload_1, upload_2, upload_3, upload4].each { |upl| expect(upl.reload.secure).to eq(false) } end - it "rebakes the associated posts" do - freeze_time - - post1.update_columns(baked_at: 1.week.ago) - post2.update_columns(baked_at: 1.week.ago) + it "sets the baked_version to NULL for affected posts" do invoke_task - expect(post1.reload.baked_at).not_to eq_time(1.week.ago) - expect(post2.reload.baked_at).not_to eq_time(1.week.ago) + expect(post_1.reload.baked_version).to eq(nil) + expect(post_2.reload.baked_version).to eq(nil) + end + + it "writes a file with the post IDs to rebake" do + invoke_task + + expect(File.exist?("secure_upload_analyse_and_update_posts_for_rebake.json")).to eq(true) + expect(JSON.parse(File.read("secure_upload_analyse_and_update_posts_for_rebake.json"))).to eq( + { "post_ids" => [post_1.id, post_2.id] }, + ) end it "updates the affected ACLs via the SyncAclsForUploads job" do invoke_task expect(Jobs::SyncAclsForUploads.jobs.last["args"][0]["upload_ids"]).to match_array( - [upload1.id, upload2.id, upload3.id, upload4.id], + [upload_1.id, upload_2.id, upload_3.id, upload4.id], ) end end