From df16ab07588da8502c7ab43372eee434899732c7 Mon Sep 17 00:00:00 2001
From: Alan Guo Xiang Tan
Date: Fri, 24 May 2024 10:54:06 +0800
Subject: [PATCH] FIX: `S3Inventory` to ignore files older than last backup restore date (#27166)

This commit updates `S3Inventory#files` to ignore S3 inventory files
whose `last_modified` timestamp is not at least 2 days newer than the
`BackupMetadata.last_restore_date` timestamp.

This check was previously only in `Jobs::EnsureS3UploadsExistence`, but
`S3Inventory` can also be used via Rake tasks, so this protection needs
to live in `S3Inventory` rather than in the scheduled job.
---
 .../scheduled/ensure_s3_uploads_existence.rb  | 14 +--
 lib/s3_helper.rb                              |  1 +
 lib/s3_inventory.rb                           | 24 ++++-
 spec/jobs/ensure_s3_uploads_existence_spec.rb | 14 ---
 spec/lib/s3_inventory_spec.rb                 | 96 +++++++++++--------
 5 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/app/jobs/scheduled/ensure_s3_uploads_existence.rb b/app/jobs/scheduled/ensure_s3_uploads_existence.rb
index cf71f4253a3..29338784433 100644
--- a/app/jobs/scheduled/ensure_s3_uploads_existence.rb
+++ b/app/jobs/scheduled/ensure_s3_uploads_existence.rb
@@ -4,8 +4,6 @@ module Jobs
   class EnsureS3UploadsExistence < ::Jobs::Scheduled
     every 1.day
 
-    WAIT_AFTER_RESTORE_HOURS = 48
-
     def perform(*args)
       super
     ensure
@@ -28,7 +26,7 @@ module Jobs
     end
 
     def execute(args)
-      return if !executable?
+      return if !SiteSetting.enable_s3_inventory
       require "s3_inventory"
 
       if !@db_inventories && Rails.configuration.multisite && GlobalSetting.use_s3?
@@ -48,15 +46,5 @@ module Jobs
         S3Inventory.new(s3_helper, :upload).backfill_etags_and_list_missing
       end
     end
-
-    def executable?
-      return false if !SiteSetting.enable_s3_inventory
-
-      if last_restore_date = BackupMetadata.last_restore_date
-        return false if last_restore_date > WAIT_AFTER_RESTORE_HOURS.hours.ago
-      end
-
-      true
-    end
   end
 end
diff --git a/lib/s3_helper.rb b/lib/s3_helper.rb
index 1d50ce26bdb..e8d5a05e9e0 100644
--- a/lib/s3_helper.rb
+++ b/lib/s3_helper.rb
@@ -27,6 +27,7 @@ class S3Helper
 
   def initialize(s3_bucket_name, tombstone_prefix = "", options = {})
     @s3_client = options.delete(:client)
+    @s3_bucket = options.delete(:bucket)
     @s3_options = default_s3_options.merge(options)
 
     @s3_bucket_name, @s3_bucket_folder_path =
diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 3b9f4391e77..973dd75e02f 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -6,11 +6,12 @@ require "csv"
 class S3Inventory
   attr_reader :type, :model, :inventory_date
 
-  CSV_KEY_INDEX ||= 1
-  CSV_ETAG_INDEX ||= 2
-  INVENTORY_PREFIX ||= "inventory"
-  INVENTORY_VERSION ||= "1"
-  INVENTORY_LAG ||= 2.days
+  CSV_KEY_INDEX = 1
+  CSV_ETAG_INDEX = 2
+  INVENTORY_PREFIX = "inventory"
+  INVENTORY_VERSION = "1"
+  INVENTORY_LAG = 2.days
+  WAIT_AFTER_RESTORE_DAYS = 2
 
   def initialize(s3_helper, type, preloaded_inventory_file: nil, preloaded_inventory_date: nil)
     @s3_helper = s3_helper
@@ -41,11 +42,13 @@ class S3Inventory
         download_and_decompress_files if !@preloaded_inventory_file
 
         multisite_prefix = Discourse.store.upload_path
+
         ActiveRecord::Base.transaction do
           begin
             connection.exec(
               "CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
             )
+
            connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
              for_each_inventory_row do |row|
                key = row[CSV_KEY_INDEX]
@@ -258,17 +261,28 @@ class S3Inventory
 
   def files
     return if @preloaded_inventory_file
+
     @files ||=
       begin
         symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
+        return [] if symlink_file.blank?
+
+        if BackupMetadata.last_restore_date.present? &&
+             (symlink_file.last_modified - WAIT_AFTER_RESTORE_DAYS.days) <
+               BackupMetadata.last_restore_date
+          return []
+        end
+
         @inventory_date = symlink_file.last_modified - INVENTORY_LAG
         log "Downloading symlink file to tmp directory..."
         failure_message = "Failed to download symlink file to tmp directory."
         filename = File.join(tmp_directory, File.basename(symlink_file.key))
         @s3_helper.download_file(symlink_file.key, filename, failure_message)
+
+        return [] if !File.exist?(filename)
+
         File
           .readlines(filename)
           .map do |key|
diff --git a/spec/jobs/ensure_s3_uploads_existence_spec.rb b/spec/jobs/ensure_s3_uploads_existence_spec.rb
index 846f58b5a5b..e7c8d267030 100644
--- a/spec/jobs/ensure_s3_uploads_existence_spec.rb
+++ b/spec/jobs/ensure_s3_uploads_existence_spec.rb
@@ -13,20 +13,6 @@ RSpec.describe Jobs::EnsureS3UploadsExistence do
       S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).once
       job.execute({})
     end
-
-    it "doesn't execute when the site was restored within the last 48 hours" do
-      S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).never
-      BackupMetadata.update_last_restore_date(47.hours.ago)
-
-      job.execute({})
-    end
-
-    it "executes when the site was restored more than 48 hours ago" do
-      S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).once
-      BackupMetadata.update_last_restore_date(49.hours.ago)
-
-      job.execute({})
-    end
   end
 
   context "with S3 inventory disabled" do
diff --git a/spec/lib/s3_inventory_spec.rb b/spec/lib/s3_inventory_spec.rb
index 1aaf094de18..7fca5bd2ede 100644
--- a/spec/lib/s3_inventory_spec.rb
+++ b/spec/lib/s3_inventory_spec.rb
@@ -4,53 +4,17 @@ require "s3_helper"
 require "s3_inventory"
 require "file_store/s3_store"
 
-RSpec.describe "S3Inventory" do
+RSpec.describe S3Inventory do
   let(:client) { Aws::S3::Client.new(stub_responses: true) }
-  let(:helper) { S3Helper.new(SiteSetting.Upload.s3_upload_bucket.downcase, "", client: client) }
+  let(:resource) { Aws::S3::Resource.new(client: client) }
+  let(:bucket) { resource.bucket(SiteSetting.Upload.s3_upload_bucket.downcase) }
+  let(:helper) { S3Helper.new(bucket.name, "", client: client, bucket: bucket) }
   let(:inventory) { S3Inventory.new(helper, :upload) }
   let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" }
 
   before do
     setup_s3
     SiteSetting.enable_s3_inventory = true
-
-    client.stub_responses(
-      :list_objects,
-      ->(context) do
-        expect(context.params[:prefix]).to eq(
-          "#{S3Inventory::INVENTORY_PREFIX}/#{S3Inventory::INVENTORY_VERSION}/bucket/original/hive",
-        )
-
-        {
-          contents: [
-            {
-              etag: "\"70ee1738b6b21e2c8a43f3a5ab0eee71\"",
-              key: "example1.csv.gz",
-              last_modified: Time.parse("2014-11-21T19:40:05.000Z"),
-              owner: {
-                display_name: "myname",
-                id: "12345example25102679df27bb0ae12b3f85be6f290b936c4393484be31bebcc",
-              },
-              size: 11,
-              storage_class: "STANDARD",
-            },
-            {
-              etag: "\"9c8af9a76df052144598c115ef33e511\"",
-              key: "example2.csv.gz",
-              last_modified: Time.parse("2013-11-15T01:10:49.000Z"),
-              owner: {
-                display_name: "myname",
-                id: "12345example25102679df27bb0ae12b3f85be6f290b936c4393484be31bebcc",
-              },
-              size: 713_193,
-              storage_class: "STANDARD",
-            },
-          ],
-          next_marker: "eyJNYXJrZXIiOiBudWxsLCAiYm90b190cnVuY2F0ZV9hbW91bnQiOiAyfQ==",
-        }
-      end,
-    )
-
     inventory.stubs(:cleanup!)
   end
 
@@ -162,6 +126,58 @@ RSpec.describe "S3Inventory" do
     expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files)
   end
 
+  context "when site was restored from a backup" do
+    before do
+      freeze_time
+      BackupMetadata.update_last_restore_date(Time.now)
+    end
+
+    it "should run if inventory files are at least #{described_class::WAIT_AFTER_RESTORE_DAYS} days newer than the last restore date" do
+      client.stub_responses(
+        :list_objects_v2,
+        {
+          contents: [
+            {
+              key: "symlink.txt",
+              last_modified:
+                BackupMetadata.last_restore_date + described_class::WAIT_AFTER_RESTORE_DAYS.days,
+              size: 1,
+            },
+          ],
+        },
+      )
+
+      client.expects(:get_object).once
+
+      capture_stdout do
+        inventory = described_class.new(helper, :upload)
+        inventory.backfill_etags_and_list_missing
+      end
+    end
+
+    it "should not run if inventory files are not at least #{described_class::WAIT_AFTER_RESTORE_DAYS} days newer than the last restore date" do
+      client.stub_responses(
+        :list_objects_v2,
+        {
+          contents: [
+            {
+              key: "symlink.txt",
+              last_modified: BackupMetadata.last_restore_date + 1.day,
+              size: 1,
+            },
+          ],
+        },
+      )
+
+      client.expects(:get_object).never
+
+      capture_stdout do
+        inventory = described_class.new(helper, :upload)
+        inventory.backfill_etags_and_list_missing
+      end
+    end
+  end
+
   it "should work when passed preloaded data" do
     freeze_time
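
Note (not part of the patch): below is a minimal, plain-Ruby sketch of the cutoff check the patch adds to `S3Inventory#files`, useful for sanity-checking the date arithmetic. The helper name and timestamps are made up for illustration; the real code compares `symlink_file.last_modified` against `BackupMetadata.last_restore_date` using ActiveSupport durations.

# Hypothetical standalone sketch of the guard; not Discourse code.
WAIT_AFTER_RESTORE_DAYS = 2 # mirrors S3Inventory::WAIT_AFTER_RESTORE_DAYS
SECONDS_PER_DAY = 24 * 60 * 60

# Returns true when the inventory symlink file was generated at least
# WAIT_AFTER_RESTORE_DAYS days after the last backup restore, or when the
# site has never been restored; otherwise the inventory should be ignored.
def inventory_usable_after_restore?(symlink_last_modified, last_restore_date)
  return true if last_restore_date.nil?

  (symlink_last_modified - WAIT_AFTER_RESTORE_DAYS * SECONDS_PER_DAY) >= last_restore_date
end

restore = Time.utc(2024, 5, 20)
puts inventory_usable_after_restore?(restore + 2 * SECONDS_PER_DAY, restore) # => true
puts inventory_usable_after_restore?(restore + 1 * SECONDS_PER_DAY, restore) # => false
puts inventory_usable_after_restore?(restore - 1 * SECONDS_PER_DAY, nil)     # => true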