FIX: S3Inventory to ignore files older than last backup restore date (#27166)

This commit updates `S3Inventory#files` to ignore S3 inventory files
whose `last_modified` timestamp is not at least 2 days older than the
`BackupMetadata.last_restore_date` timestamp.

This check previously existed only in `Jobs::EnsureS3UploadsExistence`,
but `S3Inventory` can also be invoked via Rake tasks, so the protection
needs to live in `S3Inventory` itself rather than in the scheduled job.
This commit is contained in:
Alan Guo Xiang Tan 2024-05-24 10:54:06 +08:00 committed by GitHub
parent a4c5f85b10
commit df16ab0758
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 77 additions and 72 deletions

View File

@@ -4,8 +4,6 @@ module Jobs
class EnsureS3UploadsExistence < ::Jobs::Scheduled class EnsureS3UploadsExistence < ::Jobs::Scheduled
every 1.day every 1.day
WAIT_AFTER_RESTORE_HOURS = 48
def perform(*args) def perform(*args)
super super
ensure ensure
@@ -28,7 +26,7 @@ module Jobs
end end
def execute(args) def execute(args)
return if !executable? return if !SiteSetting.enable_s3_inventory
require "s3_inventory" require "s3_inventory"
if !@db_inventories && Rails.configuration.multisite && GlobalSetting.use_s3? if !@db_inventories && Rails.configuration.multisite && GlobalSetting.use_s3?
@@ -48,15 +46,5 @@ module Jobs
S3Inventory.new(s3_helper, :upload).backfill_etags_and_list_missing S3Inventory.new(s3_helper, :upload).backfill_etags_and_list_missing
end end
end end
def executable?
return false if !SiteSetting.enable_s3_inventory
if last_restore_date = BackupMetadata.last_restore_date
return false if last_restore_date > WAIT_AFTER_RESTORE_HOURS.hours.ago
end
true
end
end end
end end

View File

@@ -27,6 +27,7 @@ class S3Helper
def initialize(s3_bucket_name, tombstone_prefix = "", options = {}) def initialize(s3_bucket_name, tombstone_prefix = "", options = {})
@s3_client = options.delete(:client) @s3_client = options.delete(:client)
@s3_bucket = options.delete(:bucket)
@s3_options = default_s3_options.merge(options) @s3_options = default_s3_options.merge(options)
@s3_bucket_name, @s3_bucket_folder_path = @s3_bucket_name, @s3_bucket_folder_path =

View File

@@ -6,11 +6,12 @@ require "csv"
class S3Inventory class S3Inventory
attr_reader :type, :model, :inventory_date attr_reader :type, :model, :inventory_date
CSV_KEY_INDEX ||= 1 CSV_KEY_INDEX = 1
CSV_ETAG_INDEX ||= 2 CSV_ETAG_INDEX = 2
INVENTORY_PREFIX ||= "inventory" INVENTORY_PREFIX = "inventory"
INVENTORY_VERSION ||= "1" INVENTORY_VERSION = "1"
INVENTORY_LAG ||= 2.days INVENTORY_LAG = 2.days
WAIT_AFTER_RESTORE_DAYS = 2
def initialize(s3_helper, type, preloaded_inventory_file: nil, preloaded_inventory_date: nil) def initialize(s3_helper, type, preloaded_inventory_file: nil, preloaded_inventory_date: nil)
@s3_helper = s3_helper @s3_helper = s3_helper
@@ -41,11 +42,13 @@ class S3Inventory
download_and_decompress_files if !@preloaded_inventory_file download_and_decompress_files if !@preloaded_inventory_file
multisite_prefix = Discourse.store.upload_path multisite_prefix = Discourse.store.upload_path
ActiveRecord::Base.transaction do ActiveRecord::Base.transaction do
begin begin
connection.exec( connection.exec(
"CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))", "CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
) )
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
for_each_inventory_row do |row| for_each_inventory_row do |row|
key = row[CSV_KEY_INDEX] key = row[CSV_KEY_INDEX]
@@ -258,17 +261,28 @@ class S3Inventory
def files def files
return if @preloaded_inventory_file return if @preloaded_inventory_file
@files ||= @files ||=
begin begin
symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first symlink_file = unsorted_files.sort_by { |file| -file.last_modified.to_i }.first
return [] if symlink_file.blank? return [] if symlink_file.blank?
if BackupMetadata.last_restore_date.present? &&
(symlink_file.last_modified - WAIT_AFTER_RESTORE_DAYS.days) <
BackupMetadata.last_restore_date
return []
end
@inventory_date = symlink_file.last_modified - INVENTORY_LAG @inventory_date = symlink_file.last_modified - INVENTORY_LAG
log "Downloading symlink file to tmp directory..." log "Downloading symlink file to tmp directory..."
failure_message = "Failed to download symlink file to tmp directory." failure_message = "Failed to download symlink file to tmp directory."
filename = File.join(tmp_directory, File.basename(symlink_file.key)) filename = File.join(tmp_directory, File.basename(symlink_file.key))
@s3_helper.download_file(symlink_file.key, filename, failure_message) @s3_helper.download_file(symlink_file.key, filename, failure_message)
return [] if !File.exist?(filename)
File File
.readlines(filename) .readlines(filename)
.map do |key| .map do |key|

View File

@@ -13,20 +13,6 @@ RSpec.describe Jobs::EnsureS3UploadsExistence do
S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).once S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).once
job.execute({}) job.execute({})
end end
it "doesn't execute when the site was restored within the last 48 hours" do
S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).never
BackupMetadata.update_last_restore_date(47.hours.ago)
job.execute({})
end
it "executes when the site was restored more than 48 hours ago" do
S3Inventory.any_instance.expects(:backfill_etags_and_list_missing).once
BackupMetadata.update_last_restore_date(49.hours.ago)
job.execute({})
end
end end
context "with S3 inventory disabled" do context "with S3 inventory disabled" do

View File

@@ -4,53 +4,17 @@ require "s3_helper"
require "s3_inventory" require "s3_inventory"
require "file_store/s3_store" require "file_store/s3_store"
RSpec.describe "S3Inventory" do RSpec.describe S3Inventory do
let(:client) { Aws::S3::Client.new(stub_responses: true) } let(:client) { Aws::S3::Client.new(stub_responses: true) }
let(:helper) { S3Helper.new(SiteSetting.Upload.s3_upload_bucket.downcase, "", client: client) } let(:resource) { Aws::S3::Resource.new(client: client) }
let(:bucket) { resource.bucket(SiteSetting.Upload.s3_upload_bucket.downcase) }
let(:helper) { S3Helper.new(bucket.name, "", client: client, bucket: bucket) }
let(:inventory) { S3Inventory.new(helper, :upload) } let(:inventory) { S3Inventory.new(helper, :upload) }
let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" } let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" }
before do before do
setup_s3 setup_s3
SiteSetting.enable_s3_inventory = true SiteSetting.enable_s3_inventory = true
client.stub_responses(
:list_objects,
->(context) do
expect(context.params[:prefix]).to eq(
"#{S3Inventory::INVENTORY_PREFIX}/#{S3Inventory::INVENTORY_VERSION}/bucket/original/hive",
)
{
contents: [
{
etag: "\"70ee1738b6b21e2c8a43f3a5ab0eee71\"",
key: "example1.csv.gz",
last_modified: Time.parse("2014-11-21T19:40:05.000Z"),
owner: {
display_name: "myname",
id: "12345example25102679df27bb0ae12b3f85be6f290b936c4393484be31bebcc",
},
size: 11,
storage_class: "STANDARD",
},
{
etag: "\"9c8af9a76df052144598c115ef33e511\"",
key: "example2.csv.gz",
last_modified: Time.parse("2013-11-15T01:10:49.000Z"),
owner: {
display_name: "myname",
id: "12345example25102679df27bb0ae12b3f85be6f290b936c4393484be31bebcc",
},
size: 713_193,
storage_class: "STANDARD",
},
],
next_marker: "eyJNYXJrZXIiOiBudWxsLCAiYm90b190cnVuY2F0ZV9hbW91bnQiOiAyfQ==",
}
end,
)
inventory.stubs(:cleanup!) inventory.stubs(:cleanup!)
end end
@@ -162,6 +126,58 @@ RSpec.describe "S3Inventory" do
expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files) expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files)
end end
context "when site was restored from a backup" do
before do
freeze_time
BackupMetadata.update_last_restore_date(Time.now)
end
it "should run if inventory files are at least #{described_class::WAIT_AFTER_RESTORE_DAYS.days} days older than the last restore date" do
client.stub_responses(
:list_objects_v2,
{
contents: [
{
key: "symlink.txt",
last_modified:
BackupMetadata.last_restore_date + described_class::WAIT_AFTER_RESTORE_DAYS.days,
size: 1,
},
],
},
)
client.expects(:get_object).once
capture_stdout do
inventory = described_class.new(helper, :upload)
inventory.backfill_etags_and_list_missing
end
end
it "should not run if inventory files are not at least #{described_class::WAIT_AFTER_RESTORE_DAYS.days} days older than the last restore date" do
client.stub_responses(
:list_objects_v2,
{
contents: [
{
key: "symlink.txt",
last_modified: BackupMetadata.last_restore_date + 1.day,
size: 1,
},
],
},
)
client.expects(:get_object).never
capture_stdout do
inventory = described_class.new(helper, :upload)
inventory.backfill_etags_and_list_missing
end
end
end
it "should work when passed preloaded data" do it "should work when passed preloaded data" do
freeze_time freeze_time