discourse/spec/lib/s3_inventory_spec.rb
Alan Guo Xiang Tan 8cf4ed5f88
DEV: Introduce hidden s3_inventory_bucket site setting (#27304)
This commit introduces a hidden `s3_inventory_bucket` site setting which
replaces the `enable_s3_inventory` and `s3_configure_inventory_policy`
site setting.

The reason `enable_s3_inventory` and `s3_configure_inventory_policy`
site settings are removed is because this feature has technically been
broken since it was introduced. When the `enable_s3_inventory` feature
is turned on, the app will because configure a daily inventory policy for the
`s3_upload_bucket` bucket and store the inventories under a prefix in
the bucket. The problem here is that once the inventories are created,
there is nothing cleaning up all these inventories so whoever that has
enabled this feature would have been paying the cost of storing a whole
bunch of inventory files which are never used. Given that we have not
received any complains about inventory files inflating S3 storage costs,
we think that it is very likely that this feature is no longer being
used and we are looking to drop support for this feature in the not too
distance future.

For now, we will still support a hidden `s3_inventory_bucket` site
setting which site administrators can configure via the
`DISCOURSE_S3_INVENTORY_BUCKET` env.
2024-06-10 13:16:00 +08:00

208 lines
6.9 KiB
Ruby

# frozen_string_literal: true
RSpec.describe S3Inventory do
let(:inventory) do
S3Inventory.new(:upload, s3_inventory_bucket: "some-inventory-bucket/inventoried-bucket/prefix")
end
let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" }
before do
inventory.s3_helper.stub_client_responses!
inventory.stubs(:cleanup!)
end
it "should raise error if an inventory file is not found" do
inventory.s3_client.stub_responses(:list_objects, contents: [])
output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq("Failed to list inventory from S3\n")
end
describe "verifying uploads" do
before do
freeze_time
CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate(
:upload,
etag: row[S3Inventory::CSV_ETAG_INDEX],
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
updated_at: 2.days.ago,
)
end
@upload_1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
@upload_2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
@no_etag = Fabricate(:upload, updated_at: 2.days.ago)
@upload_3 =
Fabricate(
:upload,
etag: "ETag3",
updated_at: 2.days.ago,
verification_status: Upload.verification_statuses[:s3_file_missing_confirmed],
)
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
inventory.expects(:inventory_date).times(2).returns(Time.now)
end
it "should display missing uploads correctly" do
output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end
it "should detect when a url match exists with a different etag" do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse")
output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag
#{@upload_1.url}
#{@no_etag.url}
3 of 5 uploads are missing
1 of these are caused by differing etags
Null the etag column and re-run for automatic backfill
TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(12)
output = capture_stdout { inventory.backfill_etags_and_list_missing }
verification_status = Upload.pluck(:verification_status)
expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(3)
expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(7)
end
it "does not affect the updated_at date of uploads" do
upload_1_updated = @upload_1.updated_at
upload_2_updated = @upload_2.updated_at
no_etag_updated = @no_etag.updated_at
output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(@upload_1.reload.updated_at).to eq_time(upload_1_updated)
expect(@upload_2.reload.updated_at).to eq_time(upload_2_updated)
expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
end
end
it "should backfill etags to uploads table correctly" do
files = [
[
"#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0184537a4f419224404d013414e913a4f56018f2.jpg",
"defcaac0b4aca535c284e95f30d608d0",
],
[
"#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png",
"25c02eaceef4cb779fc17030d33f7f06",
],
]
files.each { |file| Fabricate(:upload, url: file[0]) }
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
output =
capture_stdout do
expect { inventory.backfill_etags_and_list_missing }.to change {
Upload.where(etag: nil).count
}.by(-2)
end
expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files)
end
context "when site was restored from a backup" do
before do
freeze_time
BackupMetadata.update_last_restore_date(Time.now)
end
it "should run if inventory files are at least #{described_class::WAIT_AFTER_RESTORE_DAYS.days} days older than the last restore date" do
inventory.s3_client.stub_responses(
:list_objects_v2,
{
contents: [
{
key: "symlink.txt",
last_modified:
BackupMetadata.last_restore_date + described_class::WAIT_AFTER_RESTORE_DAYS.days,
size: 1,
},
],
},
)
inventory.s3_client.expects(:get_object).once
capture_stdout { inventory.backfill_etags_and_list_missing }
end
it "should not run if inventory files are not at least #{described_class::WAIT_AFTER_RESTORE_DAYS.days} days older than the last restore date" do
inventory.s3_client.stub_responses(
:list_objects_v2,
{
contents: [
{
key: "symlink.txt",
last_modified: BackupMetadata.last_restore_date + 1.day,
size: 1,
},
],
},
)
inventory.s3_client.expects(:get_object).never
capture_stdout { inventory.backfill_etags_and_list_missing }
end
end
it "should work when passed preloaded data" do
freeze_time
CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
end
upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
no_etag = Fabricate(:upload, updated_at: 2.days.ago)
output =
capture_stdout do
File.open(csv_filename) do |f|
preloaded_inventory =
S3Inventory.new(
:upload,
s3_inventory_bucket: "some-inventory-bucket",
preloaded_inventory_file: f,
preloaded_inventory_date: Time.now,
)
preloaded_inventory.backfill_etags_and_list_missing
end
end
expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end
end