From 5a00a041f1a9a00fb31b18956769262af6f11037 Mon Sep 17 00:00:00 2001 From: Bianca Nenciu Date: Fri, 8 Nov 2024 02:05:14 +0200 Subject: [PATCH] FEATURE: Mark bad uploads with :invalid_url (#29640) A "bad upload" in this context is a upload with a mismatched URL. This can happen when changing the S3 bucket used for uploads and the upload records in the database have not been remapped correctly. --- app/models/upload.rb | 16 ++++++++++++++-- lib/s3_inventory.rb | 35 ++++++++++++++++++++++++++++++++++- spec/lib/s3_inventory_spec.rb | 23 +++++++++++++++++++---- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/app/models/upload.rb b/app/models/upload.rb index 02df52ab18b..5c1ffd029fe 100644 --- a/app/models/upload.rb +++ b/app/models/upload.rb @@ -66,13 +66,25 @@ class Upload < ActiveRecord::Base scope :with_invalid_etag_verification_status, -> { where(verification_status: Upload.verification_statuses[:invalid_etag]) } + scope :with_invalid_url_verification_status, + -> { where(verification_status: Upload.verification_statuses[:invalid_url]) } + def self.verification_statuses @verification_statuses ||= Enum.new( unchecked: 1, verified: 2, - invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file - s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store + # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to + # the ETag value of the inventory file. A upload with invalid ETag is equivalent to "missing + # upload file" + invalid_etag: 3, + # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a + # file in the S3 file store + s3_file_missing_confirmed: 4, + # Used by S3Inventory to mark S3 Upload records that have an invalid url value compared to + # the url value of the inventory file. A upload with invalid URL is equivalent to "file + # exists (same ETag), but with a different URL" + invalid_url: 5, ) end diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb index 7057a978a12..0e1662a643e 100644 --- a/lib/s3_inventory.rb +++ b/lib/s3_inventory.rb @@ -85,7 +85,9 @@ class S3Inventory missing_uploads = uploads.joins( "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag", - ).where("#{tmp_table_name}.etag IS NULL") + ).where( + "#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url", + ) exists_with_different_etag = missing_uploads @@ -95,11 +97,20 @@ class S3Inventory .where("inventory2.etag IS NOT NULL") .pluck(:id) + exists_with_different_url = + missing_uploads + .joins( + "LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag", + ) + .where("inventory3.url != #{table_name}.url") + .pluck(:id) + # marking as verified/not verified if @model == Upload sql_params = { inventory_date: inventory_date, invalid_etag: Upload.verification_statuses[:invalid_etag], + invalid_url: Upload.verification_statuses[:invalid_url], s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed], verified: Upload.verification_statuses[:verified], seeded_id_threshold: @model::SEEDED_ID_THRESHOLD, @@ -135,6 +146,22 @@ class S3Inventory WHERE #{tmp_table_name}.etag = #{table_name}.etag ) SQL + + DB.exec(<<~SQL, sql_params) + UPDATE #{table_name} + SET verification_status = :invalid_url + WHERE verification_status <> :invalid_url + AND verification_status <> :invalid_etag + AND verification_status <> :s3_file_missing_confirmed + AND updated_at < :inventory_date + AND id > :seeded_id_threshold + AND NOT EXISTS + ( + SELECT 1 + FROM #{tmp_table_name} + WHERE #{tmp_table_name}.url = #{table_name}.url + ) + SQL end if (missing_count = missing_uploads.count) > 0 @@ -143,6 +170,8 @@ class S3Inventory .find_each do |upload| if exists_with_different_etag.include?(upload.id) log "#{upload.url} has different etag" + elsif exists_with_different_url.include?(upload.id) + log "#{upload.url} has different url" else log upload.url end @@ -153,6 +182,10 @@ class S3Inventory log "#{exists_with_different_etag.count} of these are caused by differing etags" log "Null the etag column and re-run for automatic backfill" end + if exists_with_different_url.present? + log "#{exists_with_different_url.count} of these are caused by differing urls" + log "Empty the url column and re-run for automatic backfill" + end end set_missing_s3_discourse_stats(missing_count) diff --git a/spec/lib/s3_inventory_spec.rb b/spec/lib/s3_inventory_spec.rb index 44afba50932..8ea55c28057 100644 --- a/spec/lib/s3_inventory_spec.rb +++ b/spec/lib/s3_inventory_spec.rb @@ -74,20 +74,29 @@ RSpec.describe S3Inventory do differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") differing_etag.update_columns(etag: "somethingelse") + differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a") + differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault")) + output = capture_stdout { inventory.backfill_etags_and_list_missing } expect(output).to eq(<<~TEXT) #{differing_etag.url} has different etag + #{differing_url.url} has different url #{@upload_1.url} #{@no_etag.url} - 3 of 5 uploads are missing + 4 of 5 uploads are missing 1 of these are caused by differing etags Null the etag column and re-run for automatic backfill + 1 of these are caused by differing urls + Empty the url column and re-run for automatic backfill TEXT - expect(Discourse.stats.get("missing_s3_uploads")).to eq(3) + expect(Discourse.stats.get("missing_s3_uploads")).to eq(4) end it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do + differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a") + differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault")) + expect( Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, ).to eq(12) @@ -96,9 +105,10 @@ RSpec.describe S3Inventory do verification_status = Upload.pluck(:verification_status) expect( Upload.where(verification_status: Upload.verification_statuses[:verified]).count, - ).to eq(3) + ).to eq(2) expect(Upload.with_invalid_etag_verification_status.count).to eq(2) + expect(Upload.with_invalid_url_verification_status.count).to eq(1) expect( Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, @@ -198,7 +208,12 @@ RSpec.describe S3Inventory do CSV.foreach(csv_filename, headers: false) do |row| next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default") - Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago) + Fabricate( + :upload, + url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]), + etag: row[S3Inventory::CSV_ETAG_INDEX], + updated_at: 2.days.ago, + ) end upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)