FEATURE: Mark bad uploads with :invalid_url (#29640)

A "bad upload" in this context is a upload with a mismatched URL. This can happen when changing the S3 bucket used for uploads and the upload records in the database have not been remapped correctly.
This commit is contained in:
Bianca Nenciu 2024-11-08 02:05:14 +02:00 committed by GitHub
parent 81e171070d
commit 5a00a041f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 67 additions and 7 deletions

View File

@ -66,13 +66,25 @@ class Upload < ActiveRecord::Base
scope :with_invalid_etag_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_etag]) }
scope :with_invalid_url_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_url]) }
def self.verification_statuses
@verification_statuses ||=
Enum.new(
unchecked: 1,
verified: 2,
invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file
s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store
# Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to
# the ETag value of the inventory file. A upload with invalid ETag is equivalent to "missing
# upload file"
invalid_etag: 3,
# Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a
# file in the S3 file store
s3_file_missing_confirmed: 4,
# Used by S3Inventory to mark S3 Upload records that have an invalid url value compared to
# the url value of the inventory file. A upload with invalid URL is equivalent to "file
# exists (same ETag), but with a different URL"
invalid_url: 5,
)
end

View File

@ -85,7 +85,9 @@ class S3Inventory
missing_uploads =
uploads.joins(
"LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
).where("#{tmp_table_name}.etag IS NULL")
).where(
"#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url",
)
exists_with_different_etag =
missing_uploads
@ -95,11 +97,20 @@ class S3Inventory
.where("inventory2.etag IS NOT NULL")
.pluck(:id)
exists_with_different_url =
missing_uploads
.joins(
"LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag",
)
.where("inventory3.url != #{table_name}.url")
.pluck(:id)
# marking as verified/not verified
if @model == Upload
sql_params = {
inventory_date: inventory_date,
invalid_etag: Upload.verification_statuses[:invalid_etag],
invalid_url: Upload.verification_statuses[:invalid_url],
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
verified: Upload.verification_statuses[:verified],
seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
@ -135,6 +146,22 @@ class S3Inventory
WHERE #{tmp_table_name}.etag = #{table_name}.etag
)
SQL
DB.exec(<<~SQL, sql_params)
UPDATE #{table_name}
SET verification_status = :invalid_url
WHERE verification_status <> :invalid_url
AND verification_status <> :invalid_etag
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND NOT EXISTS
(
SELECT 1
FROM #{tmp_table_name}
WHERE #{tmp_table_name}.url = #{table_name}.url
)
SQL
end
if (missing_count = missing_uploads.count) > 0
@ -143,6 +170,8 @@ class S3Inventory
.find_each do |upload|
if exists_with_different_etag.include?(upload.id)
log "#{upload.url} has different etag"
elsif exists_with_different_url.include?(upload.id)
log "#{upload.url} has different url"
else
log upload.url
end
@ -153,6 +182,10 @@ class S3Inventory
log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill"
end
if exists_with_different_url.present?
log "#{exists_with_different_url.count} of these are caused by differing urls"
log "Empty the url column and re-run for automatic backfill"
end
end
set_missing_s3_discourse_stats(missing_count)

View File

@ -74,20 +74,29 @@ RSpec.describe S3Inventory do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse")
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag
#{differing_url.url} has different url
#{@upload_1.url}
#{@no_etag.url}
3 of 5 uploads are missing
4 of 5 uploads are missing
1 of these are caused by differing etags
Null the etag column and re-run for automatic backfill
1 of these are caused by differing urls
Empty the url column and re-run for automatic backfill
TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
expect(Discourse.stats.get("missing_s3_uploads")).to eq(4)
end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(12)
@ -96,9 +105,10 @@ RSpec.describe S3Inventory do
verification_status = Upload.pluck(:verification_status)
expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(3)
).to eq(2)
expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
expect(Upload.with_invalid_url_verification_status.count).to eq(1)
expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
@ -198,7 +208,12 @@ RSpec.describe S3Inventory do
CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
Fabricate(
:upload,
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
etag: row[S3Inventory::CSV_ETAG_INDEX],
updated_at: 2.days.ago,
)
end
upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)