FEATURE: Mark bad uploads with :invalid_url (#29640)

A "bad upload" in this context is an upload with a mismatched URL. This can happen when the S3 bucket used for uploads is changed and the upload records in the database have not been remapped correctly.
This commit is contained in:
Bianca Nenciu 2024-11-08 02:05:14 +02:00 committed by GitHub
parent 81e171070d
commit 5a00a041f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 67 additions and 7 deletions

View File

@ -66,13 +66,25 @@ class Upload < ActiveRecord::Base
scope :with_invalid_etag_verification_status, scope :with_invalid_etag_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_etag]) } -> { where(verification_status: Upload.verification_statuses[:invalid_etag]) }
scope :with_invalid_url_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_url]) }
def self.verification_statuses def self.verification_statuses
@verification_statuses ||= @verification_statuses ||=
Enum.new( Enum.new(
unchecked: 1, unchecked: 1,
verified: 2, verified: 2,
invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to
s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store # the ETag value of the inventory file. A upload with invalid ETag is equivalent to "missing
# upload file"
invalid_etag: 3,
# Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a
# file in the S3 file store
s3_file_missing_confirmed: 4,
# Used by S3Inventory to mark S3 Upload records that have an invalid url value compared to
# the url value of the inventory file. An upload with invalid URL is equivalent to "file
# exists (same ETag), but with a different URL"
invalid_url: 5,
) )
end end

View File

@ -85,7 +85,9 @@ class S3Inventory
missing_uploads = missing_uploads =
uploads.joins( uploads.joins(
"LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag", "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
).where("#{tmp_table_name}.etag IS NULL") ).where(
"#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url",
)
exists_with_different_etag = exists_with_different_etag =
missing_uploads missing_uploads
@ -95,11 +97,20 @@ class S3Inventory
.where("inventory2.etag IS NOT NULL") .where("inventory2.etag IS NOT NULL")
.pluck(:id) .pluck(:id)
exists_with_different_url =
missing_uploads
.joins(
"LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag",
)
.where("inventory3.url != #{table_name}.url")
.pluck(:id)
# marking as verified/not verified # marking as verified/not verified
if @model == Upload if @model == Upload
sql_params = { sql_params = {
inventory_date: inventory_date, inventory_date: inventory_date,
invalid_etag: Upload.verification_statuses[:invalid_etag], invalid_etag: Upload.verification_statuses[:invalid_etag],
invalid_url: Upload.verification_statuses[:invalid_url],
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed], s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
verified: Upload.verification_statuses[:verified], verified: Upload.verification_statuses[:verified],
seeded_id_threshold: @model::SEEDED_ID_THRESHOLD, seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
@ -135,6 +146,22 @@ class S3Inventory
WHERE #{tmp_table_name}.etag = #{table_name}.etag WHERE #{tmp_table_name}.etag = #{table_name}.etag
) )
SQL SQL
DB.exec(<<~SQL, sql_params)
UPDATE #{table_name}
SET verification_status = :invalid_url
WHERE verification_status <> :invalid_url
AND verification_status <> :invalid_etag
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND NOT EXISTS
(
SELECT 1
FROM #{tmp_table_name}
WHERE #{tmp_table_name}.url = #{table_name}.url
)
SQL
end end
if (missing_count = missing_uploads.count) > 0 if (missing_count = missing_uploads.count) > 0
@ -143,6 +170,8 @@ class S3Inventory
.find_each do |upload| .find_each do |upload|
if exists_with_different_etag.include?(upload.id) if exists_with_different_etag.include?(upload.id)
log "#{upload.url} has different etag" log "#{upload.url} has different etag"
elsif exists_with_different_url.include?(upload.id)
log "#{upload.url} has different url"
else else
log upload.url log upload.url
end end
@ -153,6 +182,10 @@ class S3Inventory
log "#{exists_with_different_etag.count} of these are caused by differing etags" log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill" log "Null the etag column and re-run for automatic backfill"
end end
if exists_with_different_url.present?
log "#{exists_with_different_url.count} of these are caused by differing urls"
log "Empty the url column and re-run for automatic backfill"
end
end end
set_missing_s3_discourse_stats(missing_count) set_missing_s3_discourse_stats(missing_count)

View File

@ -74,20 +74,29 @@ RSpec.describe S3Inventory do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse") differing_etag.update_columns(etag: "somethingelse")
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq(<<~TEXT) expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag #{differing_etag.url} has different etag
#{differing_url.url} has different url
#{@upload_1.url} #{@upload_1.url}
#{@no_etag.url} #{@no_etag.url}
3 of 5 uploads are missing 4 of 5 uploads are missing
1 of these are caused by differing etags 1 of these are caused by differing etags
Null the etag column and re-run for automatic backfill Null the etag column and re-run for automatic backfill
1 of these are caused by differing urls
Empty the url column and re-run for automatic backfill
TEXT TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(3) expect(Discourse.stats.get("missing_s3_uploads")).to eq(4)
end end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(12) ).to eq(12)
@ -96,9 +105,10 @@ RSpec.describe S3Inventory do
verification_status = Upload.pluck(:verification_status) verification_status = Upload.pluck(:verification_status)
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count, Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(3) ).to eq(2)
expect(Upload.with_invalid_etag_verification_status.count).to eq(2) expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
expect(Upload.with_invalid_url_verification_status.count).to eq(1)
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
@ -198,7 +208,12 @@ RSpec.describe S3Inventory do
CSV.foreach(csv_filename, headers: false) do |row| CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default") next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago) Fabricate(
:upload,
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
etag: row[S3Inventory::CSV_ETAG_INDEX],
updated_at: 2.days.ago,
)
end end
upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)