DEV: Allow site administrators to mark S3 uploads with a missing status (#27222)

This commit introduces the following changes which allows a site
administrator to mark `Upload` records with the `s3_file_missing`
verification status which will result in the `Upload` record being ignored when
`Discourse.store.list_missing_uploads` is ran on a site where S3 uploads
are enabled and `SiteSetting.enable_s3_inventory` is set to `true`.

1. Introduce `s3_file_missing` to `Upload.verification_statuses`
2. Introduce `Upload.mark_invalid_s3_uploads_as_missing` which updates
   `Upload#verification_status` of all `Upload` records from `invalid_etag` to `s3_file_missing`.
3. Introduce `rake uploads:mark_invalid_s3_uploads_as_missing` Rake task
   which allows a site administrator to change `Upload` records with
`invalid_etag` verification status to the `s3_file_missing`
verificaton_status.
4. Update `S3Inventory` to ignore `Upload` records with the
   `s3_file_missing` verification status.
This commit is contained in:
Alan Guo Xiang Tan 2024-05-30 08:37:38 +08:00 committed by GitHub
parent 2d1ab4c9e3
commit dc55b645b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 125 additions and 44 deletions

View File

@ -57,8 +57,28 @@ class Upload < ActiveRecord::Base
scope :by_users, -> { where("uploads.id > ?", SEEDED_ID_THRESHOLD) } scope :by_users, -> { where("uploads.id > ?", SEEDED_ID_THRESHOLD) }
scope :without_s3_file_missing_confirmed_verification_status,
-> do
where.not(verification_status: Upload.verification_statuses[:s3_file_missing_confirmed])
end
scope :with_invalid_etag_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_etag]) }
def self.verification_statuses def self.verification_statuses
@verification_statuses ||= Enum.new(unchecked: 1, verified: 2, invalid_etag: 3) @verification_statuses ||=
Enum.new(
unchecked: 1,
verified: 2,
invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file
s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store
)
end
def self.mark_invalid_s3_uploads_as_missing
Upload.with_invalid_etag_verification_status.update_all(
verification_status: Upload.verification_statuses[:s3_file_missing_confirmed],
)
end end
def self.add_unused_callback(&block) def self.add_unused_callback(&block)

View File

@ -4,7 +4,7 @@ require "aws-sdk-s3"
require "csv" require "csv"
class S3Inventory class S3Inventory
attr_reader :type, :model, :inventory_date attr_reader :type, :inventory_date
CSV_KEY_INDEX = 1 CSV_KEY_INDEX = 1
CSV_ETAG_INDEX = 2 CSV_ETAG_INDEX = 2
@ -25,9 +25,12 @@ class S3Inventory
if type == :upload if type == :upload
@type = "original" @type = "original"
@model = Upload @model = Upload
@scope = @model.by_users.without_s3_file_missing_confirmed_verification_status
elsif type == :optimized elsif type == :optimized
@type = "optimized" @type = "optimized"
@model = OptimizedImage @scope = @model = OptimizedImage
else
raise "Invalid type: #{type}"
end end
end end
@ -46,10 +49,10 @@ class S3Inventory
ActiveRecord::Base.transaction do ActiveRecord::Base.transaction do
begin begin
connection.exec( connection.exec(
"CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))", "CREATE TEMP TABLE #{tmp_table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
) )
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do connection.copy_data("COPY #{tmp_table_name} FROM STDIN CSV") do
for_each_inventory_row do |row| for_each_inventory_row do |row|
key = row[CSV_KEY_INDEX] key = row[CSV_KEY_INDEX]
@ -61,66 +64,70 @@ class S3Inventory
end end
end end
table_name = @model.table_name
# backfilling etags # backfilling etags
connection.async_exec( connection.async_exec(
"UPDATE #{model.table_name} "UPDATE #{table_name}
SET etag = #{table_name}.etag SET etag = #{tmp_table_name}.etag
FROM #{table_name} FROM #{tmp_table_name}
WHERE #{model.table_name}.etag IS NULL AND WHERE #{table_name}.etag IS NULL AND
#{model.table_name}.url = #{table_name}.url", #{table_name}.url = #{tmp_table_name}.url",
) )
uploads = model.where("updated_at < ?", inventory_date) uploads = @scope.where("updated_at < ?", inventory_date)
uploads = uploads.by_users if model == Upload
missing_uploads = missing_uploads =
uploads.joins( uploads.joins(
"LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag", "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
).where("#{table_name}.etag IS NULL") ).where("#{tmp_table_name}.etag IS NULL")
exists_with_different_etag = exists_with_different_etag =
missing_uploads missing_uploads
.joins( .joins(
"LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url", "LEFT JOIN #{tmp_table_name} inventory2 ON inventory2.url = #{table_name}.url",
) )
.where("inventory2.etag IS NOT NULL") .where("inventory2.etag IS NOT NULL")
.pluck(:id) .pluck(:id)
# marking as verified/not verified # marking as verified/not verified
if model == Upload if @model == Upload
sql_params = { sql_params = {
inventory_date: inventory_date, inventory_date: inventory_date,
invalid_etag: Upload.verification_statuses[:invalid_etag], invalid_etag: Upload.verification_statuses[:invalid_etag],
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
verified: Upload.verification_statuses[:verified], verified: Upload.verification_statuses[:verified],
seeded_id_threshold: model::SEEDED_ID_THRESHOLD, seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
} }
DB.exec(<<~SQL, sql_params) DB.exec(<<~SQL, sql_params)
UPDATE #{model.table_name} UPDATE #{table_name}
SET verification_status = :verified SET verification_status = :verified
WHERE etag IS NOT NULL WHERE etag IS NOT NULL
AND verification_status <> :verified AND verification_status <> :verified
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date AND updated_at < :inventory_date
AND id > :seeded_id_threshold AND id > :seeded_id_threshold
AND EXISTS AND EXISTS
( (
SELECT 1 SELECT 1
FROM #{table_name} FROM #{tmp_table_name}
WHERE #{table_name}.etag = #{model.table_name}.etag WHERE #{tmp_table_name}.etag = #{table_name}.etag
) )
SQL SQL
DB.exec(<<~SQL, sql_params) DB.exec(<<~SQL, sql_params)
UPDATE #{model.table_name} UPDATE #{table_name}
SET verification_status = :invalid_etag SET verification_status = :invalid_etag
WHERE verification_status <> :invalid_etag WHERE verification_status <> :invalid_etag
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date AND updated_at < :inventory_date
AND id > :seeded_id_threshold AND id > :seeded_id_threshold
AND NOT EXISTS AND NOT EXISTS
( (
SELECT 1 SELECT 1
FROM #{table_name} FROM #{tmp_table_name}
WHERE #{table_name}.etag = #{model.table_name}.etag WHERE #{tmp_table_name}.etag = #{table_name}.etag
) )
SQL SQL
end end
@ -136,16 +143,16 @@ class S3Inventory
end end
end end
log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing" log "#{missing_count} of #{uploads.count} #{@scope.name.underscore.pluralize} are missing"
if exists_with_different_etag.present? if exists_with_different_etag.present?
log "#{exists_with_different_etag.count} of these are caused by differing etags" log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill" log "Null the etag column and re-run for automatic backfill"
end end
end end
Discourse.stats.set("missing_s3_#{model.table_name}", missing_count) Discourse.stats.set("missing_s3_#{table_name}", missing_count)
ensure ensure
connection.exec("DROP TABLE #{table_name}") unless connection.nil? connection.exec("DROP TABLE #{tmp_table_name}") unless connection.nil?
end end
end end
ensure ensure
@ -255,7 +262,7 @@ class S3Inventory
@connection ||= ActiveRecord::Base.connection.raw_connection @connection ||= ActiveRecord::Base.connection.raw_connection
end end
def table_name def tmp_table_name
"#{type}_inventory" "#{type}_inventory"
end end

View File

@ -949,7 +949,7 @@ def analyze_missing_s3
puts puts
end end
missing_uploads = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]) missing_uploads = Upload.with_invalid_etag_verification_status
puts "Total missing uploads: #{missing_uploads.count}, newest is #{missing_uploads.maximum(:created_at)}" puts "Total missing uploads: #{missing_uploads.count}, newest is #{missing_uploads.maximum(:created_at)}"
puts "Total problem posts: #{lookup.keys.count} with #{lookup.values.sum { |a| a.length }} missing uploads" puts "Total problem posts: #{lookup.keys.count} with #{lookup.values.sum { |a| a.length }} missing uploads"
puts "Other missing uploads count: #{other.count}" puts "Other missing uploads count: #{other.count}"
@ -991,16 +991,15 @@ def analyze_missing_s3
end end
def delete_missing_s3 def delete_missing_s3
missing = missing = Upload.with_invalid_etag_verification_status.order(:created_at)
Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).order(
:created_at,
)
count = missing.count count = missing.count
if count > 0 if count > 0
puts "The following uploads will be deleted from the database" puts "The following uploads will be deleted from the database"
missing.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" } missing.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" }
puts "Please confirm you wish to delete #{count} upload records by typing YES" puts "Please confirm you wish to delete #{count} upload records by typing YES"
confirm = STDIN.gets.strip confirm = STDIN.gets.strip
if confirm == "YES" if confirm == "YES"
missing.destroy_all missing.destroy_all
puts "#{count} records were deleted" puts "#{count} records were deleted"
@ -1011,6 +1010,29 @@ def delete_missing_s3
end end
end end
task "uploads:mark_invalid_s3_uploads_as_missing" => :environment do
puts "Marking invalid S3 uploads as missing for '#{RailsMultisite::ConnectionManagement.current_db}'..."
invalid_s3_uploads = Upload.with_invalid_etag_verification_status.order(:created_at)
count = invalid_s3_uploads.count
if count > 0
puts "The following uploads will be marked as missing on S3"
invalid_s3_uploads.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" }
puts "Please confirm you wish to mark #{count} upload records as missing by typing YES"
confirm = STDIN.gets.strip
if confirm == "YES"
changed_count = Upload.mark_invalid_s3_uploads_as_missing
puts "#{changed_count} records were marked as missing"
else
STDERR.puts "Aborting"
exit 1
end
else
puts "No uploads found with invalid S3 etag verification status"
end
end
task "uploads:delete_missing_s3" => :environment do task "uploads:delete_missing_s3" => :environment do
if RailsMultisite::ConnectionManagement.current_db != "default" if RailsMultisite::ConnectionManagement.current_db != "default"
delete_missing_s3 delete_missing_s3
@ -1031,7 +1053,7 @@ def fix_missing_s3
Jobs.run_immediately! Jobs.run_immediately!
puts "Attempting to download missing uploads and recreate" puts "Attempting to download missing uploads and recreate"
ids = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).pluck(:id) ids = Upload.with_invalid_etag_verification_status.pluck(:id)
ids.each do |id| ids.each do |id|
upload = Upload.find_by(id: id) upload = Upload.find_by(id: id)
next if !upload next if !upload

View File

@ -38,10 +38,18 @@ RSpec.describe S3Inventory do
) )
end end
@upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) @upload_1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
@upload2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now) @upload_2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
@no_etag = Fabricate(:upload, updated_at: 2.days.ago) @no_etag = Fabricate(:upload, updated_at: 2.days.ago)
@upload_3 =
Fabricate(
:upload,
etag: "ETag3",
updated_at: 2.days.ago,
verification_status: Upload.verification_statuses[:s3_file_missing_confirmed],
)
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3) inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
inventory.expects(:inventory_date).times(2).returns(Time.now) inventory.expects(:inventory_date).times(2).returns(Time.now)
end end
@ -49,7 +57,7 @@ RSpec.describe S3Inventory do
it "should display missing uploads correctly" do it "should display missing uploads correctly" do
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq("#{@upload1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n") expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end end
@ -61,7 +69,7 @@ RSpec.describe S3Inventory do
expect(output).to eq(<<~TEXT) expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag #{differing_etag.url} has different etag
#{@upload1.url} #{@upload_1.url}
#{@no_etag.url} #{@no_etag.url}
3 of 5 uploads are missing 3 of 5 uploads are missing
1 of these are caused by differing etags 1 of these are caused by differing etags
@ -80,23 +88,23 @@ RSpec.describe S3Inventory do
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count, Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(3) ).to eq(3)
expect(
Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).count, expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
).to eq(2)
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(7) ).to eq(7)
end end
it "does not affect the updated_at date of uploads" do it "does not affect the updated_at date of uploads" do
upload_1_updated = @upload1.updated_at upload_1_updated = @upload_1.updated_at
upload_2_updated = @upload2.updated_at upload_2_updated = @upload_2.updated_at
no_etag_updated = @no_etag.updated_at no_etag_updated = @no_etag.updated_at
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(@upload1.reload.updated_at).to eq_time(upload_1_updated) expect(@upload_1.reload.updated_at).to eq_time(upload_1_updated)
expect(@upload2.reload.updated_at).to eq_time(upload_2_updated) expect(@upload_2.reload.updated_at).to eq_time(upload_2_updated)
expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated) expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
end end
end end

View File

@ -809,4 +809,28 @@ RSpec.describe Upload do
expect { u.update!(dominant_color: "abcd") }.to raise_error(ActiveRecord::RecordInvalid) expect { u.update!(dominant_color: "abcd") }.to raise_error(ActiveRecord::RecordInvalid)
end end
end end
describe ".mark_invalid_s3_uploads_as_missing" do
it "should update all upload records with a `verification_status` of `invalid_etag` to `s3_file_missing`" do
upload_1 =
Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:invalid_etag])
upload_2 =
Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:invalid_etag])
upload_3 = Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:verified])
Upload.mark_invalid_s3_uploads_as_missing
expect(upload_1.reload.verification_status).to eq(
Upload.verification_statuses[:s3_file_missing_confirmed],
)
expect(upload_2.reload.verification_status).to eq(
Upload.verification_statuses[:s3_file_missing_confirmed],
)
expect(upload_3.reload.verification_status).to eq(Upload.verification_statuses[:verified])
end
end
end end