mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 09:42:07 +08:00
DEV: Allow site administrators to mark S3 uploads with a missing status (#27222)
This commit introduces the following changes which allows a site administrator to mark `Upload` records with the `s3_file_missing` verification status which will result in the `Upload` record being ignored when `Discourse.store.list_missing_uploads` is ran on a site where S3 uploads are enabled and `SiteSetting.enable_s3_inventory` is set to `true`. 1. Introduce `s3_file_missing` to `Upload.verification_statuses` 2. Introduce `Upload.mark_invalid_s3_uploads_as_missing` which updates `Upload#verification_status` of all `Upload` records from `invalid_etag` to `s3_file_missing`. 3. Introduce `rake uploads:mark_invalid_s3_uploads_as_missing` Rake task which allows a site administrator to change `Upload` records with `invalid_etag` verification status to the `s3_file_missing` verificaton_status. 4. Update `S3Inventory` to ignore `Upload` records with the `s3_file_missing` verification status.
This commit is contained in:
parent
2d1ab4c9e3
commit
dc55b645b2
|
@ -57,8 +57,28 @@ class Upload < ActiveRecord::Base
|
||||||
|
|
||||||
scope :by_users, -> { where("uploads.id > ?", SEEDED_ID_THRESHOLD) }
|
scope :by_users, -> { where("uploads.id > ?", SEEDED_ID_THRESHOLD) }
|
||||||
|
|
||||||
|
scope :without_s3_file_missing_confirmed_verification_status,
|
||||||
|
-> do
|
||||||
|
where.not(verification_status: Upload.verification_statuses[:s3_file_missing_confirmed])
|
||||||
|
end
|
||||||
|
|
||||||
|
scope :with_invalid_etag_verification_status,
|
||||||
|
-> { where(verification_status: Upload.verification_statuses[:invalid_etag]) }
|
||||||
|
|
||||||
def self.verification_statuses
|
def self.verification_statuses
|
||||||
@verification_statuses ||= Enum.new(unchecked: 1, verified: 2, invalid_etag: 3)
|
@verification_statuses ||=
|
||||||
|
Enum.new(
|
||||||
|
unchecked: 1,
|
||||||
|
verified: 2,
|
||||||
|
invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file
|
||||||
|
s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.mark_invalid_s3_uploads_as_missing
|
||||||
|
Upload.with_invalid_etag_verification_status.update_all(
|
||||||
|
verification_status: Upload.verification_statuses[:s3_file_missing_confirmed],
|
||||||
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.add_unused_callback(&block)
|
def self.add_unused_callback(&block)
|
||||||
|
|
|
@ -4,7 +4,7 @@ require "aws-sdk-s3"
|
||||||
require "csv"
|
require "csv"
|
||||||
|
|
||||||
class S3Inventory
|
class S3Inventory
|
||||||
attr_reader :type, :model, :inventory_date
|
attr_reader :type, :inventory_date
|
||||||
|
|
||||||
CSV_KEY_INDEX = 1
|
CSV_KEY_INDEX = 1
|
||||||
CSV_ETAG_INDEX = 2
|
CSV_ETAG_INDEX = 2
|
||||||
|
@ -25,9 +25,12 @@ class S3Inventory
|
||||||
if type == :upload
|
if type == :upload
|
||||||
@type = "original"
|
@type = "original"
|
||||||
@model = Upload
|
@model = Upload
|
||||||
|
@scope = @model.by_users.without_s3_file_missing_confirmed_verification_status
|
||||||
elsif type == :optimized
|
elsif type == :optimized
|
||||||
@type = "optimized"
|
@type = "optimized"
|
||||||
@model = OptimizedImage
|
@scope = @model = OptimizedImage
|
||||||
|
else
|
||||||
|
raise "Invalid type: #{type}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -46,10 +49,10 @@ class S3Inventory
|
||||||
ActiveRecord::Base.transaction do
|
ActiveRecord::Base.transaction do
|
||||||
begin
|
begin
|
||||||
connection.exec(
|
connection.exec(
|
||||||
"CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
|
"CREATE TEMP TABLE #{tmp_table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
|
||||||
)
|
)
|
||||||
|
|
||||||
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
|
connection.copy_data("COPY #{tmp_table_name} FROM STDIN CSV") do
|
||||||
for_each_inventory_row do |row|
|
for_each_inventory_row do |row|
|
||||||
key = row[CSV_KEY_INDEX]
|
key = row[CSV_KEY_INDEX]
|
||||||
|
|
||||||
|
@ -61,66 +64,70 @@ class S3Inventory
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
table_name = @model.table_name
|
||||||
|
|
||||||
# backfilling etags
|
# backfilling etags
|
||||||
connection.async_exec(
|
connection.async_exec(
|
||||||
"UPDATE #{model.table_name}
|
"UPDATE #{table_name}
|
||||||
SET etag = #{table_name}.etag
|
SET etag = #{tmp_table_name}.etag
|
||||||
FROM #{table_name}
|
FROM #{tmp_table_name}
|
||||||
WHERE #{model.table_name}.etag IS NULL AND
|
WHERE #{table_name}.etag IS NULL AND
|
||||||
#{model.table_name}.url = #{table_name}.url",
|
#{table_name}.url = #{tmp_table_name}.url",
|
||||||
)
|
)
|
||||||
|
|
||||||
uploads = model.where("updated_at < ?", inventory_date)
|
uploads = @scope.where("updated_at < ?", inventory_date)
|
||||||
uploads = uploads.by_users if model == Upload
|
|
||||||
|
|
||||||
missing_uploads =
|
missing_uploads =
|
||||||
uploads.joins(
|
uploads.joins(
|
||||||
"LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag",
|
"LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
|
||||||
).where("#{table_name}.etag IS NULL")
|
).where("#{tmp_table_name}.etag IS NULL")
|
||||||
|
|
||||||
exists_with_different_etag =
|
exists_with_different_etag =
|
||||||
missing_uploads
|
missing_uploads
|
||||||
.joins(
|
.joins(
|
||||||
"LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url",
|
"LEFT JOIN #{tmp_table_name} inventory2 ON inventory2.url = #{table_name}.url",
|
||||||
)
|
)
|
||||||
.where("inventory2.etag IS NOT NULL")
|
.where("inventory2.etag IS NOT NULL")
|
||||||
.pluck(:id)
|
.pluck(:id)
|
||||||
|
|
||||||
# marking as verified/not verified
|
# marking as verified/not verified
|
||||||
if model == Upload
|
if @model == Upload
|
||||||
sql_params = {
|
sql_params = {
|
||||||
inventory_date: inventory_date,
|
inventory_date: inventory_date,
|
||||||
invalid_etag: Upload.verification_statuses[:invalid_etag],
|
invalid_etag: Upload.verification_statuses[:invalid_etag],
|
||||||
|
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
|
||||||
verified: Upload.verification_statuses[:verified],
|
verified: Upload.verification_statuses[:verified],
|
||||||
seeded_id_threshold: model::SEEDED_ID_THRESHOLD,
|
seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
|
||||||
}
|
}
|
||||||
|
|
||||||
DB.exec(<<~SQL, sql_params)
|
DB.exec(<<~SQL, sql_params)
|
||||||
UPDATE #{model.table_name}
|
UPDATE #{table_name}
|
||||||
SET verification_status = :verified
|
SET verification_status = :verified
|
||||||
WHERE etag IS NOT NULL
|
WHERE etag IS NOT NULL
|
||||||
AND verification_status <> :verified
|
AND verification_status <> :verified
|
||||||
|
AND verification_status <> :s3_file_missing_confirmed
|
||||||
AND updated_at < :inventory_date
|
AND updated_at < :inventory_date
|
||||||
AND id > :seeded_id_threshold
|
AND id > :seeded_id_threshold
|
||||||
AND EXISTS
|
AND EXISTS
|
||||||
(
|
(
|
||||||
SELECT 1
|
SELECT 1
|
||||||
FROM #{table_name}
|
FROM #{tmp_table_name}
|
||||||
WHERE #{table_name}.etag = #{model.table_name}.etag
|
WHERE #{tmp_table_name}.etag = #{table_name}.etag
|
||||||
)
|
)
|
||||||
SQL
|
SQL
|
||||||
|
|
||||||
DB.exec(<<~SQL, sql_params)
|
DB.exec(<<~SQL, sql_params)
|
||||||
UPDATE #{model.table_name}
|
UPDATE #{table_name}
|
||||||
SET verification_status = :invalid_etag
|
SET verification_status = :invalid_etag
|
||||||
WHERE verification_status <> :invalid_etag
|
WHERE verification_status <> :invalid_etag
|
||||||
|
AND verification_status <> :s3_file_missing_confirmed
|
||||||
AND updated_at < :inventory_date
|
AND updated_at < :inventory_date
|
||||||
AND id > :seeded_id_threshold
|
AND id > :seeded_id_threshold
|
||||||
AND NOT EXISTS
|
AND NOT EXISTS
|
||||||
(
|
(
|
||||||
SELECT 1
|
SELECT 1
|
||||||
FROM #{table_name}
|
FROM #{tmp_table_name}
|
||||||
WHERE #{table_name}.etag = #{model.table_name}.etag
|
WHERE #{tmp_table_name}.etag = #{table_name}.etag
|
||||||
)
|
)
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
@ -136,16 +143,16 @@ class S3Inventory
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
|
log "#{missing_count} of #{uploads.count} #{@scope.name.underscore.pluralize} are missing"
|
||||||
if exists_with_different_etag.present?
|
if exists_with_different_etag.present?
|
||||||
log "#{exists_with_different_etag.count} of these are caused by differing etags"
|
log "#{exists_with_different_etag.count} of these are caused by differing etags"
|
||||||
log "Null the etag column and re-run for automatic backfill"
|
log "Null the etag column and re-run for automatic backfill"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)
|
Discourse.stats.set("missing_s3_#{table_name}", missing_count)
|
||||||
ensure
|
ensure
|
||||||
connection.exec("DROP TABLE #{table_name}") unless connection.nil?
|
connection.exec("DROP TABLE #{tmp_table_name}") unless connection.nil?
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
ensure
|
ensure
|
||||||
|
@ -255,7 +262,7 @@ class S3Inventory
|
||||||
@connection ||= ActiveRecord::Base.connection.raw_connection
|
@connection ||= ActiveRecord::Base.connection.raw_connection
|
||||||
end
|
end
|
||||||
|
|
||||||
def table_name
|
def tmp_table_name
|
||||||
"#{type}_inventory"
|
"#{type}_inventory"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -949,7 +949,7 @@ def analyze_missing_s3
|
||||||
puts
|
puts
|
||||||
end
|
end
|
||||||
|
|
||||||
missing_uploads = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag])
|
missing_uploads = Upload.with_invalid_etag_verification_status
|
||||||
puts "Total missing uploads: #{missing_uploads.count}, newest is #{missing_uploads.maximum(:created_at)}"
|
puts "Total missing uploads: #{missing_uploads.count}, newest is #{missing_uploads.maximum(:created_at)}"
|
||||||
puts "Total problem posts: #{lookup.keys.count} with #{lookup.values.sum { |a| a.length }} missing uploads"
|
puts "Total problem posts: #{lookup.keys.count} with #{lookup.values.sum { |a| a.length }} missing uploads"
|
||||||
puts "Other missing uploads count: #{other.count}"
|
puts "Other missing uploads count: #{other.count}"
|
||||||
|
@ -991,16 +991,15 @@ def analyze_missing_s3
|
||||||
end
|
end
|
||||||
|
|
||||||
def delete_missing_s3
|
def delete_missing_s3
|
||||||
missing =
|
missing = Upload.with_invalid_etag_verification_status.order(:created_at)
|
||||||
Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).order(
|
|
||||||
:created_at,
|
|
||||||
)
|
|
||||||
count = missing.count
|
count = missing.count
|
||||||
|
|
||||||
if count > 0
|
if count > 0
|
||||||
puts "The following uploads will be deleted from the database"
|
puts "The following uploads will be deleted from the database"
|
||||||
missing.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" }
|
missing.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" }
|
||||||
puts "Please confirm you wish to delete #{count} upload records by typing YES"
|
puts "Please confirm you wish to delete #{count} upload records by typing YES"
|
||||||
confirm = STDIN.gets.strip
|
confirm = STDIN.gets.strip
|
||||||
|
|
||||||
if confirm == "YES"
|
if confirm == "YES"
|
||||||
missing.destroy_all
|
missing.destroy_all
|
||||||
puts "#{count} records were deleted"
|
puts "#{count} records were deleted"
|
||||||
|
@ -1011,6 +1010,29 @@ def delete_missing_s3
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
task "uploads:mark_invalid_s3_uploads_as_missing" => :environment do
|
||||||
|
puts "Marking invalid S3 uploads as missing for '#{RailsMultisite::ConnectionManagement.current_db}'..."
|
||||||
|
invalid_s3_uploads = Upload.with_invalid_etag_verification_status.order(:created_at)
|
||||||
|
count = invalid_s3_uploads.count
|
||||||
|
|
||||||
|
if count > 0
|
||||||
|
puts "The following uploads will be marked as missing on S3"
|
||||||
|
invalid_s3_uploads.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" }
|
||||||
|
puts "Please confirm you wish to mark #{count} upload records as missing by typing YES"
|
||||||
|
confirm = STDIN.gets.strip
|
||||||
|
|
||||||
|
if confirm == "YES"
|
||||||
|
changed_count = Upload.mark_invalid_s3_uploads_as_missing
|
||||||
|
puts "#{changed_count} records were marked as missing"
|
||||||
|
else
|
||||||
|
STDERR.puts "Aborting"
|
||||||
|
exit 1
|
||||||
|
end
|
||||||
|
else
|
||||||
|
puts "No uploads found with invalid S3 etag verification status"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
task "uploads:delete_missing_s3" => :environment do
|
task "uploads:delete_missing_s3" => :environment do
|
||||||
if RailsMultisite::ConnectionManagement.current_db != "default"
|
if RailsMultisite::ConnectionManagement.current_db != "default"
|
||||||
delete_missing_s3
|
delete_missing_s3
|
||||||
|
@ -1031,7 +1053,7 @@ def fix_missing_s3
|
||||||
Jobs.run_immediately!
|
Jobs.run_immediately!
|
||||||
|
|
||||||
puts "Attempting to download missing uploads and recreate"
|
puts "Attempting to download missing uploads and recreate"
|
||||||
ids = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).pluck(:id)
|
ids = Upload.with_invalid_etag_verification_status.pluck(:id)
|
||||||
ids.each do |id|
|
ids.each do |id|
|
||||||
upload = Upload.find_by(id: id)
|
upload = Upload.find_by(id: id)
|
||||||
next if !upload
|
next if !upload
|
||||||
|
|
|
@ -38,10 +38,18 @@ RSpec.describe S3Inventory do
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
@upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
|
@upload_1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
|
||||||
@upload2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
|
@upload_2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
|
||||||
@no_etag = Fabricate(:upload, updated_at: 2.days.ago)
|
@no_etag = Fabricate(:upload, updated_at: 2.days.ago)
|
||||||
|
|
||||||
|
@upload_3 =
|
||||||
|
Fabricate(
|
||||||
|
:upload,
|
||||||
|
etag: "ETag3",
|
||||||
|
updated_at: 2.days.ago,
|
||||||
|
verification_status: Upload.verification_statuses[:s3_file_missing_confirmed],
|
||||||
|
)
|
||||||
|
|
||||||
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
|
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
|
||||||
inventory.expects(:inventory_date).times(2).returns(Time.now)
|
inventory.expects(:inventory_date).times(2).returns(Time.now)
|
||||||
end
|
end
|
||||||
|
@ -49,7 +57,7 @@ RSpec.describe S3Inventory do
|
||||||
it "should display missing uploads correctly" do
|
it "should display missing uploads correctly" do
|
||||||
output = capture_stdout { inventory.backfill_etags_and_list_missing }
|
output = capture_stdout { inventory.backfill_etags_and_list_missing }
|
||||||
|
|
||||||
expect(output).to eq("#{@upload1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
|
expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
|
||||||
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
|
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -61,7 +69,7 @@ RSpec.describe S3Inventory do
|
||||||
|
|
||||||
expect(output).to eq(<<~TEXT)
|
expect(output).to eq(<<~TEXT)
|
||||||
#{differing_etag.url} has different etag
|
#{differing_etag.url} has different etag
|
||||||
#{@upload1.url}
|
#{@upload_1.url}
|
||||||
#{@no_etag.url}
|
#{@no_etag.url}
|
||||||
3 of 5 uploads are missing
|
3 of 5 uploads are missing
|
||||||
1 of these are caused by differing etags
|
1 of these are caused by differing etags
|
||||||
|
@ -80,23 +88,23 @@ RSpec.describe S3Inventory do
|
||||||
expect(
|
expect(
|
||||||
Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
|
Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
|
||||||
).to eq(3)
|
).to eq(3)
|
||||||
expect(
|
|
||||||
Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).count,
|
expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
|
||||||
).to eq(2)
|
|
||||||
expect(
|
expect(
|
||||||
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
|
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
|
||||||
).to eq(7)
|
).to eq(7)
|
||||||
end
|
end
|
||||||
|
|
||||||
it "does not affect the updated_at date of uploads" do
|
it "does not affect the updated_at date of uploads" do
|
||||||
upload_1_updated = @upload1.updated_at
|
upload_1_updated = @upload_1.updated_at
|
||||||
upload_2_updated = @upload2.updated_at
|
upload_2_updated = @upload_2.updated_at
|
||||||
no_etag_updated = @no_etag.updated_at
|
no_etag_updated = @no_etag.updated_at
|
||||||
|
|
||||||
output = capture_stdout { inventory.backfill_etags_and_list_missing }
|
output = capture_stdout { inventory.backfill_etags_and_list_missing }
|
||||||
|
|
||||||
expect(@upload1.reload.updated_at).to eq_time(upload_1_updated)
|
expect(@upload_1.reload.updated_at).to eq_time(upload_1_updated)
|
||||||
expect(@upload2.reload.updated_at).to eq_time(upload_2_updated)
|
expect(@upload_2.reload.updated_at).to eq_time(upload_2_updated)
|
||||||
expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
|
expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -809,4 +809,28 @@ RSpec.describe Upload do
|
||||||
expect { u.update!(dominant_color: "abcd") }.to raise_error(ActiveRecord::RecordInvalid)
|
expect { u.update!(dominant_color: "abcd") }.to raise_error(ActiveRecord::RecordInvalid)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
describe ".mark_invalid_s3_uploads_as_missing" do
|
||||||
|
it "should update all upload records with a `verification_status` of `invalid_etag` to `s3_file_missing`" do
|
||||||
|
upload_1 =
|
||||||
|
Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:invalid_etag])
|
||||||
|
|
||||||
|
upload_2 =
|
||||||
|
Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:invalid_etag])
|
||||||
|
|
||||||
|
upload_3 = Fabricate(:upload_s3, verification_status: Upload.verification_statuses[:verified])
|
||||||
|
|
||||||
|
Upload.mark_invalid_s3_uploads_as_missing
|
||||||
|
|
||||||
|
expect(upload_1.reload.verification_status).to eq(
|
||||||
|
Upload.verification_statuses[:s3_file_missing_confirmed],
|
||||||
|
)
|
||||||
|
|
||||||
|
expect(upload_2.reload.verification_status).to eq(
|
||||||
|
Upload.verification_statuses[:s3_file_missing_confirmed],
|
||||||
|
)
|
||||||
|
|
||||||
|
expect(upload_3.reload.verification_status).to eq(Upload.verification_statuses[:verified])
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue
Block a user