DEV: Detect when s3 inventory failure is caused by etag difference (#10427)

This commit is contained in:
David Taylor 2020-08-13 00:30:28 +01:00 committed by GitHub
parent 451b9b245f
commit bd0a7553c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 2 deletions

View File

@ -67,6 +67,10 @@ class S3Inventory
.joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag") .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
.where("#{table_name}.etag IS NULL") .where("#{table_name}.etag IS NULL")
exists_with_different_etag = missing_uploads
.joins("LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url")
.where("inventory2.etag IS NOT NULL").pluck(:id)
# marking as verified/not verified # marking as verified/not verified
id_threshold_clause = model == Upload ? " AND model_table.id > #{model::SEEDED_ID_THRESHOLD}" : "" id_threshold_clause = model == Upload ? " AND model_table.id > #{model::SEEDED_ID_THRESHOLD}" : ""
DB.exec(<<~SQL, inventory_date DB.exec(<<~SQL, inventory_date
@ -86,10 +90,18 @@ class S3Inventory
if (missing_count = missing_uploads.count) > 0 if (missing_count = missing_uploads.count) > 0
missing_uploads.select(:id, :url).find_each do |upload| missing_uploads.select(:id, :url).find_each do |upload|
log upload.url if exists_with_different_etag.include?(upload.id)
log "#{upload.url} has different etag"
else
log upload.url
end
end end
log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing" log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
if exists_with_different_etag.present?
log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill"
end
end end
Discourse.stats.set("missing_s3_#{model.table_name}", missing_count) Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)

View File

@ -64,7 +64,10 @@ describe "S3Inventory" do
CSV.foreach(csv_filename, headers: false) do |row| CSV.foreach(csv_filename, headers: false) do |row|
next unless row[S3Inventory::CSV_KEY_INDEX].include?("default") next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")
Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago) Fabricate(:upload,
etag: row[S3Inventory::CSV_ETAG_INDEX],
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
updated_at: 2.days.ago)
end end
@upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) @upload1 = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)
@ -84,6 +87,25 @@ describe "S3Inventory" do
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end end
it "should detect when a url match exists with a different etag" do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse")
output = capture_stdout do
inventory.backfill_etags_and_list_missing
end
expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag
#{@upload1.url}
#{@no_etag.url}
3 of 5 uploads are missing
1 of these are caused by differing etags
Null the etag column and re-run for automatic backfill
TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
expect(Upload.where(verified: nil).count).to eq(12) expect(Upload.where(verified: nil).count).to eq(12)
output = capture_stdout do output = capture_stdout do