2019-02-01 12:40:48 +08:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
require "aws-sdk-s3"
require "csv"
require "set"
|
|
|
|
|
|
|
|
class S3Inventory
|
2024-06-10 13:16:00 +08:00
|
|
|
# type           - "original" or "optimized", derived from the type passed to #initialize.
# inventory_date - the preloaded date, or the symlink file's last_modified minus INVENTORY_LAG.
# s3_helper      - the S3Helper built in #initialize for the inventory bucket.
attr_reader :type, :inventory_date, :s3_helper
|
2019-02-01 12:40:48 +08:00
|
|
|
|
2024-05-24 10:54:06 +08:00
|
|
|
# Column of an inventory CSV row that holds the object key.
CSV_KEY_INDEX = 1
|
|
|
|
# Column of an inventory CSV row that holds the object etag.
CSV_ETAG_INDEX = 2
|
|
|
|
# Directory name used under Rails.root/tmp for downloaded inventory files (see #tmp_directory).
INVENTORY_PREFIX = "inventory"
|
|
|
|
# Subtracted from the symlink file's last_modified to obtain inventory_date; records
# updated after inventory_date are not compared against the inventory.
INVENTORY_LAG = 2.days
|
|
|
|
# Number of days (plain Integer; `.days` is applied at the point of use) that the newest
# symlink file must post-date BackupMetadata.last_restore_date before it is used.
WAIT_AFTER_RESTORE_DAYS = 2
|
2019-02-01 12:40:48 +08:00
|
|
|
|
2024-06-10 13:16:00 +08:00
|
|
|
# Builds an inventory checker for one kind of stored file.
#
# @param type [Symbol] :upload or :optimized
# @param s3_inventory_bucket passed to S3Helper.new as its first argument
# @param preloaded_inventory_file inventory CSV that was fetched beforehand; only
#   honoured when preloaded_inventory_date is given as well
# @param preloaded_inventory_date value to use as inventory_date for the preloaded file
# @param s3_options passed to S3Helper.new as its third argument
# @raise [RuntimeError] when type is neither :upload nor :optimized
def initialize(
  type,
  s3_inventory_bucket:,
  preloaded_inventory_file: nil,
  preloaded_inventory_date: nil,
  s3_options: {}
)
  @s3_helper = S3Helper.new(s3_inventory_bucket, "", s3_options)

  # Preloaded data is only accepted as a pair; it spares a later fetch from S3.
  if preloaded_inventory_file && preloaded_inventory_date
    @preloaded_inventory_file = preloaded_inventory_file
    @inventory_date = preloaded_inventory_date
  end

  case type
  when :upload
    @type = "original"
    @model = Upload
    @scope = @model.by_users.without_s3_file_missing_confirmed_verification_status
  when :optimized
    @type = "optimized"
    @model = OptimizedImage
    @scope = @model
  else
    raise "Invalid type: #{type}"
  end
end
|
|
|
|
|
2019-02-20 00:24:35 +08:00
|
|
|
# Compares the model's table against the S3 inventory.
#
# Steps, all inside one DistributedMutex and one database transaction:
#   1. copy every relevant inventory row (url, etag) into a temp table,
#   2. backfill `etag` on rows of the model's table where it is NULL and the url matches,
#   3. collect records older than inventory_date that have no inventory row with
#      the same etag and url,
#   4. for Upload only, update verification_status (verified / invalid_etag / invalid_url),
#   5. log every missing record and store the missing count via
#      set_missing_s3_discourse_stats.
#
# The temp table is dropped and downloaded files are cleaned up in `ensure` blocks.
#
# Returns nil right after logging an error when nothing was preloaded and no
# inventory files could be listed; otherwise returns whatever
# DistributedMutex.synchronize returns.
def backfill_etags_and_list_missing
  if !@preloaded_inventory_file && files.blank?
    error("Failed to list inventory from S3")
    return
  end

  DistributedMutex.synchronize("s3_inventory_list_missing_#{type}", validity: 30.minutes) do
    begin
      download_and_decompress_files if !@preloaded_inventory_file

      multisite_prefix = Discourse.store.upload_path

      ActiveRecord::Base.transaction do
        begin
          connection.exec(
            "CREATE TEMP TABLE #{tmp_table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
          )

          connection.copy_data("COPY #{tmp_table_name} FROM STDIN CSV") do
            for_each_inventory_row do |row|
              key = row[CSV_KEY_INDEX]

              # Skip keys belonging to other sites or to the other file type.
              next if Rails.configuration.multisite && key.exclude?(multisite_prefix)
              next if key.exclude?("#{type}/")

              url = File.join(Discourse.store.absolute_base_url, key)
              connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n")
            end
          end

          table_name = @model.table_name

          # backfilling etags
          connection.async_exec(
            "UPDATE #{table_name}
            SET etag = #{tmp_table_name}.etag
            FROM #{tmp_table_name}
            WHERE #{table_name}.etag IS NULL AND
              #{table_name}.url = #{tmp_table_name}.url",
          )

          # Only records older than the inventory can be expected to appear in it.
          uploads = @scope.where("updated_at < ?", inventory_date)

          missing_uploads =
            uploads.joins(
              "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
            ).where(
              "#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url",
            )

          # Sets, because each id collection is probed once per missing record below.
          exists_with_different_etag =
            missing_uploads
              .joins(
                "LEFT JOIN #{tmp_table_name} inventory2 ON inventory2.url = #{table_name}.url",
              )
              .where("inventory2.etag IS NOT NULL")
              .pluck(:id)
              .to_set

          exists_with_different_url =
            missing_uploads
              .joins(
                "LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag",
              )
              .where("inventory3.url != #{table_name}.url")
              .pluck(:id)
              .to_set

          # marking as verified/not verified
          if @model == Upload
            sql_params = {
              inventory_date: inventory_date,
              invalid_etag: Upload.verification_statuses[:invalid_etag],
              invalid_url: Upload.verification_statuses[:invalid_url],
              s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
              verified: Upload.verification_statuses[:verified],
              seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
            }

            # The etag is present in the inventory: verified.
            DB.exec(<<~SQL, sql_params)
              UPDATE #{table_name}
              SET verification_status = :verified
              WHERE etag IS NOT NULL
              AND verification_status <> :verified
              AND verification_status <> :s3_file_missing_confirmed
              AND updated_at < :inventory_date
              AND id > :seeded_id_threshold
              AND EXISTS
              (
                SELECT 1
                FROM #{tmp_table_name}
                WHERE #{tmp_table_name}.etag = #{table_name}.etag
              )
            SQL

            # No inventory row carries this etag: invalid_etag.
            DB.exec(<<~SQL, sql_params)
              UPDATE #{table_name}
              SET verification_status = :invalid_etag
              WHERE verification_status <> :invalid_etag
              AND verification_status <> :s3_file_missing_confirmed
              AND updated_at < :inventory_date
              AND id > :seeded_id_threshold
              AND NOT EXISTS
              (
                SELECT 1
                FROM #{tmp_table_name}
                WHERE #{tmp_table_name}.etag = #{table_name}.etag
              )
            SQL

            # No inventory row carries this url (and not already invalid_etag): invalid_url.
            DB.exec(<<~SQL, sql_params)
              UPDATE #{table_name}
              SET verification_status = :invalid_url
              WHERE verification_status <> :invalid_url
              AND verification_status <> :invalid_etag
              AND verification_status <> :s3_file_missing_confirmed
              AND updated_at < :inventory_date
              AND id > :seeded_id_threshold
              AND NOT EXISTS
              (
                SELECT 1
                FROM #{tmp_table_name}
                WHERE #{tmp_table_name}.url = #{table_name}.url
              )
            SQL
          end

          if (missing_count = missing_uploads.count) > 0
            missing_uploads
              .select(:id, :url)
              .find_each do |upload|
                if exists_with_different_etag.include?(upload.id)
                  log "#{upload.url} has different etag"
                elsif exists_with_different_url.include?(upload.id)
                  log "#{upload.url} has different url"
                else
                  log upload.url
                end
              end

            log "#{missing_count} of #{uploads.count} #{@scope.name.underscore.pluralize} are missing"
            if exists_with_different_etag.present?
              log "#{exists_with_different_etag.count} of these are caused by differing etags"
              log "Null the etag column and re-run for automatic backfill"
            end
            if exists_with_different_url.present?
              log "#{exists_with_different_url.count} of these are caused by differing urls"
              log "Empty the url column and re-run for automatic backfill"
            end
          end

          set_missing_s3_discourse_stats(missing_count)
        ensure
          connection.exec("DROP TABLE #{tmp_table_name}") unless connection.nil?
        end
      end
    ensure
      cleanup!
    end
  end
end
|
|
|
|
|
2020-07-29 08:49:45 +08:00
|
|
|
# Yields every parsed CSV row of the inventory to the given block.
#
# Rows come from the preloaded inventory file when there is one; otherwise from
# each entry of `files`, read from its filename with the last three characters
# removed.
def for_each_inventory_row
  if @preloaded_inventory_file
    return CSV.foreach(@preloaded_inventory_file) { |row| yield(row) }
  end

  files.each do |file|
    csv_path = file[:filename][0...-3]
    CSV.foreach(csv_path) { |row| yield(row) }
  end
end
|
|
|
|
|
2019-08-13 13:59:31 +08:00
|
|
|
# Downloads one inventory file from S3 unless it is already on disk.
#
# @param file [Hash] entry with :key (the S3 key) and :filename (the local target path)
# @return nil when file[:filename] already exists, otherwise the result of
#   @s3_helper.download_file
def download_inventory_file_to_tmp_directory(file)
  return if File.exist?(file[:filename])

  log "Downloading inventory file '#{file[:key]}' to tmp directory..."
  failure_message = "Failed to download inventory file '#{file[:key]}' to tmp directory."

  @s3_helper.download_file(file[:key], file[:filename], failure_message)
end
|
|
|
|
|
2019-08-13 13:59:31 +08:00
|
|
|
# Runs `gzip --decompress` on a downloaded inventory file, with tmp_directory as
# the working directory.
#
# @param file [Hash] entry whose :filename is the path of the gzip file
def decompress_inventory_file(file)
  gz_path = file[:filename]
  log "Decompressing inventory file '#{gz_path}', this may take a while..."

  Discourse::Utils.execute_command(
    "gzip",
    "--decompress",
    gz_path,
    failure_message: "Failed to decompress inventory file '#{gz_path}'.",
    chdir: tmp_directory,
  )
end
|
|
|
|
|
2020-07-29 08:49:45 +08:00
|
|
|
# Splits the inventory into one CSV Tempfile per multisite database.
#
# A row is routed by the path segment following "uploads/" in its key; rows
# without such a segment, or naming an unknown database, are dropped.
#
# @return [Hash] database name => rewound Tempfile holding that site's rows
def prepare_for_all_sites
  db_names = RailsMultisite::ConnectionManagement.all_dbs
  db_files =
    db_names.each_with_object({}) do |db, by_db|
      by_db[db] = Tempfile.new("#{db}-inventory.csv")
    end

  download_and_decompress_files
  for_each_inventory_row do |row|
    row_db = row[CSV_KEY_INDEX].match(%r{uploads/([^/]+)/})&.[](1)
    next unless row_db

    site_file = db_files[row_db]
    site_file.write(row.to_csv) if site_file
  end

  # Rewind so callers read each file from its start.
  db_names.each { |db| db_files[db].rewind }

  db_files
ensure
  cleanup!
end
|
|
|
|
|
2024-06-10 13:16:00 +08:00
|
|
|
# Returns the S3 client exposed by the S3Helper built in #initialize.
def s3_client
  s3_helper.s3_client
end
|
|
|
|
|
2019-02-01 12:40:48 +08:00
|
|
|
private
|
|
|
|
|
2019-08-13 13:59:31 +08:00
|
|
|
# Deletes, for every entry of `files`, both the downloaded file and the path
# obtained by dropping its last three characters, when they exist.
# Does nothing when the inventory was preloaded.
def cleanup!
  return if @preloaded_inventory_file

  files.each do |file|
    gz_path = file[:filename]
    csv_path = gz_path[0...-3]

    File.delete(gz_path) if File.exist?(gz_path)
    File.delete(csv_path) if File.exist?(csv_path)
  end
end
|
|
|
|
|
2019-05-04 03:46:20 +08:00
|
|
|
# Raw database connection taken from ActiveRecord::Base.connection, fetched on
# first use and reused afterwards.
def connection
  return @connection if @connection

  @connection = ActiveRecord::Base.connection.raw_connection
end
|
|
|
|
|
2024-05-30 08:37:38 +08:00
|
|
|
# Name of the temporary table that receives the inventory rows: the value of
# `type` followed by "_inventory".
def tmp_table_name
  [type, "inventory"].join("_")
end
|
|
|
|
|
2019-02-14 06:11:52 +08:00
|
|
|
# Inventory files referenced by the newest symlink file.
#
# @return [nil] when the inventory was preloaded
# @return [Array<Hash>] entries of the form { key:, filename: }; empty when no
#   usable symlink file was found. The result, including an empty one, is
#   computed once per instance.
def files
  return if @preloaded_inventory_file

  @files ||= load_inventory_files
end

# Finds the most recently modified symlink file, downloads it to tmp_directory
# and turns each of its lines into a { key:, filename: } entry. Also sets
# @inventory_date. Returns [] when there is no symlink file, when the newest one
# is too close to the last backup restore, or when the download left no file.
def load_inventory_files
  symlink_file = unsorted_files.max_by { |file| file.last_modified.to_i }
  return [] if symlink_file.blank?

  # An inventory taken less than WAIT_AFTER_RESTORE_DAYS after a restore is not
  # used; the missing-files stat is reset instead.
  if BackupMetadata.last_restore_date.present? &&
       (symlink_file.last_modified - WAIT_AFTER_RESTORE_DAYS.days) <
         BackupMetadata.last_restore_date
    set_missing_s3_discourse_stats(0)
    return []
  end

  @inventory_date = symlink_file.last_modified - INVENTORY_LAG

  log "Downloading symlink file to tmp directory..."
  failure_message = "Failed to download symlink file to tmp directory."
  filename = File.join(tmp_directory, File.basename(symlink_file.key))

  @s3_helper.download_file(symlink_file.key, filename, failure_message)

  return [] if !File.exist?(filename)

  File
    .readlines(filename)
    .map do |key|
      # Each line is an s3:// URL; strip the bucket prefix and the newline.
      key = key.sub("s3://#{bucket_name}/", "").sub("\n", "")
      { key: key, filename: File.join(tmp_directory, File.basename(key)) }
    end
end
|
|
|
|
|
2020-07-29 08:49:45 +08:00
|
|
|
# Downloads and decompresses every entry of `files` whose decompressed path (the
# filename minus its last three characters) is not on disk yet.
def download_and_decompress_files
  files.each do |file|
    decompressed_path = file[:filename][0...-3]

    unless File.exist?(decompressed_path)
      download_inventory_file_to_tmp_directory(file)
      decompress_inventory_file(file)
    end
  end
end
|
|
|
|
|
2019-02-14 06:11:52 +08:00
|
|
|
# Directory Rails.root/tmp/INVENTORY_PREFIX/<current multisite db>, created on
# first use and remembered afterwards.
def tmp_directory
  return @tmp_directory if @tmp_directory

  current_db = RailsMultisite::ConnectionManagement.current_db
  path = File.join(Rails.root, "tmp", INVENTORY_PREFIX, current_db)
  FileUtils.mkdir_p(path)

  @tmp_directory = path
end
|
|
|
|
|
2019-02-01 12:40:48 +08:00
|
|
|
# Bucket name as reported by the S3Helper.
def bucket_name
  s3_helper.s3_bucket_name
end
|
|
|
|
|
|
|
|
# Folder path inside the bucket as reported by the S3Helper.
def bucket_folder_path
  s3_helper.s3_bucket_folder_path
end
|
|
|
|
|
|
|
|
# Lists the objects under "<bucket_folder_path>/hive" and keeps those whose key
# ends in "symlink.txt" (case-insensitive), in listing order.
#
# @return [Array] matching objects; [] after logging when S3 raises an
#   Aws::Errors::ServiceError
def unsorted_files
  symlink_objects = []
  hive_prefix = File.join(bucket_folder_path, "hive")

  @s3_helper.list(hive_prefix).each do |object|
    symlink_objects << object if object.key.match?(/symlink\.txt\z/i)
  end

  symlink_objects
rescue Aws::Errors::ServiceError => e
  log("Failed to list inventory from S3", e)
  []
end
|
|
|
|
|
|
|
|
# Prints the message to stdout; when an exception is given, also writes the
# exception and its backtrace (if any) to Rails.logger at error level.
#
# @param message [String] text to print
# @param ex [Exception, nil] optional exception to record
def log(message, ex = nil)
  puts(message)
  return unless ex

  backtrace_lines = ex.backtrace || []
  Rails.logger.error("#{ex}\n" + backtrace_lines.join("\n"))
end
|
|
|
|
|
|
|
|
# Logs the message together with a StandardError built from it, so that it is
# also sent to the Rails logger by #log.
def error(message)
  exception = StandardError.new(message)
  log(message, exception)
end
|
2024-07-19 14:22:58 +08:00
|
|
|
|
|
|
|
# Stores the count in Discourse.stats under "missing_s3_<model table name>".
#
# @param count number of records found missing from the inventory
def set_missing_s3_discourse_stats(count)
  stat_key = "missing_s3_#{@model.table_name}"
  Discourse.stats.set(stat_key, count)
end
|
2019-02-01 12:40:48 +08:00
|
|
|
end
|