PERF: Speed up S3 inventory updates (#19110)

The UPDATE statement could lock the `uploads` table for a very long time
when the `verification_status` of many uploads changed. Splitting it into
two simpler UPDATE statements (one that marks uploads as verified and one
that marks them as having an invalid etag) solves that problem.

Also, this change ensures that only the needed data from the inventory
gets inserted into the temporary table: inventory rows whose key does not
belong to the type being checked are now skipped. For example, there's no
need to have records for optimized images in that table when the
`uploads` table gets updated.
This commit is contained in:
Gerhard Schlager 2022-11-20 21:52:30 +01:00 committed by GitHub
parent 723f8239df
commit a597ef7131
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -48,7 +48,10 @@ class S3Inventory
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
for_each_inventory_row do |row| for_each_inventory_row do |row|
key = row[CSV_KEY_INDEX] key = row[CSV_KEY_INDEX]
next if Rails.configuration.multisite && key.exclude?(multisite_prefix) next if Rails.configuration.multisite && key.exclude?(multisite_prefix)
next if key.exclude?("/#{type}/")
url = File.join(Discourse.store.absolute_base_url, key) url = File.join(Discourse.store.absolute_base_url, key)
connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n") connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n")
end end
@ -76,29 +79,38 @@ class S3Inventory
if model == Upload if model == Upload
sql_params = { sql_params = {
inventory_date: inventory_date, inventory_date: inventory_date,
unchecked: Upload.verification_statuses[:unchecked],
invalid_etag: Upload.verification_statuses[:invalid_etag], invalid_etag: Upload.verification_statuses[:invalid_etag],
verified: Upload.verification_statuses[:verified] verified: Upload.verification_statuses[:verified],
seeded_id_threshold: model::SEEDED_ID_THRESHOLD
} }
DB.exec(<<~SQL, sql_params) DB.exec(<<~SQL, sql_params)
UPDATE #{model.table_name} UPDATE #{model.table_name}
SET verification_status = CASE WHEN table_name_alias.etag IS NULL SET verification_status = :verified
THEN :invalid_etag WHERE etag IS NOT NULL
ELSE :verified AND verification_status <> :verified
END AND updated_at < :inventory_date
FROM #{model.table_name} AS model_table AND id > :seeded_id_threshold
LEFT JOIN #{table_name} AS table_name_alias ON AND EXISTS
model_table.etag = table_name_alias.etag (
WHERE model_table.id = #{model.table_name}.id SELECT 1
AND model_table.updated_at < :inventory_date FROM #{table_name}
AND ( WHERE #{table_name}.etag = #{model.table_name}.etag
model_table.verification_status = :unchecked OR )
model_table.verification_status <> CASE WHEN table_name_alias.etag IS NULL SQL
THEN :invalid_etag
ELSE :verified DB.exec(<<~SQL, sql_params)
END UPDATE #{model.table_name}
) SET verification_status = :invalid_etag
AND model_table.id > #{model::SEEDED_ID_THRESHOLD} WHERE verification_status <> :invalid_etag
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND NOT EXISTS
(
SELECT 1
FROM #{table_name}
WHERE #{table_name}.etag = #{model.table_name}.etag
)
SQL SQL
end end