2019-04-30 08:27:42 +08:00
# frozen_string_literal: true
2024-05-24 10:54:06 +08:00
RSpec . describe S3Inventory do
2024-06-10 13:16:00 +08:00
# Inventory instance shared by the examples. It is built for :upload, and its
# bucket setting carries a path after the bucket name.
let(:inventory) do
  S3Inventory.new(:upload, s3_inventory_bucket: "some-inventory-bucket/inventoried-bucket/prefix")
end
2019-02-01 12:40:48 +08:00
# Absolute path of the CSV fixture that the examples read rows from and hand
# to the inventory (with a ".gz" suffix) as its file list.
let(:csv_filename) { "#{Rails.root}/spec/fixtures/csv/s3_inventory.csv" }
# Shared setup for every example in this file.
before do
  # Presumably switches the helper's S3 client to stubbed responses so no
  # real requests are made - confirm against S3Helper.
  inventory.s3_helper.stub_client_responses!

  # Replace cleanup! with a stub so the examples never execute it.
  inventory.stubs(:cleanup!)
end
# When the listing yields no objects, the run reports the failure on stdout.
it "should raise error if an inventory file is not found" do
  # NOTE(review): the restore-date examples in this file stub :list_objects_v2
  # instead; confirm :list_objects is the operation the inventory listing uses.
  inventory.s3_client.stub_responses(:list_objects, contents: [])

  output = capture_stdout { inventory.backfill_etags_and_list_missing }

  expect(output).to eq("Failed to list inventory from S3\n")
end
2024-07-09 12:03:43 +08:00
# The s3_options passed to the constructor must show up in the configuration
# of the S3 client owned by the inventory's helper.
it "should forward custom s3 options to the S3Helper when initializing" do
  # Named differently from the shared `inventory` so the example does not
  # shadow it.
  custom_inventory =
    S3Inventory.new(
      :upload,
      s3_inventory_bucket: "some-inventory-bucket",
      s3_options: {
        region: "us-west-1",
      },
    )

  custom_inventory.s3_helper.stub_client_responses!

  expect(custom_inventory.s3_helper.s3_client.config.region).to eq("us-west-1")
end
2020-08-11 12:43:51 +08:00
describe " verifying uploads " do
# Seeds the uploads the "verifying uploads" examples inspect and pins the
# inventory's file list and date.
before do
  # Freeze the clock so the relative updated_at values below stay stable.
  freeze_time

  # One upload per fixture row whose key contains "default"; rows for any
  # other key are skipped.
  CSV.foreach(csv_filename, headers: false) do |row|
    next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")

    Fabricate(
      :upload,
      etag: row[S3Inventory::CSV_ETAG_INDEX],
      url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
      updated_at: 2.days.ago,
    )
  end

  # Extra uploads that are not built from the fixture rows.
  @upload_1 = Fabricate(:upload, etag: "ETag", updated_at: 1.day.ago)
  @upload_2 = Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
  @no_etag = Fabricate(:upload, updated_at: 2.days.ago)
  @upload_3 =
    Fabricate(
      :upload,
      etag: "ETag3",
      updated_at: 2.days.ago,
      verification_status: Upload.verification_statuses[:s3_file_missing_confirmed],
    )

  # Strict mocks: the run must ask for the file list exactly three times and
  # for the inventory date exactly twice.
  inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
  inventory.expects(:inventory_date).times(2).returns(Time.now)
end
2019-03-13 17:39:07 +08:00
2020-08-11 12:43:51 +08:00
# The run prints the url of each missing upload, then a summary line, and
# records the missing count in the stats store.
it "should display missing uploads correctly" do
  output = capture_stdout { inventory.backfill_etags_and_list_missing }

  expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n")
  expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end
2019-02-01 12:40:48 +08:00
2020-08-13 07:30:28 +08:00
# Uploads whose etag or url no longer agrees with the inventory are reported
# separately and counted in the missing total.
it "should detect when a url match exists with a different etag" do
  # Same url as the inventory row, different etag.
  differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
  differing_etag.update_columns(etag: "somethingelse")

  # Same etag as the inventory row, different url.
  differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
  differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))

  output = capture_stdout { inventory.backfill_etags_and_list_missing }

  # NOTE(review): every heredoc line is kept flush; if the real output indents
  # the two hint lines, indent them here to match.
  expect(output).to eq(<<~TEXT)
    #{differing_etag.url} has different etag
    #{differing_url.url} has different url
    #{@upload_1.url}
    #{@no_etag.url}
    4 of 5 uploads are missing
    1 of these are caused by differing etags
    Null the etag column and re-run for automatic backfill
    1 of these are caused by differing urls
    Empty the url column and re-run for automatic backfill
  TEXT

  expect(Discourse.stats.get("missing_s3_uploads")).to eq(4)
end
2020-08-11 12:43:51 +08:00
# Checks how the run redistributes uploads across verification statuses.
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
  differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
  differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))

  # Baseline before the run.
  expect(
    Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
  ).to eq(12)

  # Only the side effects matter here; the captured output is discarded.
  capture_stdout { inventory.backfill_etags_and_list_missing }

  expect(
    Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
  ).to eq(2)
  expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
  expect(Upload.with_invalid_url_verification_status.count).to eq(1)

  expect(
    Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
  ).to eq(7)
end
2020-08-11 12:43:51 +08:00
# The run must leave updated_at untouched on the uploads it processes.
it "does not affect the updated_at date of uploads" do
  upload_1_updated = @upload_1.updated_at
  upload_2_updated = @upload_2.updated_at
  no_etag_updated = @no_etag.updated_at

  # Output is irrelevant to this example, so it is captured and dropped.
  capture_stdout { inventory.backfill_etags_and_list_missing }

  expect(@upload_1.reload.updated_at).to eq_time(upload_1_updated)
  expect(@upload_2.reload.updated_at).to eq_time(upload_2_updated)
  expect(@no_etag.reload.updated_at).to eq_time(no_etag_updated)
end
2019-02-01 12:40:48 +08:00
end
2019-02-14 07:48:06 +08:00
# Uploads created without an etag receive the etag listed for their url.
it "should backfill etags to uploads table correctly" do
  # [url, expected etag] pairs, listed in the url order the final query
  # returns.
  files = [
    [
      "#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0184537a4f419224404d013414e913a4f56018f2.jpg",
      "defcaac0b4aca535c284e95f30d608d0",
    ],
    [
      "#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png",
      "25c02eaceef4cb779fc17030d33f7f06",
    ],
  ]

  # Fabricate with the url only, leaving the etag to be backfilled.
  files.each { |url, _etag| Fabricate(:upload, url: url) }

  inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)

  # stdout is captured only to keep the spec output quiet.
  capture_stdout do
    expect { inventory.backfill_etags_and_list_missing }.to change {
      Upload.where(etag: nil).count
    }.by(-2)
  end

  expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files)
end
2020-07-29 08:49:45 +08:00
2024-05-24 10:54:06 +08:00
context " when site was restored from a backup " do
# Record a restore at the frozen current time, so the examples can place
# inventory files relative to it.
before do
  freeze_time

  restored_at = Time.now
  BackupMetadata.update_last_restore_date(restored_at)
end
# An inventory file stamped exactly WAIT_AFTER_RESTORE_DAYS after the restore
# is processed: the object is fetched once.
#
# The description interpolates the bare constant. Interpolating `.days`
# renders the duration as seconds, which put a wrong day count in the name.
it "should run if inventory files are at least #{described_class::WAIT_AFTER_RESTORE_DAYS} days older than the last restore date" do
  inventory.s3_client.stub_responses(
    :list_objects_v2,
    {
      contents: [
        {
          key: "symlink.txt",
          last_modified:
            BackupMetadata.last_restore_date + described_class::WAIT_AFTER_RESTORE_DAYS.days,
          size: 1,
        },
      ],
    },
  )

  inventory.s3_client.expects(:get_object).once

  capture_stdout { inventory.backfill_etags_and_list_missing }
end
2024-07-19 14:22:58 +08:00
# An inventory file stamped one day after the restore is skipped: nothing is
# fetched and the missing-uploads stat is reset.
#
# The description interpolates the bare constant. Interpolating `.days`
# renders the duration as seconds, which put a wrong day count in the name.
it "should not run if inventory files are not at least #{described_class::WAIT_AFTER_RESTORE_DAYS} days older than the last restore date and reset stats count" do
  # Start from a non-zero stat so the reset to 0 is observable.
  Discourse.stats.set("missing_s3_uploads", 2)

  inventory.s3_client.stub_responses(
    :list_objects_v2,
    {
      contents: [
        {
          key: "symlink.txt",
          last_modified: BackupMetadata.last_restore_date + 1.day,
          size: 1,
        },
      ],
    },
  )

  inventory.s3_client.expects(:get_object).never

  capture_stdout { inventory.backfill_etags_and_list_missing }

  expect(Discourse.stats.get("missing_s3_uploads")).to eq(0)
end
end
2020-07-29 08:49:45 +08:00
# An inventory built with a preloaded file and date reports the same missing
# uploads, reading from the open file handle it was given.
it "should work when passed preloaded data" do
  freeze_time

  # One upload per fixture row whose key contains "default".
  CSV.foreach(csv_filename, headers: false) do |row|
    next unless row[S3Inventory::CSV_KEY_INDEX].include?("default")

    Fabricate(
      :upload,
      url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
      etag: row[S3Inventory::CSV_ETAG_INDEX],
      updated_at: 2.days.ago,
    )
  end

  upload = Fabricate(:upload, etag: "ETag", updated_at: 1.day.ago)
  Fabricate(:upload, etag: "ETag2", updated_at: Time.now)
  no_etag = Fabricate(:upload, updated_at: 2.days.ago)

  output =
    capture_stdout do
      # Block form closes the fixture once the run has finished.
      File.open(csv_filename) do |f|
        preloaded_inventory =
          S3Inventory.new(
            :upload,
            s3_inventory_bucket: "some-inventory-bucket",
            preloaded_inventory_file: f,
            preloaded_inventory_date: Time.now,
          )

        preloaded_inventory.backfill_etags_and_list_missing
      end
    end

  expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n")
  expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end
2019-02-01 12:40:48 +08:00
end