discourse/lib/tasks/s3.rake
David Taylor 1291fc1afe PERF: Correct should_skip? logic in s3:upload (#18862)
This task is supposed to skip uploading if the asset is already present in S3. However, when a bucket 'folder path' was configured, this logic was broken, so the assets were re-uploaded every time.

This commit fixes that logic to include the bucket 'folder path' in the check.
2022-11-17 14:17:45 +00:00

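The fix is easiest to see in isolation. Below is a minimal sketch, assuming a hypothetical bucket folder path of "my-folder": keys returned by listing the bucket carry that prefix, so the bare local asset path never matches until it is prefixed the same way.

require "set"

# Keys as an S3 listing returns them when a folder path is configured
existing = Set.new(["my-folder/assets/app.js"])

path = "assets/app.js"

# Broken check: the unprefixed path never matches, so the asset is
# re-uploaded on every run.
existing.include?(path) # => false

# Fixed check: prepend the folder path before the lookup.
existing.include?(File.join("my-folder", path)) # => true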

# frozen_string_literal: true

# Derive the S3 key for a brotli-compressed asset by inserting "br" before
# the file extension.
def brotli_s3_path(path)
  ext = File.extname(path)
  "#{path[0..-ext.length]}br#{ext}"
end

# Derive the S3 key for a gzip-compressed asset by inserting "gz" before
# the file extension.
def gzip_s3_path(path)
  ext = File.extname(path)
  "#{path[0..-ext.length]}gz#{ext}"
end
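
# For example (hypothetical filename):
#   brotli_s3_path("assets/app.js") # => "assets/app.br.js"
#   gzip_s3_path("assets/app.js")   # => "assets/app.gz.js"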

# Skip the upload when the asset already exists in the bucket. Keys
# returned by the listing include any configured bucket folder path, so
# the local path must carry the same prefix before the lookup (see the
# commit message above).
def should_skip?(path)
  return false if ENV['FORCE_S3_UPLOADS']
  @existing_assets ||= Set.new(helper.list("assets/").map(&:key))
  path = File.join(helper.s3_bucket_folder_path, path) if helper.s3_bucket_folder_path
  @existing_assets.include?(path)
end

# Upload a single file to S3 with long-lived caching headers, skipping it
# if the key is already present in the bucket.
def upload(path, remote_path, content_type, content_encoding = nil)
  options = {
    cache_control: 'max-age=31556952, public, immutable',
    content_type: content_type,
    acl: 'public-read'
  }

  if content_encoding
    options[:content_encoding] = content_encoding
  end

  if should_skip?(remote_path)
    puts "Skipping: #{remote_path}"
  else
    puts "Uploading: #{remote_path}"
    File.open(path) do |file|
      helper.upload(file, remote_path, options)
    end
  end

  File.delete(path) if (File.exist?(path) && ENV["DELETE_ASSETS_AFTER_S3_UPLOAD"])
end
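
# For example (hypothetical paths and content type), a tuple produced by
# the `assets` helper below yields a call like:
#   upload("/var/www/discourse/public/assets/app.br.js",
#          "assets/app.br.js", "application/javascript", "br")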

def use_db_s3_config
  ENV["USE_DB_S3_CONFIG"]
end

def helper
  @helper ||= S3Helper.build_from_config(use_db_s3_config: use_db_s3_config)
end

# Collect [local_path, s3_path, content_type, content_encoding] tuples for
# every asset in the sprockets manifest, plus any brotli/gzip variants and
# source maps that exist on disk.
def assets
  cached = Rails.application.assets&.cached
  manifest = Sprockets::Manifest.new(cached, Rails.root + 'public/assets', Rails.application.config.assets.manifest)

  results = []

  manifest.assets.each do |_, path|
    fullpath = (Rails.root + "public/assets/#{path}").to_s

    # Ignore files we can't find the mime type of, like yarn.lock
    if mime = MiniMime.lookup_by_filename(fullpath)
      content_type = mime.content_type
      asset_path = "assets/#{path}"
      results << [fullpath, asset_path, content_type]

      if File.exist?(fullpath + '.br')
        results << [fullpath + '.br', brotli_s3_path(asset_path), content_type, 'br']
      end

      if File.exist?(fullpath + '.gz')
        results << [fullpath + '.gz', gzip_s3_path(asset_path), content_type, 'gzip']
      end

      if File.exist?(fullpath + '.map')
        results << [fullpath + '.map', asset_path + '.map', 'application/json']
      end
    end
  end

  results
end

def asset_paths
  Set.new(assets.map { |_, asset_path| asset_path })
end

def ensure_s3_configured!
  unless GlobalSetting.use_s3? || use_db_s3_config
    STDERR.puts "ERROR: Ensure S3 is configured in config/discourse.conf or environment vars"
    exit 1
  end
end

# Re-apply the public-read ACL to every upload and optimized image stored
# on S3.
task 's3:correct_acl' => :environment do
  ensure_s3_configured!

  puts "ensuring public-read is set on every upload and optimized image"

  i = 0
  base_url = Discourse.store.absolute_base_url

  objects = Upload.pluck(:id, :url).map { |array| array << :upload }
  objects.concat(OptimizedImage.pluck(:id, :url).map { |array| array << :optimized_image })

  puts "#{objects.length} objects found"

  objects.each do |id, url, type|
    i += 1
    if !url.start_with?(base_url)
      puts "Skipping #{type} #{id} since it is not stored on s3, url is #{url}"
    else
      begin
        # Strip the base URL plus its trailing slash to recover the S3 key,
        # e.g. (hypothetical values) "//bucket.s3.amazonaws.com/original/1X/abc.png"
        # becomes "original/1X/abc.png".
        key = url[(base_url.length + 1)..-1]
        object = Discourse.store.s3_helper.object(key)
        object.acl.put(acl: "public-read")
      rescue => e
        puts "Skipping #{type} #{id} url is #{url} #{e}"
      end
    end
    if i % 100 == 0
      puts "#{i} done"
    end
  end
end

# Re-apply the standard cache-control header to every upload and optimized
# image stored on S3.
task 's3:correct_cachecontrol' => :environment do
  ensure_s3_configured!

  puts "ensuring cache-control is set on every upload and optimized image"

  i = 0
  base_url = Discourse.store.absolute_base_url

  cache_control = 'max-age=31556952, public, immutable'

  objects = Upload.pluck(:id, :url).map { |array| array << :upload }
  objects.concat(OptimizedImage.pluck(:id, :url).map { |array| array << :optimized_image })

  puts "#{objects.length} objects found"

  objects.each do |id, url, type|
    i += 1
    if !url.start_with?(base_url)
      puts "Skipping #{type} #{id} since it is not stored on s3, url is #{url}"
    else
      begin
        key = url[(base_url.length + 1)..-1]
        object = Discourse.store.s3_helper.object(key)
        # S3 metadata cannot be edited in place; copying the object onto
        # itself with metadata_directive: 'REPLACE' rewrites it.
        object.copy_from(
          copy_source: "#{object.bucket_name}/#{object.key}",
          acl: "public-read",
          cache_control: cache_control,
          content_type: object.content_type,
          content_disposition: object.content_disposition,
          metadata_directive: 'REPLACE'
        )
      rescue => e
        puts "Skipping #{type} #{id} url is #{url} #{e}"
      end
    end
    if i % 100 == 0
      puts "#{i} done"
    end
  end
end

task 's3:ensure_cors_rules' => :environment do
  ensure_s3_configured!

  puts "Installing CORS rules..."
  result = S3CorsRulesets.sync(use_db_s3_config: use_db_s3_config)

  if !result
    puts "skipping"
    next
  end

  puts "Assets rules status: #{result[:assets_rules_status]}."
  puts "Backup rules status: #{result[:backup_rules_status]}."
  puts "Direct upload rules status: #{result[:direct_upload_rules_status]}."
end

task 's3:upload_assets' => [:environment, 's3:ensure_cors_rules'] do
  assets.each do |asset|
    upload(*asset)
  end
end
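
# Typical invocation (assumed, standard rake usage):
#   bin/rake s3:upload_assets
# FORCE_S3_UPLOADS re-uploads assets that already exist in the bucket;
# DELETE_ASSETS_AFTER_S3_UPLOAD removes local copies once uploaded.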

# Tag assets that are no longer referenced by the manifest so that a
# lifecycle rule can purge them later.
task 's3:expire_missing_assets' => :environment do
  ensure_s3_configured!

  count = 0
  keep = 0

  in_manifest = asset_paths

  puts "Ensuring AWS assets are tagged correctly for removal"
  helper.list('assets/').each do |f|
    if !in_manifest.include?(f.key)
      helper.tag_file(f.key, old: true)
      count += 1
    else
      # ensure we do not delete this by mistake
      helper.tag_file(f.key, {})
      keep += 1
    end
  end

  puts "#{count} assets were flagged for removal in 10 days (#{keep} assets will be retained)"

  puts "Ensuring AWS rule exists for purging old assets"
  helper.update_lifecycle("delete_old_assets", 10, tag: { key: 'old', value: 'true' })
end