discourse/lib/tasks/uploads.rake
2023-01-20 12:52:49 -06:00

1290 lines
39 KiB
Ruby

# frozen_string_literal: true
require "db_helper"
require "digest/sha1"
require "base62"
################################################################################
# gather #
################################################################################
require "rake_helpers"
task "uploads:gather" => :environment do
ENV["RAILS_DB"] ? gather_uploads : gather_uploads_for_all_sites
end
def gather_uploads_for_all_sites
RailsMultisite::ConnectionManagement.each_connection { gather_uploads }
end
def file_exists?(path)
File.exist?(path) && File.size(path) > 0
rescue StandardError
false
end
def gather_uploads
public_directory = "#{Rails.root}/public"
current_db = RailsMultisite::ConnectionManagement.current_db
puts "", "Gathering uploads for '#{current_db}'...", ""
Upload
.where("url ~ '^\/uploads\/'")
.where("url !~ ?", "^\/uploads\/#{current_db}")
.find_each do |upload|
begin
old_db = upload.url[%r{\A/uploads/([^/]+)/}, 1]
from = upload.url.dup
to = upload.url.sub("/uploads/#{old_db}/", "/uploads/#{current_db}/")
source = "#{public_directory}#{from}"
destination = "#{public_directory}#{to}"
# create destination directory & copy file unless it already exists
unless file_exists?(destination)
`mkdir -p '#{File.dirname(destination)}'`
`cp --link '#{source}' '#{destination}'`
end
# ensure file has been successfully copied over
raise unless file_exists?(destination)
# remap links in db
DbHelper.remap(from, to)
rescue StandardError
putc "!"
else
putc "."
end
end
puts "", "Done!"
end
################################################################################
# backfill_shas #
################################################################################
task "uploads:backfill_shas" => :environment do
RailsMultisite::ConnectionManagement.each_connection do |db|
puts "Backfilling #{db}..."
Upload
.where(sha1: nil)
.find_each do |u|
begin
path = Discourse.store.path_for(u)
sha1 = Upload.generate_digest(path)
u.sha1 = u.secure? ? SecureRandom.hex(20) : sha1
u.original_sha1 = u.secure? ? sha1 : nil
u.save!
putc "."
rescue => e
puts "Skipping #{u.original_filename} (#{u.url}) #{e.message}"
end
end
end
puts "", "Done"
end
################################################################################
# migrate_to_s3 #
################################################################################
task "uploads:migrate_to_s3" => :environment do
STDOUT.puts(
"Please note that migrating to S3 is currently not reversible! \n[CTRL+c] to cancel, [ENTER] to continue",
)
STDIN.gets
ENV["RAILS_DB"] ? migrate_to_s3 : migrate_to_s3_all_sites
end
def migrate_to_s3_all_sites
RailsMultisite::ConnectionManagement.each_connection do
begin
migrate_to_s3
rescue RuntimeError => e
if ENV["SKIP_FAILED"]
puts e
else
raise e unless ENV["SKIP_FAILED"]
end
end
end
end
def create_migration
FileStore::ToS3Migration.new(
s3_options: FileStore::ToS3Migration.s3_options_from_env,
dry_run: !!ENV["DRY_RUN"],
migrate_to_multisite: !!ENV["MIGRATE_TO_MULTISITE"],
skip_etag_verify: !!ENV["SKIP_ETAG_VERIFY"],
)
end
def migrate_to_s3
create_migration.migrate
end
task "uploads:s3_migration_status" => :environment do
success = true
RailsMultisite::ConnectionManagement.each_connection do
success &&= create_migration.migration_successful?
end
queued_jobs = Sidekiq::Stats.new.queues.sum { |_, x| x }
if queued_jobs > 50
puts "WARNING: There are #{queued_jobs} jobs queued! Wait till Sidekiq clears backlog prior to migrating site to a new host"
exit 1
end
if !success
puts "Site is not ready for migration"
exit 1
end
puts "All sites appear to have uploads in order!"
end
################################################################################
# clean_up #
################################################################################
task "uploads:clean_up" => :environment do
ENV["RAILS_DB"] ? clean_up_uploads : clean_up_uploads_all_sites
end
def clean_up_uploads_all_sites
RailsMultisite::ConnectionManagement.each_connection { clean_up_uploads }
end
def clean_up_uploads
db = RailsMultisite::ConnectionManagement.current_db
puts "Cleaning up uploads and thumbnails for '#{db}'..."
if Discourse.store.external?
puts "This task only works for internal storages."
exit 1
end
puts <<~TEXT
This task will remove upload records and files permanently.
Would you like to take a full backup before the clean up? (Y/N)
TEXT
if STDIN.gets.chomp.downcase == "y"
puts "Starting backup..."
backuper = BackupRestore::Backuper.new(Discourse.system_user.id)
backuper.run
exit 1 unless backuper.success
end
public_directory = Rails.root.join("public").to_s
##
## DATABASE vs FILE SYSTEM
##
# uploads & avatars
Upload.find_each do |upload|
path = File.join(public_directory, upload.url)
if !File.exist?(path)
upload.destroy!
putc "#"
else
putc "."
end
end
# optimized images
OptimizedImage.find_each do |optimized_image|
path = File.join(public_directory, optimized_image.url)
if !File.exist?(path)
optimized_image.destroy!
putc "#"
else
putc "."
end
end
##
## FILE SYSTEM vs DATABASE
##
uploads_directory = File.join(public_directory, "uploads", db).to_s
# avatars (no avatar should be stored in that old directory)
FileUtils.rm_rf("#{uploads_directory}/avatars")
# uploads and optimized images
Dir
.glob("#{uploads_directory}/**/*.*")
.each do |file_path|
sha1 = Upload.generate_digest(file_path)
url = file_path.split(public_directory, 2)[1]
if (Upload.where(sha1: sha1).empty? && Upload.where(url: url).empty?) &&
(OptimizedImage.where(sha1: sha1).empty? && OptimizedImage.where(url: url).empty?)
FileUtils.rm(file_path)
putc "#"
else
putc "."
end
end
puts "Removing empty directories..."
puts `find #{uploads_directory} -type d -empty -exec rmdir {} \\;`
puts "Done!"
end
################################################################################
# missing files #
################################################################################
# list all missing uploads and optimized images
task "uploads:missing_files" => :environment do
if ENV["RAILS_DB"]
list_missing_uploads(skip_optimized: ENV["SKIP_OPTIMIZED"])
else
RailsMultisite::ConnectionManagement.each_connection do |db|
if ENV["SKIP_EXTERNAL"] == "1" && Discourse.store.external?
puts "#{RailsMultisite::ConnectionManagement.current_db} has uploads stored externally skipping!"
else
if Discourse.store.external?
puts "-" * 80
puts "WARNING! WARNING! WARNING!"
puts "-" * 80
puts
puts <<~TEXT
#{RailsMultisite::ConnectionManagement.current_db} has uploads on S3!
validating without inventory is likely to take an enormous amount of time.
We recommend you run SKIP_EXTERNAL=1 rake uploads:missing to skip validating if on a multisite.
TEXT
end
list_missing_uploads(skip_optimized: ENV["SKIP_OPTIMIZED"])
end
end
end
end
def list_missing_uploads(skip_optimized: false)
Discourse.store.list_missing_uploads(skip_optimized: skip_optimized)
end
task "uploads:missing" => :environment do
Rake::Task["uploads:missing_files"].invoke
end
################################################################################
# regenerate_missing_optimized #
################################################################################
# regenerate missing optimized images
task "uploads:regenerate_missing_optimized" => :environment do
if ENV["RAILS_DB"]
regenerate_missing_optimized
else
RailsMultisite::ConnectionManagement.each_connection { regenerate_missing_optimized }
end
end
def regenerate_missing_optimized
db = RailsMultisite::ConnectionManagement.current_db
puts "Regenerating missing optimized images for '#{db}'..."
if Discourse.store.external?
puts "This task only works for internal storages."
return
end
public_directory = "#{Rails.root}/public"
missing_uploads = Set.new
avatar_upload_ids = UserAvatar.all.pluck(:custom_upload_id, :gravatar_upload_id).flatten.compact
default_scope = OptimizedImage.includes(:upload)
[
default_scope.where("optimized_images.upload_id IN (?)", avatar_upload_ids),
default_scope
.where("optimized_images.upload_id NOT IN (?)", avatar_upload_ids)
.where("LENGTH(COALESCE(url, '')) > 0")
.where("width > 0 AND height > 0"),
].each do |scope|
scope.find_each do |optimized_image|
upload = optimized_image.upload
next unless optimized_image.url =~ %r{\A/[^/]}
next unless upload.url =~ %r{\A/[^/]}
thumbnail = "#{public_directory}#{optimized_image.url}"
original = "#{public_directory}#{upload.url}"
if !File.exist?(thumbnail) || File.size(thumbnail) <= 0
# make sure the original image exists locally
if (!File.exist?(original) || File.size(original) <= 0) && upload.origin.present?
# try to fix it by redownloading it
begin
downloaded =
begin
FileHelper.download(
upload.origin,
max_file_size: SiteSetting.max_image_size_kb.kilobytes,
tmp_file_name: "discourse-missing",
follow_redirect: true,
)
rescue StandardError
nil
end
if downloaded && downloaded.size > 0
FileUtils.mkdir_p(File.dirname(original))
File.open(original, "wb") { |f| f.write(downloaded.read) }
end
ensure
downloaded.try(:close!) if downloaded.respond_to?(:close!)
end
end
if File.exist?(original) && File.size(original) > 0
FileUtils.mkdir_p(File.dirname(thumbnail))
OptimizedImage.resize(original, thumbnail, optimized_image.width, optimized_image.height)
putc "#"
else
missing_uploads << original
putc "X"
end
else
putc "."
end
end
end
puts "", "Done"
if missing_uploads.size > 0
puts "Missing uploads:"
missing_uploads.sort.each { |u| puts u }
end
end
################################################################################
# migrate_to_new_scheme #
################################################################################
task "uploads:start_migration" => :environment do
SiteSetting.migrate_to_new_scheme = true
puts "Migration started!"
end
task "uploads:stop_migration" => :environment do
SiteSetting.migrate_to_new_scheme = false
puts "Migration stoped!"
end
task "uploads:analyze", %i[cache_path limit] => :environment do |_, args|
now = Time.zone.now
current_db = RailsMultisite::ConnectionManagement.current_db
puts "Analyzing uploads for '#{current_db}'... This may take awhile...\n"
cache_path = args[:cache_path]
current_db = RailsMultisite::ConnectionManagement.current_db
uploads_path = Rails.root.join("public", "uploads", current_db)
path =
if cache_path
cache_path
else
path = "/tmp/#{current_db}-#{now.to_i}-paths.txt"
FileUtils.touch("/tmp/#{now.to_i}-paths.txt")
`find #{uploads_path} -type f -printf '%s %h/%f\n' > #{path}`
path
end
extensions = {}
paths_count = 0
File
.readlines(path)
.each do |line|
size, file_path = line.split(" ", 2)
paths_count += 1
extension = File.extname(file_path).chomp.downcase
extensions[extension] ||= {}
extensions[extension]["count"] ||= 0
extensions[extension]["count"] += 1
extensions[extension]["size"] ||= 0
extensions[extension]["size"] += size.to_i
end
uploads_count = Upload.count
optimized_images_count = OptimizedImage.count
puts <<~TEXT
Report for '#{current_db}'
-----------#{"-" * current_db.length}
Number of `Upload` records in DB: #{uploads_count}
Number of `OptimizedImage` records in DB: #{optimized_images_count}
**Total DB records: #{uploads_count + optimized_images_count}**
Number of images in uploads folder: #{paths_count}
------------------------------------#{"-" * paths_count.to_s.length}
TEXT
helper = Class.new { include ActionView::Helpers::NumberHelper }
helper = helper.new
printf "%-15s | %-15s | %-15s\n", "extname", "total size", "count"
puts "-" * 45
extensions
.sort_by { |_, value| value["size"] }
.reverse
.each do |extname, value|
printf "%-15s | %-15s | %-15s\n",
extname,
helper.number_to_human_size(value["size"]),
value["count"]
end
puts "\n"
limit = args[:limit] || 10
sql = <<~SQL
SELECT
users.username,
COUNT(uploads.user_id) AS num_of_uploads,
SUM(uploads.filesize) AS total_size_of_uploads,
COUNT(optimized_images.id) AS num_of_optimized_images
FROM users
INNER JOIN uploads ON users.id = uploads.user_id
INNER JOIN optimized_images ON uploads.id = optimized_images.upload_id
GROUP BY users.id
ORDER BY total_size_of_uploads DESC
LIMIT #{limit}
SQL
puts "Users using the most disk space"
puts "-------------------------------\n"
printf "%-25s | %-25s | %-25s | %-25s\n",
"username",
"total size of uploads",
"number of uploads",
"number of optimized images"
puts "-" * 110
DB
.query_single(sql)
.each do |username, num_of_uploads, total_size_of_uploads, num_of_optimized_images|
printf "%-25s | %-25s | %-25s | %-25s\n",
username,
helper.number_to_human_size(total_size_of_uploads),
num_of_uploads,
num_of_optimized_images
end
puts "\n"
puts "List of file paths @ #{path}"
puts "Duration: #{Time.zone.now - now} seconds"
end
task "uploads:fix_incorrect_extensions" => :environment do
UploadFixer.fix_all_extensions
end
task "uploads:recover_from_tombstone" => :environment do
Rake::Task["uploads:recover"].invoke
end
task "uploads:recover" => :environment do
dry_run = ENV["DRY_RUN"].present?
stop_on_error = ENV["STOP_ON_ERROR"].present?
if ENV["RAILS_DB"]
UploadRecovery.new(dry_run: dry_run, stop_on_error: stop_on_error).recover
else
RailsMultisite::ConnectionManagement.each_connection do |db|
UploadRecovery.new(dry_run: dry_run, stop_on_error: stop_on_error).recover
end
end
end
task "uploads:sync_s3_acls" => :environment do
RailsMultisite::ConnectionManagement.each_connection do |db|
unless Discourse.store.external?
puts "This task only works for external storage."
exit 1
end
puts "CAUTION: This task may take a long time to complete! There are #{Upload.count} uploads to sync ACLs for."
puts ""
puts "-" * 30
puts "Uploads marked as secure will get a private ACL, and uploads marked as not secure will get a public ACL."
puts "Upload ACLs will be updated in Sidekiq jobs in batches of 100 at a time, check Sidekiq queues for SyncAclsForUploads for progress."
Upload.select(:id).find_in_batches(batch_size: 100) { |uploads| adjust_acls(uploads.map(&:id)) }
puts "", "Upload ACL sync complete!"
end
end
#
# TODO (martin) Update this rake task to use the _first_ UploadReference
# record for each upload to determine security, and do not mark things
# as secure if the first record is something public e.g. a site setting.
task "uploads:disable_secure_uploads" => :environment do
RailsMultisite::ConnectionManagement.each_connection do |db|
unless Discourse.store.external?
puts "This task only works for external storage."
exit 1
end
puts "Disabling secure upload and resetting uploads to not secure in #{db}...", ""
SiteSetting.secure_uploads = false
secure_uploads =
Upload
.joins(:upload_references)
.where(upload_references: { target_type: "Post" })
.where(secure: true)
secure_upload_count = secure_uploads.count
secure_upload_ids = secure_uploads.pluck(:id)
puts "", "Marking #{secure_upload_count} uploads as not secure.", ""
secure_uploads.update_all(
secure: false,
security_last_changed_at: Time.zone.now,
security_last_changed_reason: "marked as not secure by disable_secure_uploads task",
)
post_ids_to_rebake =
DB.query_single(
"SELECT DISTINCT target_id FROM upload_references WHERE upload_id IN (?) AND target_type = 'Post'",
secure_upload_ids,
)
adjust_acls(secure_upload_ids)
post_rebake_errors = rebake_upload_posts(post_ids_to_rebake)
log_rebake_errors(post_rebake_errors)
puts "", "Rebaking and uploading complete!", ""
end
puts "", "Secure uploads are now disabled!", ""
end
##
# Run this task whenever the secure_uploads or login_required
# settings are changed for a Discourse instance to update
# the upload secure flag and S3 upload ACLs. Any uploads that
# have their secure status changed will have all associated posts
# rebaked.
#
# TODO (martin) Update this rake task to use the _first_ UploadReference
# record for each upload to determine security, and do not mark things
# as secure if the first record is something public e.g. a site setting.
task "uploads:secure_upload_analyse_and_update" => :environment do
RailsMultisite::ConnectionManagement.each_connection do |db|
unless Discourse.store.external?
puts "This task only works for external storage."
exit 1
end
puts "Analyzing security for uploads in #{db}...", ""
all_upload_ids_changed, post_ids_to_rebake = nil
Upload.transaction do
# If secure upload is enabled we need to first set the access control post of
# all post uploads (even uploads that are linked to multiple posts). If the
# upload is not set to secure upload then this has no other effect on the upload,
# but we _must_ know what the access control post is because the with_secure_uploads?
# method is on the post, and this knows about the category security & PM status
update_uploads_access_control_post if SiteSetting.secure_uploads?
puts "", "Analysing which uploads need to be marked secure and be rebaked.", ""
if SiteSetting.login_required?
# Simply mark all uploads linked to posts secure if login_required because no anons will be able to access them.
post_ids_to_rebake, all_upload_ids_changed = mark_all_as_secure_login_required
else
# Otherwise only mark uploads linked to posts in secure categories or PMs as secure.
post_ids_to_rebake, all_upload_ids_changed =
update_specific_upload_security_no_login_required
end
end
# Enqueue rebakes AFTER upload transaction complete, so there is no race condition
# between updating the DB and the rebakes occurring.
post_rebake_errors = rebake_upload_posts(post_ids_to_rebake)
log_rebake_errors(post_rebake_errors)
# Also do this AFTER upload transaction complete so we don't end up with any
# errors leaving ACLs in a bad state (the ACL sync task can be run to fix any
# outliers at any time).
adjust_acls(all_upload_ids_changed)
end
puts "", "", "Done!"
end
def adjust_acls(upload_ids_to_adjust_acl_for)
jobs_to_create = (upload_ids_to_adjust_acl_for.count.to_f / 100.00).ceil
if jobs_to_create > 1
puts "Adjusting ACLs for #{upload_ids_to_adjust_acl_for} uploads. These will be batched across #{jobs_to_create} sync job(s)."
end
upload_ids_to_adjust_acl_for.each_slice(100) do |upload_ids|
Jobs.enqueue(:sync_acls_for_uploads, upload_ids: upload_ids)
end
puts "ACL batching complete. Keep an eye on the Sidekiq queue for progress." if jobs_to_create > 1
end
def mark_all_as_secure_login_required
post_upload_ids_marked_secure = DB.query_single(<<~SQL)
WITH upl AS (
SELECT DISTINCT ON (upload_id) upload_id
FROM upload_references
INNER JOIN posts ON posts.id = upload_references.target_id AND upload_references.target_type = 'Post'
INNER JOIN topics ON topics.id = posts.topic_id
)
UPDATE uploads
SET secure = true,
security_last_changed_reason = 'upload security rake task mark as secure',
security_last_changed_at = NOW()
FROM upl
WHERE uploads.id = upl.upload_id AND NOT uploads.secure
RETURNING uploads.id
SQL
puts "Marked #{post_upload_ids_marked_secure.count} upload(s) as secure because login_required is true.",
""
upload_ids_marked_not_secure = DB.query_single(<<~SQL, post_upload_ids_marked_secure)
UPDATE uploads
SET secure = false,
security_last_changed_reason = 'upload security rake task mark as not secure',
security_last_changed_at = NOW()
WHERE id NOT IN (?) AND uploads.secure
RETURNING uploads.id
SQL
puts "Marked #{upload_ids_marked_not_secure.count} upload(s) as not secure because they are not linked to posts.",
""
puts "Finished marking upload(s) as secure."
post_ids_to_rebake =
DB.query_single(
"SELECT DISTINCT target_id FROM upload_references WHERE upload_id IN (?) AND target_type = 'Post'",
post_upload_ids_marked_secure,
)
[post_ids_to_rebake, (post_upload_ids_marked_secure + upload_ids_marked_not_secure).uniq]
end
def log_rebake_errors(rebake_errors)
return if rebake_errors.empty?
puts "The following post rebakes failed with error:", ""
rebake_errors.each { |message| puts message }
end
def update_specific_upload_security_no_login_required
# A simplification of the rules found in UploadSecurity which is a lot faster than
# having to loop through records and use that class to check security.
post_upload_ids_marked_secure = DB.query_single(<<~SQL)
WITH upl AS (
SELECT DISTINCT ON (upload_id) upload_id
FROM upload_references
INNER JOIN posts ON posts.id = upload_references.target_id AND upload_references.target_type = 'Post'
INNER JOIN topics ON topics.id = posts.topic_id
LEFT JOIN categories ON categories.id = topics.category_id
WHERE (topics.category_id IS NOT NULL AND categories.read_restricted) OR
(topics.archetype = 'private_message')
)
UPDATE uploads
SET secure = true,
security_last_changed_reason = 'upload security rake task mark as secure',
security_last_changed_at = NOW()
FROM upl
WHERE uploads.id = upl.upload_id AND NOT uploads.secure
RETURNING uploads.id
SQL
puts "Marked #{post_upload_ids_marked_secure.length} uploads as secure."
# Anything in a public category or a regular topic should not be secure.
post_upload_ids_marked_not_secure = DB.query_single(<<~SQL)
WITH upl AS (
SELECT DISTINCT ON (upload_id) upload_id
FROM upload_references
INNER JOIN posts ON posts.id = upload_references.target_id AND upload_references.target_type = 'Post'
INNER JOIN topics ON topics.id = posts.topic_id
LEFT JOIN categories ON categories.id = topics.category_id
WHERE (topics.archetype = 'regular' AND topics.category_id IS NOT NULL AND NOT categories.read_restricted) OR
(topics.archetype = 'regular' AND topics.category_id IS NULL)
)
UPDATE uploads
SET secure = false,
security_last_changed_reason = 'upload security rake task mark as not secure',
security_last_changed_at = NOW()
FROM upl
WHERE uploads.id = upl.upload_id AND uploads.secure
RETURNING uploads.id
SQL
puts "Marked #{post_upload_ids_marked_not_secure.length} uploads as not secure."
# Everything else should not be secure!
upload_ids_changed = (post_upload_ids_marked_secure + post_upload_ids_marked_not_secure).uniq
upload_ids_marked_not_secure = DB.query_single(<<~SQL, upload_ids_changed)
UPDATE uploads
SET secure = false,
security_last_changed_reason = 'upload security rake task mark as not secure',
security_last_changed_at = NOW()
WHERE id NOT IN (?) AND uploads.secure
RETURNING uploads.id
SQL
puts "Finished updating upload security. Marked #{upload_ids_marked_not_secure.length} uploads not linked to posts as not secure."
all_upload_ids_changed = (upload_ids_changed + upload_ids_marked_not_secure).uniq
post_ids_to_rebake =
DB.query_single(
"SELECT DISTINCT target_id FROM upload_references WHERE upload_id IN (?) AND target_type = 'Post'",
upload_ids_changed,
)
[post_ids_to_rebake, all_upload_ids_changed]
end
def update_uploads_access_control_post
DB.exec(<<~SQL)
WITH upl AS (
SELECT DISTINCT ON (upload_id) upload_id, target_id AS post_id
FROM upload_references
WHERE target_type = 'Post'
ORDER BY upload_id, target_id
)
UPDATE uploads
SET access_control_post_id = upl.post_id
FROM upl
WHERE uploads.id = upl.upload_id
SQL
end
def rebake_upload_posts(post_ids_to_rebake)
posts_to_rebake = Post.where(id: post_ids_to_rebake)
post_rebake_errors = []
puts "", "Rebaking #{posts_to_rebake.length} posts with affected uploads.", ""
begin
i = 0
posts_to_rebake.each do |post|
RakeHelpers.print_status_with_label("Rebaking posts.....", i, posts_to_rebake.length)
post.rebake!
i += 1
end
RakeHelpers.print_status_with_label("Rebaking complete! ", i, posts_to_rebake.length)
puts ""
rescue => e
post_rebake_errors << e.message
end
post_rebake_errors
end
def inline_uploads(post)
replaced = false
original_raw = post.raw
post.raw =
post
.raw
.gsub(%r{(\((/uploads\S+).*\))}) do
upload = Upload.find_by(url: $2)
if !upload
data = Upload.extract_url($2)
if data && sha1 = data[2]
upload = Upload.find_by(sha1: sha1)
if !upload
sha_map = JSON.parse(post.custom_fields["UPLOAD_SHA1_MAP"] || "{}")
if mapped_sha = sha_map[sha1]
upload = Upload.find_by(sha1: mapped_sha)
end
end
end
end
result = $1
if upload&.id
result.sub!($2, upload.short_url)
replaced = true
else
puts "Upload not found #{$2} in Post #{post.id} - #{post.url}"
end
result
end
if replaced
puts "Corrected image urls in #{post.full_url} raw backup stored in custom field"
post.custom_fields["BACKUP_POST_RAW"] = original_raw
post.save_custom_fields
post.save!(validate: false)
post.rebake!
end
end
def inline_img_tags(post)
replaced = false
original_raw = post.raw
post.raw =
post
.raw
.gsub(%r{(<img\s+src=["'](/uploads/[^'"]*)["'].*>)}i) do
next if $2.include?("..")
upload = Upload.find_by(url: $2)
if !upload
data = Upload.extract_url($2)
if data && sha1 = data[2]
upload = Upload.find_by(sha1: sha1)
end
end
if !upload
local_file = File.join(Rails.root, "public", $2)
if File.exist?(local_file)
File.open(local_file) do |f|
upload = UploadCreator.new(f, "image").create_for(post.user_id)
end
end
end
if upload
replaced = true
"![image](#{upload.short_url})"
else
puts "skipping missing upload in #{post.full_url} #{$1}"
$1
end
end
if replaced
puts "Corrected image urls in #{post.full_url} raw backup stored in custom field"
post.custom_fields["BACKUP_POST_RAW"] = original_raw
post.save_custom_fields
post.save!(validate: false)
post.rebake!
end
end
def fix_relative_links
Post.where("raw like ?", "%](/uploads%").find_each { |post| inline_uploads(post) }
Post.where("raw ilike ?", "%<img%src=%/uploads/%>%").find_each { |post| inline_img_tags(post) }
end
task "uploads:fix_relative_upload_links" => :environment do
if RailsMultisite::ConnectionManagement.current_db != "default"
fix_relative_links
else
RailsMultisite::ConnectionManagement.each_connection { fix_relative_links }
end
end
def analyze_missing_s3
puts "List of posts with missing images:"
sql = <<~SQL
SELECT ur.target_id, u.url, u.sha1, u.extension, u.id
FROM upload_references ur
RIGHT JOIN uploads u ON u.id = ur.upload_id
WHERE ur.target_type = 'Post' AND u.verification_status = :invalid_etag
ORDER BY ur.created_at
SQL
lookup = {}
other = []
all = []
DB
.query(sql, invalid_etag: Upload.verification_statuses[:invalid_etag])
.each do |r|
all << r
if r.target_id
lookup[r.target_id] ||= []
lookup[r.target_id] << [r.url, r.sha1, r.extension]
else
other << r
end
end
posts = Post.where(id: lookup.keys)
posts
.order(:created_at)
.each do |post|
puts "#{Discourse.base_url}/p/#{post.id} #{lookup[post.id].length} missing, #{post.created_at}"
lookup[post.id].each do |url, sha1, extension|
puts url
puts "#{Upload.base62_sha1(sha1)}.#{extension}"
end
puts
end
missing_uploads = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag])
puts "Total missing uploads: #{missing_uploads.count}, newest is #{missing_uploads.maximum(:created_at)}"
puts "Total problem posts: #{lookup.keys.count} with #{lookup.values.sum { |a| a.length }} missing uploads"
puts "Other missing uploads count: #{other.count}"
if all.count > 0
ids = all.map { |r| r.id }
lookups = [
%i[upload_references upload_id],
%i[users uploaded_avatar_id],
%i[user_avatars gravatar_upload_id],
%i[user_avatars custom_upload_id],
[
:site_settings,
[
"NULLIF(value, '')::integer",
"data_type = #{SiteSettings::TypeSupervisor.types[:upload].to_i}",
],
],
%i[user_profiles profile_background_upload_id],
%i[user_profiles card_background_upload_id],
%i[categories uploaded_logo_id],
%i[categories uploaded_logo_dark_id],
%i[categories uploaded_background_id],
%i[custom_emojis upload_id],
%i[theme_fields upload_id],
%i[user_exports upload_id],
%i[groups flair_upload_id],
]
lookups.each do |table, (column, where)|
count = DB.query_single(<<~SQL, ids: ids).first
SELECT COUNT(*) FROM #{table} WHERE #{column} IN (:ids) #{"AND #{where}" if where}
SQL
puts "Found #{count} missing row#{"s" if count > 1} in #{table}(#{column})" if count > 0
end
end
end
def delete_missing_s3
missing =
Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).order(
:created_at,
)
count = missing.count
if count > 0
puts "The following uploads will be deleted from the database"
missing.each { |upload| puts "#{upload.id} - #{upload.url} - #{upload.created_at}" }
puts "Please confirm you wish to delete #{count} upload records by typing YES"
confirm = STDIN.gets.strip
if confirm == "YES"
missing.destroy_all
puts "#{count} records were deleted"
else
STDERR.puts "Aborting"
exit 1
end
end
end
task "uploads:delete_missing_s3" => :environment do
if RailsMultisite::ConnectionManagement.current_db != "default"
delete_missing_s3
else
RailsMultisite::ConnectionManagement.each_connection { delete_missing_s3 }
end
end
task "uploads:analyze_missing_s3" => :environment do
if RailsMultisite::ConnectionManagement.current_db != "default"
analyze_missing_s3
else
RailsMultisite::ConnectionManagement.each_connection { analyze_missing_s3 }
end
end
def fix_missing_s3
Jobs.run_immediately!
puts "Attempting to download missing uploads and recreate"
ids = Upload.where(verification_status: Upload.verification_statuses[:invalid_etag]).pluck(:id)
ids.each do |id|
upload = Upload.find_by(id: id)
next if !upload
tempfile = nil
downloaded_from = nil
begin
tempfile =
FileHelper.download(
upload.url,
max_file_size: 30.megabyte,
tmp_file_name: "#{SecureRandom.hex}.#{upload.extension}",
)
downloaded_from = upload.url
rescue => e
if upload.origin.present?
begin
tempfile =
FileHelper.download(
upload.origin,
max_file_size: 30.megabyte,
tmp_file_name: "#{SecureRandom.hex}.#{upload.extension}",
)
downloaded_from = upload.origin
rescue => e
puts "Failed to download #{upload.origin} #{e}"
end
else
puts "Failed to download #{upload.url} #{e}"
end
end
if tempfile
puts "Successfully downloaded upload id: #{upload.id} - #{downloaded_from} fixing upload"
fixed_upload = nil
fix_error = nil
Upload.transaction do
begin
upload.update_column(:sha1, SecureRandom.hex)
fixed_upload =
UploadCreator.new(
tempfile,
"temp.#{upload.extension}",
skip_validations: true,
).create_for(Discourse.system_user.id)
rescue => fix_error
# invalid extension is the most common issue
end
raise ActiveRecord::Rollback
end
if fix_error
puts "Failed to fix upload #{fix_error}"
else
# we do not fix sha, it may be wrong for arbitrary reasons, if we correct it
# we may end up breaking posts
save_error = nil
begin
upload.assign_attributes(
etag: fixed_upload.etag,
url: fixed_upload.url,
verification_status: Upload.verification_statuses[:unchecked],
)
upload.save!(validate: false)
rescue => save_error
# url might be null
end
if save_error
puts "Failed to save upload #{save_error}"
else
OptimizedImage.where(upload_id: upload.id).destroy_all
rebake_ids =
UploadReference.where(upload_id: upload.id).where(target_type: "Post").pluck(:target_id)
if rebake_ids.present?
Post
.where(id: rebake_ids)
.each do |post|
puts "rebake post #{post.id}"
post.rebake!
end
end
end
end
end
end
puts "Attempting to automatically fix problem uploads"
puts
puts "Rebaking posts with missing uploads, this can take a while as all rebaking runs inline"
sql = <<~SQL
SELECT ur.target_id
FROM upload_references ur
JOIN uploads u ON u.id = ur.upload_id
WHERE ur.target_type = 'Post' AND u.verification_status = :invalid_etag
ORDER BY ur.target_id DESC
SQL
DB
.query_single(sql, invalid_etag: Upload.verification_statuses[:invalid_etag])
.each do |post_id|
post = Post.find_by(id: post_id)
if post
post.rebake!
print "."
else
puts "Skipping #{post_id} since it is deleted"
end
end
puts
end
task "uploads:fix_missing_s3" => :environment do
if RailsMultisite::ConnectionManagement.current_db != "default"
fix_missing_s3
else
RailsMultisite::ConnectionManagement.each_connection { fix_missing_s3 }
end
end
# Supported ENV arguments:
#
# VERBOSE=1
# Shows debug information.
#
# INTERACTIVE=1
# Shows debug information and pauses for input on issues.
#
# WORKER_ID/WORKER_COUNT
# When running the script on a single forum in multiple terminals.
# For example, if you want 4 concurrent scripts use WORKER_COUNT=4
# and WORKER_ID from 0 to 3.
#
# START_ID
# Skip uploads with id lower than START_ID.
task "uploads:downsize" => :environment do
min_image_pixels = 500_000 # 0.5 megapixels
default_image_pixels = 1_000_000 # 1 megapixel
max_image_pixels = [ARGV[0]&.to_i || default_image_pixels, min_image_pixels].max
ENV["VERBOSE"] = "1" if ENV["INTERACTIVE"]
def log(*args)
puts(*args) if ENV["VERBOSE"]
end
puts "", "Downsizing images to no more than #{max_image_pixels} pixels"
dimensions_count = 0
downsized_count = 0
scope =
Upload.by_users.with_no_non_post_relations.where(
"LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')",
)
scope = scope.where(<<-SQL, max_image_pixels)
COALESCE(width, 0) = 0 OR
COALESCE(height, 0) = 0 OR
COALESCE(thumbnail_width, 0) = 0 OR
COALESCE(thumbnail_height, 0) = 0 OR
width * height > ?
SQL
if ENV["WORKER_ID"] && ENV["WORKER_COUNT"]
scope = scope.where("uploads.id % ? = ?", ENV["WORKER_COUNT"], ENV["WORKER_ID"])
end
scope = scope.where("uploads.id >= ?", ENV["START_ID"]) if ENV["START_ID"]
skipped = 0
total_count = scope.count
puts "Uploads to process: #{total_count}"
scope.find_each.with_index do |upload, index|
progress = (index * 100.0 / total_count).round(1)
log "\n"
print "\r#{progress}% Fixed dimensions: #{dimensions_count} Downsized: #{downsized_count} Skipped: #{skipped} (upload id: #{upload.id})"
log "\n"
path =
if upload.local?
Discourse.store.path_for(upload)
else
(
begin
Discourse.store.download(upload, max_file_size_kb: 100.megabytes)
rescue StandardError
nil
end
)&.path
end
unless path
log "No image path"
skipped += 1
next
end
begin
w, h = FastImage.size(path, raise_on_failure: true)
rescue FastImage::UnknownImageType
log "Unknown image type"
skipped += 1
next
rescue FastImage::SizeNotFound
log "Size not found"
skipped += 1
next
end
if !w || !h
log "Invalid image dimensions"
skipped += 1
next
end
ww, hh = ImageSizer.resize(w, h)
if w == 0 || h == 0 || ww == 0 || hh == 0
log "Invalid image dimensions"
skipped += 1
next
end
upload.attributes = {
width: w,
height: h,
thumbnail_width: ww,
thumbnail_height: hh,
filesize: File.size(path),
}
if upload.changed?
log "Correcting the upload dimensions"
log "Before: #{upload.width_was}x#{upload.height_was} #{upload.thumbnail_width_was}x#{upload.thumbnail_height_was} (#{upload.filesize_was})"
log "After: #{w}x#{h} #{ww}x#{hh} (#{upload.filesize})"
dimensions_count += 1
# Don't validate the size - max image size setting might have
# changed since the file was uploaded, so this could fail
upload.validate_file_size = false
upload.save!
end
if w * h < max_image_pixels
log "Image size within allowed range"
skipped += 1
next
end
result =
ShrinkUploadedImage.new(
upload: upload,
path: path,
max_pixels: max_image_pixels,
verbose: ENV["VERBOSE"],
interactive: ENV["INTERACTIVE"],
).perform
if result
downsized_count += 1
else
skipped += 1
end
end
STDIN.beep
puts "", "Done", Time.zone.now
end