FIX: Improve image downsizing script (#9549)

Correctly handles more upload formats in posts, updates post custom fields, fixes more edge cases, adds debugging capabilities. (VERBOSE=1 and INTERACTIVE=1 flags)

Includes these commits and some more:

* DEV: Show the fixed image dimensions
* FIX: Support more upload url formats
* DEV: Remove the old upload after updating posts
* FIX: Use the `process_post_#{id}` mutex
* FIX: Avoid rebaking twice
* DEV: Print out the link to the post
* DEV: Process posts chronologically
* DEV: Do a dry-run before saving, pause on any issue
* FIX: Also process deleted posts
* DEV: Make matchers case-insensitive
* DEV: Pause on "detached" uploads, add more debug info
* DEV: Print out time when finished
* DEV: Add support for WORKER_ID/WORKER_COUNT
* DEV: Fix the onebox in cooked text heuristic
* DEV: Don't report already processed posts
* DEV: Beep when done!
* DEV: Ignore issues with deleted posts
* DEV: Ignore issues with deleted topics
* DEV: Multiline SQL
* DEV: Use the bulk attribute assignment
* DEV: Add ENV["INTERACTIVE"] mode
* DEV: Handle post custom fields
* DEV: Bail on non-S3 sites
* DEV: Allow sizes smaller than 1 mpix
This commit is contained in:
Jarek Radosz 2020-05-26 15:38:23 +02:00 committed by GitHub
parent b61a291cf3
commit eb462bfb3d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -2,80 +2,181 @@
require File.expand_path("../../config/environment", __FILE__)
# no less than 1 megapixel
max_image_pixels = [ARGV[0].to_i, 1_000_000].max
# Supported ENV arguments:
#
# VERBOSE=1
# Shows debug information.
#
# INTERACTIVE=1
# Shows debug information and pauses for input on issues.
#
# WORKER_ID/WORKER_COUNT
# When running the script on a single forum in multiple terminals.
# For example, if you want 4 concurrent scripts use WORKER_COUNT=4
# and WORKER_ID from 0 to 3
puts "", "Downsizing images to no more than #{max_image_pixels} pixels"
MIN_IMAGE_PIXELS = 500_000 # 0.5 megapixels
DEFAULT_IMAGE_PIXELS = 1_000_000 # 1 megapixel
dimensions_count = 0
downsized_count = 0
MAX_IMAGE_PIXELS = [
ARGV[0]&.to_i || DEFAULT_IMAGE_PIXELS,
MIN_IMAGE_PIXELS
].max
def downsize_upload(upload, path, max_image_pixels)
ENV["VERBOSE"] = "1" if ENV["INTERACTIVE"]
def log(*args)
puts(*args) if ENV["VERBOSE"]
end
def transform_post(post, upload_before, upload_after)
post.raw.gsub!(/upload:\/\/#{upload_before.base62_sha1}(\.#{upload_before.extension})?/i, upload_after.short_url)
post.raw.gsub!(Discourse.store.cdn_url(upload_before.url), Discourse.store.cdn_url(upload_after.url))
post.raw.gsub!(Discourse.store.url_for(upload_before), Discourse.store.url_for(upload_after))
post.raw.gsub!("#{Discourse.base_url}#{upload_before.short_path}", "#{Discourse.base_url}#{upload_after.short_path}")
path = SiteSetting.Upload.s3_upload_bucket.split("/", 2)[1]
post.raw.gsub!(/<img src=\"https:\/\/.+?\/#{path}\/uploads\/default\/optimized\/.+?\/#{upload_before.sha1}_\d_(?<width>\d+)x(?<height>\d+).*?\" alt=\"(?<alt>.*?)\"\/?>/i) do
"![#{$~[:alt]}|#{$~[:width]}x#{$~[:height]}](#{upload_after.short_url})"
end
post.raw.gsub!(/!\[(.*?)\]\(\/uploads\/.+?\/#{upload_before.sha1}(\.#{upload_before.extension})?\)/i, "![\\1](#{upload_after.short_url})")
end
def downsize_upload(upload, path)
# Make sure the filesize is up to date
upload.filesize = File.size(path)
OptimizedImage.downsize(path, path, "#{max_image_pixels}@", filename: upload.original_filename)
OptimizedImage.downsize(path, path, "#{MAX_IMAGE_PIXELS}@", filename: upload.original_filename)
sha1 = Upload.generate_digest(path)
if sha1 == upload.sha1
puts "no sha1 change" if ENV["VERBOSE"]
log "No sha1 change"
return
end
w, h = FastImage.size(path, timeout: 10, raise_on_failure: true)
w, h = FastImage.size(path, timeout: 15, raise_on_failure: true)
if !w || !h
puts "invalid image dimensions after resizing" if ENV["VERBOSE"]
log "Invalid image dimensions after resizing"
return
end
# Neither #dup or #clone provide a complete copy
original_upload = Upload.find(upload.id)
ww, hh = ImageSizer.resize(w, h)
new_file = true
if existing_upload = Upload.find_by(sha1: sha1)
upload = existing_upload
new_file = false
end
# A different upload record that matches the sha1 of the downsized image
existing_upload = Upload.find_by(sha1: sha1)
upload = existing_upload if existing_upload
before = upload.filesize
upload.filesize = File.size(path)
upload.attributes = {
sha1: sha1,
width: w,
height: h,
thumbnail_width: ww,
thumbnail_height: hh,
filesize: File.size(path)
}
if upload.filesize > before
puts "no filesize reduction" if ENV["VERBOSE"]
if upload.filesize > upload.filesize_was
log "No filesize reduction"
return
end
upload.sha1 = sha1
upload.width = w
upload.height = h
upload.thumbnail_width = ww
upload.thumbnail_height = hh
if new_file
unless existing_upload
url = Discourse.store.store_upload(File.new(path), upload)
unless url
puts "couldn't store the upload" if ENV["VERBOSE"]
log "Couldn't store the upload"
return
end
upload.url = url
end
if ENV["VERBOSE"]
puts "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
puts "sha1: #{original_upload.sha1} -> #{sha1}"
puts "is a new file: #{new_file}"
log "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
log "sha: #{original_upload.sha1} -> #{sha1}"
log "(an exisiting upload)" if existing_upload
success = true
posts = Post.unscoped.joins(:post_uploads).where(post_uploads: { upload_id: original_upload.id }).uniq.sort_by(&:created_at)
posts.each do |post|
transform_post(post, original_upload, upload)
if post.custom_fields[Post::DOWNLOADED_IMAGES].present?
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
end
if post.raw_changed?
log "Updating post"
elsif downloaded_images&.has_value?(original_upload.id)
log "A hotlinked, unreferenced image"
elsif post.raw.include?(upload.short_url)
log "Already processed"
elsif post.trashed?
log "A deleted post"
elsif !post.topic || post.topic.trashed?
log "A deleted topic"
elsif post.cooked.include?(original_upload.sha1)
if post.raw.include?("#{Discourse.base_url.sub(/^https?:\/\//i, "")}/t/")
log "Updating a topic onebox"
else
log "Updating an external onebox"
end
else
log "Could not find the upload URL"
success = false
end
log "#{Discourse.base_url}/p/#{post.id}"
end
if posts.empty?
log "Upload not used in any posts"
if User.where(uploaded_avatar_id: original_upload.id).count
log "Used as a User avatar"
elsif UserAvatar.where(gravatar_upload_id: original_upload.id).count
log "Used as a UserAvatar gravatar"
elsif UserAvatar.where(custom_upload_id: original_upload.id).count
log "Used as a UserAvatar custom upload"
elsif UserProfile.where(profile_background_upload_id: original_upload.id).count
log "Used as a UserProfile profile background"
elsif UserProfile.where(card_background_upload_id: original_upload.id).count
log "Used as a UserProfile card background"
elsif Category.where(uploaded_logo_id: original_upload.id).count
log "Used as a Category logo"
elsif Category.where(uploaded_background_id: original_upload.id).count
log "Used as a Category background"
elsif CustomEmoji.where(upload_id: original_upload.id).count
log "Used as a CustomEmoji"
elsif ThemeField.where(upload_id: original_upload.id).count
log "Used as a ThemeField"
else
success = false
end
end
unless success
if ENV["INTERACTIVE"]
print "Press any key to continue with the upload"
STDIN.beep
STDIN.getch
puts " k"
elsif !existing_upload && !Upload.where(url: upload.url).exist?
# We're bailing, so clean up the just uploaded file
Discourse.store.remove_upload(upload)
log "⏩ Skipping"
return
end
end
upload.save!
if new_file
upload.optimized_images.each(&:destroy!)
Discourse.store.remove_upload(original_upload)
else
if existing_upload
begin
PostUpload.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
rescue ActiveRecord::RecordNotUnique, PG::UniqueViolation
@ -90,85 +191,162 @@ def downsize_upload(upload, path, max_image_pixels)
Category.where(uploaded_background_id: original_upload.id).update_all(uploaded_background_id: upload.id)
CustomEmoji.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
ThemeField.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
else
upload.optimized_images.each(&:destroy!)
end
original_upload.posts.each do |post|
post.raw.gsub!(/upload:\/\/#{original_upload.base62_sha1}(\.#{original_upload.extension})?/, upload.short_url)
post.raw.gsub!(Discourse.store.cdn_url(original_upload.url), Discourse.store.cdn_url(upload.url))
posts.each do |post|
DistributedMutex.synchronize("process_post_#{post.id}") do
current_post = Post.unscoped.find(post.id)
if post.raw_changed?
puts "updating post #{post.id}" if ENV["VERBOSE"]
post.save!
else
puts "Could find the upload path in post.raw (post_id: #{post.id})" if ENV["VERBOSE"]
# If the post became outdated, reapply changes
if current_post.updated_at != post.updated_at
transform_post(current_post, original_upload, upload)
post = current_post
end
if post.raw_changed?
post.update_columns(
raw: post.raw,
updated_at: Time.zone.now
)
end
if existing_upload && post.custom_fields[Post::DOWNLOADED_IMAGES].present?
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
downloaded_images.transform_values! do |upload_id|
upload_id == original_upload.id ? upload.id : upload_id
end
post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present?
post.save_custom_fields
end
post.rebake!
end
post.rebake!
end
original_upload.reload.destroy! unless new_file
puts "" if ENV["VERBOSE"]
if existing_upload
original_upload.reload.destroy!
else
Discourse.store.remove_upload(original_upload)
end
true
end
scope = Upload
.where("LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')")
.where("COALESCE(width, 0) = 0 OR COALESCE(height, 0) = 0 OR COALESCE(thumbnail_width, 0) = 0 OR COALESCE(thumbnail_height, 0) = 0 OR width * height > ?", max_image_pixels)
puts "Uploads to process: #{scope.count}"
scope.find_each do |upload|
print "\rFixed dimensions: %8d Downsized: %8d (upload id: #{upload.id})".freeze % [dimensions_count, downsized_count]
puts "\n" if ENV["VERBOSE"]
source = upload.local? ? Discourse.store.path_for(upload) : "https:#{upload.url}"
unless source
puts "no path or URL" if ENV["VERBOSE"]
next
def process_uploads
unless SiteSetting.Upload.enable_s3_uploads
puts "This script supports only S3 uploads"
return
end
w, h = FastImage.size(source, timeout: 10)
puts "", "Downsizing images to no more than #{MAX_IMAGE_PIXELS} pixels"
if !w || !h
puts "invalid image dimensions" if ENV["VERBOSE"]
next
dimensions_count = 0
downsized_count = 0
scope = Upload.where("LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')")
scope = scope.where(<<-SQL, MAX_IMAGE_PIXELS)
COALESCE(width, 0) = 0 OR
COALESCE(height, 0) = 0 OR
COALESCE(thumbnail_width, 0) = 0 OR
COALESCE(thumbnail_height, 0) = 0 OR
width * height > ?
SQL
if ENV["WORKER_ID"] && ENV["WORKER_COUNT"]
scope = scope.where("id % ? = ?", ENV["WORKER_COUNT"], ENV["WORKER_ID"])
end
ww, hh = ImageSizer.resize(w, h)
skipped = 0
total_count = scope.count
puts "Uploads to process: #{total_count}"
if w == 0 || h == 0 || ww == 0 || hh == 0
puts "invalid image dimensions" if ENV["VERBOSE"]
next
end
scope.find_each.with_index do |upload, index|
progress = (index * 100.0 / total_count).round(1)
if upload.read_attribute(:width) != w || upload.read_attribute(:height) != h || upload.read_attribute(:thumbnail_width) != ww || upload.read_attribute(:thumbnail_height) != hh
puts "Correcting the upload dimensions" if ENV["VERBOSE"]
dimensions_count += 1
log "\n"
print "\r#{progress}% Fixed dimensions: #{dimensions_count} Downsized: #{downsized_count} Skipped: #{skipped} (upload id: #{upload.id})"
log "\n"
upload.update!(
source = upload.local? ? Discourse.store.path_for(upload) : "https:#{upload.url}"
unless source
log "No path or URL"
skipped += 1
next
end
begin
w, h = FastImage.size(source, timeout: 15, raise_on_failure: true)
rescue FastImage::ImageFetchFailure
log "Retrying image resizing"
w, h = FastImage.size(source, timeout: 15)
rescue FastImage::UnknownImageType
log "Unknown image type"
skipped += 1
next
rescue FastImage::SizeNotFound
log "Size not found"
skipped += 1
next
end
if !w || !h
log "Invalid image dimensions"
skipped += 1
next
end
ww, hh = ImageSizer.resize(w, h)
if w == 0 || h == 0 || ww == 0 || hh == 0
log "Invalid image dimensions"
skipped += 1
next
end
upload.attributes = {
width: w,
height: h,
thumbnail_width: ww,
thumbnail_height: hh,
)
thumbnail_height: hh
}
if upload.changed?
log "Correcting the upload dimensions"
log "Before: #{upload.width_was}x#{upload.height_was} #{upload.thumbnail_width_was}x#{upload.thumbnail_height_was}"
log "After: #{w}x#{h} #{ww}x#{hh}"
dimensions_count += 1
upload.save!
end
if w * h < MAX_IMAGE_PIXELS
log "Image size within allowed range"
skipped += 1
next
end
path = upload.local? ? source : (Discourse.store.download(upload) rescue nil)&.path
unless path
log "No image path"
skipped += 1
next
end
if downsize_upload(upload, path)
downsized_count += 1
else
skipped += 1
end
end
if w * h < max_image_pixels
puts "image size within allowed range" if ENV["VERBOSE"]
next
end
path = upload.local? ? source : (Discourse.store.download(upload) rescue nil)&.path
unless path
puts "no image path" if ENV["VERBOSE"]
next
end
downsized_count += 1 if downsize_upload(upload, path, max_image_pixels)
STDIN.beep
puts "", "Done", Time.zone.now
end
puts "", "Done"
process_uploads