From eb462bfb3daaa0afbc8e2abb3210f430f3682b06 Mon Sep 17 00:00:00 2001 From: Jarek Radosz Date: Tue, 26 May 2020 15:38:23 +0200 Subject: [PATCH] FIX: Improve image downsizing script (#9549) Correctly handles more upload formats in posts, updates post custom fields, fixes more edge cases, adds debugging capabilities. (VERBOSE=1 and INTERACTIVE=1 flags) Includes these commits and some more: * DEV: Show the fixed image dimensions * FIX: Support more upload url formats * DEV: Remove the old upload after updating posts * FIX: Use the `process_post_#{id}` mutex * FIX: Avoid rebaking twice * DEV: Print out the link to the post * DEV: Process posts chronologically * DEV: Do a dry-run before saving, pause on any issue * FIX: Also process deleted posts * DEV: Make matchers case-insensitive * DEV: Pause on "detached" uploads, add more debug info * DEV: Print out time when finished * DEV: Add support for WORKER_ID/WORKER_COUNT * DEV: Fix the onebox in cooked text heuristic * DEV: Don't report already processed posts * DEV: Beep when done! * DEV: Ignore issues with deleted posts * DEV: Ignore issues with deleted topics * DEV: Multiline SQL * DEV: Use the bulk attribute assignment * DEV: Add ENV["INTERACTIVE"] mode * DEV: Handle post custom fields * DEV: Bail on non-S3 sites * DEV: Allow sizes smaller than 1 mpix --- script/downsize_uploads.rb | 362 +++++++++++++++++++++++++++---------- 1 file changed, 270 insertions(+), 92 deletions(-) diff --git a/script/downsize_uploads.rb b/script/downsize_uploads.rb index 95d65c24946..dc2e7b2a628 100644 --- a/script/downsize_uploads.rb +++ b/script/downsize_uploads.rb @@ -2,80 +2,181 @@ require File.expand_path("../../config/environment", __FILE__) -# no less than 1 megapixel -max_image_pixels = [ARGV[0].to_i, 1_000_000].max +# Supported ENV arguments: +# +# VERBOSE=1 +# Shows debug information. +# +# INTERACTIVE=1 +# Shows debug information and pauses for input on issues. +# +# WORKER_ID/WORKER_COUNT +# When running the script on a single forum in multiple terminals. 
+# For example, if you want 4 concurrent scripts use WORKER_COUNT=4
+# and WORKER_ID from 0 to 3
 
-puts "", "Downsizing images to no more than #{max_image_pixels} pixels"
+MIN_IMAGE_PIXELS = 500_000 # 0.5 megapixels
+DEFAULT_IMAGE_PIXELS = 1_000_000 # 1 megapixel
 
-dimensions_count = 0
-downsized_count = 0
+MAX_IMAGE_PIXELS = [
+ ARGV[0]&.to_i || DEFAULT_IMAGE_PIXELS,
+ MIN_IMAGE_PIXELS
+].max
 
-def downsize_upload(upload, path, max_image_pixels)
+ENV["VERBOSE"] = "1" if ENV["INTERACTIVE"]
+
+def log(*args)
+ puts(*args) if ENV["VERBOSE"]
+end
+
+def transform_post(post, upload_before, upload_after)
+ post.raw.gsub!(/upload:\/\/#{upload_before.base62_sha1}(\.#{upload_before.extension})?/i, upload_after.short_url)
+ post.raw.gsub!(Discourse.store.cdn_url(upload_before.url), Discourse.store.cdn_url(upload_after.url))
+ post.raw.gsub!(Discourse.store.url_for(upload_before), Discourse.store.url_for(upload_after))
+ post.raw.gsub!("#{Discourse.base_url}#{upload_before.short_path}", "#{Discourse.base_url}#{upload_after.short_path}")
+
+ path = SiteSetting.Upload.s3_upload_bucket.split("/", 2)[1]
+ post.raw.gsub!(/<img src=\"https:\/\/.+?\/#{path}\/uploads\/default\/optimized\/.+?\/#{upload_before.sha1}_\d_(?<width>\d+)x(?<height>\d+).*?\" alt=\"(?<alt>.*?)\"\/?>/i) do
+ "![#{$~[:alt]}|#{$~[:width]}x#{$~[:height]}](#{upload_after.short_url})"
+ end
+
+ post.raw.gsub!(/!\[(.*?)\]\(\/uploads\/.+?\/#{upload_before.sha1}(\.#{upload_before.extension})?\)/i, "![\\1](#{upload_after.short_url})")
+end
+
+def downsize_upload(upload, path)
 # Make sure the filesize is up to date
 upload.filesize = File.size(path)
 
- OptimizedImage.downsize(path, path, "#{max_image_pixels}@", filename: upload.original_filename)
+ OptimizedImage.downsize(path, path, "#{MAX_IMAGE_PIXELS}@", filename: upload.original_filename)
 sha1 = Upload.generate_digest(path)
 
 if sha1 == upload.sha1
- puts "no sha1 change" if ENV["VERBOSE"]
+ log "No sha1 change"
 return
 end
 
- w, h = FastImage.size(path, timeout: 10, raise_on_failure: true)
+ w, h = FastImage.size(path, timeout: 15, raise_on_failure: true)
 
 if !w || !h
- puts "invalid image dimensions after resizing" if ENV["VERBOSE"]
+ log "Invalid image dimensions after resizing"
 return
 end
 
 # Neither #dup or #clone provide a complete copy
 original_upload = Upload.find(upload.id)
 ww, hh = ImageSizer.resize(w, h)
 
- new_file = true
-
- if existing_upload = Upload.find_by(sha1: sha1)
- upload = existing_upload
- new_file = false
- end
+ # A different upload record that matches the sha1 of the downsized image
+ existing_upload = Upload.find_by(sha1: sha1)
+ upload = existing_upload if existing_upload
 
- before = upload.filesize
- upload.filesize = File.size(path)
+ upload.attributes = {
+ sha1: sha1,
+ width: w,
+ height: h,
+ thumbnail_width: ww,
+ thumbnail_height: hh,
+ filesize: File.size(path)
+ }
 
- if upload.filesize > before
- puts "no filesize reduction" if ENV["VERBOSE"]
+ if upload.filesize > upload.filesize_was
+ log "No filesize reduction"
 return
 end
 
- upload.sha1 = sha1
- upload.width = w
- upload.height = h
- upload.thumbnail_width = ww
- upload.thumbnail_height = hh
-
- if new_file
+ unless existing_upload
 url = Discourse.store.store_upload(File.new(path), upload)
 
 unless url
- puts "couldn't store the upload" if ENV["VERBOSE"]
+ log "Couldn't store the upload"
 return
 end
 
 upload.url = url
 end
 
- if ENV["VERBOSE"]
- puts "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
- puts "sha1: #{original_upload.sha1} -> #{sha1}"
- puts "is a new file: #{new_file}"
+ log "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
+ log "sha: #{original_upload.sha1} -> #{sha1}"
+ log "(an existing upload)" if existing_upload
+
+ success = true
+ posts = Post.unscoped.joins(:post_uploads).where(post_uploads: { upload_id: original_upload.id }).uniq.sort_by(&:created_at)
+
+ posts.each do |post|
+ transform_post(post, original_upload, upload)
+
+ if post.custom_fields[Post::DOWNLOADED_IMAGES].present?
+ downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
+ end
+
+ if post.raw_changed?
+ log "Updating post"
+ elsif downloaded_images&.has_value?(original_upload.id)
+ log "A hotlinked, unreferenced image"
+ elsif post.raw.include?(upload.short_url)
+ log "Already processed"
+ elsif post.trashed?
+ log "A deleted post"
+ elsif !post.topic || post.topic.trashed?
+ log "A deleted topic"
+ elsif post.cooked.include?(original_upload.sha1)
+ if post.raw.include?("#{Discourse.base_url.sub(/^https?:\/\//i, "")}/t/")
+ log "Updating a topic onebox"
+ else
+ log "Updating an external onebox"
+ end
+ else
+ log "Could not find the upload URL"
+ success = false
+ end
+
+ log "#{Discourse.base_url}/p/#{post.id}"
+ end
+
+ if posts.empty?
+ log "Upload not used in any posts"
+
+ if User.where(uploaded_avatar_id: original_upload.id).count
+ log "Used as a User avatar"
+ elsif UserAvatar.where(gravatar_upload_id: original_upload.id).count
+ log "Used as a UserAvatar gravatar"
+ elsif UserAvatar.where(custom_upload_id: original_upload.id).count
+ log "Used as a UserAvatar custom upload"
+ elsif UserProfile.where(profile_background_upload_id: original_upload.id).count
+ log "Used as a UserProfile profile background"
+ elsif UserProfile.where(card_background_upload_id: original_upload.id).count
+ log "Used as a UserProfile card background"
+ elsif Category.where(uploaded_logo_id: original_upload.id).count
+ log "Used as a Category logo"
+ elsif Category.where(uploaded_background_id: original_upload.id).count
+ log "Used as a Category background"
+ elsif CustomEmoji.where(upload_id: original_upload.id).count
+ log "Used as a CustomEmoji"
+ elsif ThemeField.where(upload_id: original_upload.id).count
+ log "Used as a ThemeField"
+ else
+ success = false
+ end
+ end
+
+ unless success
+ if ENV["INTERACTIVE"]
+ print "Press any key to continue with the upload"
+ STDIN.beep
+ STDIN.getch
+ puts " k"
+ elsif !existing_upload && !Upload.where(url: upload.url).exists?
+ # We're bailing, so clean up the just uploaded file
+ Discourse.store.remove_upload(upload)
+
+ log "⏩ Skipping"
+ return
+ end
 end
 
 upload.save!
 
- if new_file
- upload.optimized_images.each(&:destroy!)
- Discourse.store.remove_upload(original_upload)
- else
+ if existing_upload
 begin
 PostUpload.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
 rescue ActiveRecord::RecordNotUnique, PG::UniqueViolation
@@ -90,85 +191,162 @@ def downsize_upload(upload, path, max_image_pixels)
 Category.where(uploaded_background_id: original_upload.id).update_all(uploaded_background_id: upload.id)
 CustomEmoji.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
 ThemeField.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
+ else
+ upload.optimized_images.each(&:destroy!)
 end
 
- original_upload.posts.each do |post|
- post.raw.gsub!(/upload:\/\/#{original_upload.base62_sha1}(\.#{original_upload.extension})?/, upload.short_url)
- post.raw.gsub!(Discourse.store.cdn_url(original_upload.url), Discourse.store.cdn_url(upload.url))
+ posts.each do |post|
+ DistributedMutex.synchronize("process_post_#{post.id}") do
+ current_post = Post.unscoped.find(post.id)
 
- if post.raw_changed? 
- puts "updating post #{post.id}" if ENV["VERBOSE"] - post.save! - else - puts "Could find the upload path in post.raw (post_id: #{post.id})" if ENV["VERBOSE"] + # If the post became outdated, reapply changes + if current_post.updated_at != post.updated_at + transform_post(current_post, original_upload, upload) + post = current_post + end + + if post.raw_changed? + post.update_columns( + raw: post.raw, + updated_at: Time.zone.now + ) + end + + if existing_upload && post.custom_fields[Post::DOWNLOADED_IMAGES].present? + downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES]) + + downloaded_images.transform_values! do |upload_id| + upload_id == original_upload.id ? upload.id : upload_id + end + + post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present? + post.save_custom_fields + end + + post.rebake! end - - post.rebake! end - original_upload.reload.destroy! unless new_file - - puts "" if ENV["VERBOSE"] + if existing_upload + original_upload.reload.destroy! + else + Discourse.store.remove_upload(original_upload) + end true end -scope = Upload - .where("LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')") - .where("COALESCE(width, 0) = 0 OR COALESCE(height, 0) = 0 OR COALESCE(thumbnail_width, 0) = 0 OR COALESCE(thumbnail_height, 0) = 0 OR width * height > ?", max_image_pixels) - -puts "Uploads to process: #{scope.count}" - -scope.find_each do |upload| - print "\rFixed dimensions: %8d Downsized: %8d (upload id: #{upload.id})".freeze % [dimensions_count, downsized_count] - puts "\n" if ENV["VERBOSE"] - - source = upload.local? ? Discourse.store.path_for(upload) : "https:#{upload.url}" - - unless source - puts "no path or URL" if ENV["VERBOSE"] - next +def process_uploads + unless SiteSetting.Upload.enable_s3_uploads + puts "This script supports only S3 uploads" + return end - w, h = FastImage.size(source, timeout: 10) + puts "", "Downsizing images to no more than #{MAX_IMAGE_PIXELS} pixels" - if !w || !h - puts "invalid image dimensions" if ENV["VERBOSE"] - next + dimensions_count = 0 + downsized_count = 0 + + scope = Upload.where("LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')") + scope = scope.where(<<-SQL, MAX_IMAGE_PIXELS) + COALESCE(width, 0) = 0 OR + COALESCE(height, 0) = 0 OR + COALESCE(thumbnail_width, 0) = 0 OR + COALESCE(thumbnail_height, 0) = 0 OR + width * height > ? + SQL + + if ENV["WORKER_ID"] && ENV["WORKER_COUNT"] + scope = scope.where("id % ? = ?", ENV["WORKER_COUNT"], ENV["WORKER_ID"]) end - ww, hh = ImageSizer.resize(w, h) + skipped = 0 + total_count = scope.count + puts "Uploads to process: #{total_count}" - if w == 0 || h == 0 || ww == 0 || hh == 0 - puts "invalid image dimensions" if ENV["VERBOSE"] - next - end + scope.find_each.with_index do |upload, index| + progress = (index * 100.0 / total_count).round(1) - if upload.read_attribute(:width) != w || upload.read_attribute(:height) != h || upload.read_attribute(:thumbnail_width) != ww || upload.read_attribute(:thumbnail_height) != hh - puts "Correcting the upload dimensions" if ENV["VERBOSE"] - dimensions_count += 1 + log "\n" + print "\r#{progress}% Fixed dimensions: #{dimensions_count} Downsized: #{downsized_count} Skipped: #{skipped} (upload id: #{upload.id})" + log "\n" - upload.update!( + source = upload.local? ? 
Discourse.store.path_for(upload) : "https:#{upload.url}" + + unless source + log "No path or URL" + skipped += 1 + next + end + + begin + w, h = FastImage.size(source, timeout: 15, raise_on_failure: true) + rescue FastImage::ImageFetchFailure + log "Retrying image resizing" + w, h = FastImage.size(source, timeout: 15) + rescue FastImage::UnknownImageType + log "Unknown image type" + skipped += 1 + next + rescue FastImage::SizeNotFound + log "Size not found" + skipped += 1 + next + end + + if !w || !h + log "Invalid image dimensions" + skipped += 1 + next + end + + ww, hh = ImageSizer.resize(w, h) + + if w == 0 || h == 0 || ww == 0 || hh == 0 + log "Invalid image dimensions" + skipped += 1 + next + end + + upload.attributes = { width: w, height: h, thumbnail_width: ww, - thumbnail_height: hh, - ) + thumbnail_height: hh + } + + if upload.changed? + log "Correcting the upload dimensions" + log "Before: #{upload.width_was}x#{upload.height_was} #{upload.thumbnail_width_was}x#{upload.thumbnail_height_was}" + log "After: #{w}x#{h} #{ww}x#{hh}" + + dimensions_count += 1 + upload.save! + end + + if w * h < MAX_IMAGE_PIXELS + log "Image size within allowed range" + skipped += 1 + next + end + + path = upload.local? ? source : (Discourse.store.download(upload) rescue nil)&.path + + unless path + log "No image path" + skipped += 1 + next + end + + if downsize_upload(upload, path) + downsized_count += 1 + else + skipped += 1 + end end - if w * h < max_image_pixels - puts "image size within allowed range" if ENV["VERBOSE"] - next - end - - path = upload.local? ? source : (Discourse.store.download(upload) rescue nil)&.path - - unless path - puts "no image path" if ENV["VERBOSE"] - next - end - - downsized_count += 1 if downsize_upload(upload, path, max_image_pixels) + STDIN.beep + puts "", "Done", Time.zone.now end -puts "", "Done" +process_uploads
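
Usage sketch (illustrative only; assumes the script is run from the Discourse application
root with the Rails environment available, e.g. RAILS_ENV=production, and that the
hypothetical numbers below are adjusted to your site):

  # One terminal per worker; WORKER_ID ranges from 0 to WORKER_COUNT - 1.
  # The optional first argument is the maximum pixel area; it defaults to 1,000,000
  # and is clamped to at least 500,000 (MIN_IMAGE_PIXELS).
  VERBOSE=1 WORKER_COUNT=4 WORKER_ID=0 ruby script/downsize_uploads.rb 2000000

The "#{MAX_IMAGE_PIXELS}@" geometry string passed to OptimizedImage.downsize uses
ImageMagick's area geometry: the image is resized so that its total pixel count
(width times height) does not exceed MAX_IMAGE_PIXELS.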