FIX: Improvements and fixes to the image downsizing script (#9950)

Fixed bugs, added specs, extracted the upload downsizing code to a class, added support for non-S3 setups, changed it so that images aren't downloaded twice.

This code has been tested on production and successfully resized ~180k uploads.

Includes:

* DEV: Extract upload downsizing logic
* DEV: Add support for non-S3 uploads
* DEV: Process only images uploaded by users
* FIX: Incorrect usage of `count` and `exist?` typo
* DEV: Spec S3 image downsizing
* DEV: Avoid downloading images twice
* DEV: Update filesizes earlier in the process
* DEV: Return false on invalid upload
* FIX: Download images that currently above the limit (If the image size limit is decreased, then there was no way to resize those images that now fall outside the allowed size range)
* Update script/downsize_uploads.rb (Co-authored-by: Régis Hanol <regis@hanol.fr>)
This commit is contained in:
Jarek Radosz 2020-06-11 14:47:59 +02:00 committed by GitHub
parent b7e70850e4
commit 3d55f2e3b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 376 additions and 233 deletions

View File

@ -76,13 +76,13 @@ module FileStore
not_implemented not_implemented
end end
def download(upload) def download(upload, max_file_size_kb: nil)
DistributedMutex.synchronize("download_#{upload.sha1}", validity: 3.minutes) do DistributedMutex.synchronize("download_#{upload.sha1}", validity: 3.minutes) do
filename = "#{upload.sha1}#{File.extname(upload.original_filename)}" filename = "#{upload.sha1}#{File.extname(upload.original_filename)}"
file = get_from_cache(filename) file = get_from_cache(filename)
if !file if !file
max_file_size_kb = [SiteSetting.max_image_size_kb, SiteSetting.max_attachment_size_kb].max.kilobytes max_file_size_kb ||= [SiteSetting.max_image_size_kb, SiteSetting.max_attachment_size_kb].max.kilobytes
url = upload.secure? ? url = upload.secure? ?
Discourse.store.signed_url_for_path(upload.url) : Discourse.store.signed_url_for_path(upload.url) :

View File

@ -0,0 +1,236 @@
# frozen_string_literal: true
class ShrinkUploadedImage
attr_reader :upload, :path
def initialize(upload:, path:, max_pixels:, verbose: false, interactive: false)
@upload = upload
@path = path
@max_pixels = max_pixels
@verbose = verbose
@interactive = interactive
end
def perform
OptimizedImage.downsize(path, path, "#{@max_pixels}@", filename: upload.original_filename)
sha1 = Upload.generate_digest(path)
if sha1 == upload.sha1
log "No sha1 change"
return false
end
w, h = FastImage.size(path, timeout: 15, raise_on_failure: true)
if !w || !h
log "Invalid image dimensions after resizing"
return false
end
# Neither #dup or #clone provide a complete copy
original_upload = Upload.find(upload.id)
ww, hh = ImageSizer.resize(w, h)
# A different upload record that matches the sha1 of the downsized image
existing_upload = Upload.find_by(sha1: sha1)
@upload = existing_upload if existing_upload
upload.attributes = {
sha1: sha1,
width: w,
height: h,
thumbnail_width: ww,
thumbnail_height: hh,
filesize: File.size(path)
}
if upload.filesize > upload.filesize_was
log "No filesize reduction"
return false
end
unless existing_upload
url = Discourse.store.store_upload(File.new(path), upload)
unless url
log "Couldn't store the upload"
return false
end
upload.url = url
end
log "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
log "sha: #{original_upload.sha1} -> #{sha1}"
log "(an exisiting upload)" if existing_upload
success = true
posts = Post.unscoped.joins(:post_uploads).where(post_uploads: { upload_id: original_upload.id }).uniq.sort_by(&:created_at)
posts.each do |post|
transform_post(post, original_upload, upload)
if post.custom_fields[Post::DOWNLOADED_IMAGES].present?
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
end
if post.raw_changed?
log "Updating post"
elsif downloaded_images&.has_value?(original_upload.id)
log "A hotlinked, unreferenced image"
elsif post.raw.include?(upload.short_url)
log "Already processed"
elsif post.trashed?
log "A deleted post"
elsif !post.topic || post.topic.trashed?
log "A deleted topic"
elsif post.cooked.include?(original_upload.sha1)
if post.raw.include?("#{Discourse.base_url.sub(/^https?:\/\//i, "")}/t/")
log "Updating a topic onebox"
else
log "Updating an external onebox"
end
else
log "Could not find the upload URL"
success = false
end
log "#{Discourse.base_url}/p/#{post.id}"
end
if posts.empty?
log "Upload not used in any posts"
if User.where(uploaded_avatar_id: original_upload.id).exists?
log "Used as a User avatar"
elsif UserAvatar.where(gravatar_upload_id: original_upload.id).exists?
log "Used as a UserAvatar gravatar"
elsif UserAvatar.where(custom_upload_id: original_upload.id).exists?
log "Used as a UserAvatar custom upload"
elsif UserProfile.where(profile_background_upload_id: original_upload.id).exists?
log "Used as a UserProfile profile background"
elsif UserProfile.where(card_background_upload_id: original_upload.id).exists?
log "Used as a UserProfile card background"
elsif Category.where(uploaded_logo_id: original_upload.id).exists?
log "Used as a Category logo"
elsif Category.where(uploaded_background_id: original_upload.id).exists?
log "Used as a Category background"
elsif CustomEmoji.where(upload_id: original_upload.id).exists?
log "Used as a CustomEmoji"
elsif ThemeField.where(upload_id: original_upload.id).exists?
log "Used as a ThemeField"
else
success = false
end
end
unless success
if @interactive
print "Press any key to continue with the upload"
STDIN.beep
STDIN.getch
puts " k"
else
if !existing_upload && !Upload.where(url: upload.url).exists?
# We're bailing, so clean up the just uploaded file
Discourse.store.remove_upload(upload)
end
log "⏩ Skipping"
return false
end
end
unless upload.save
if !existing_upload && !Upload.where(url: upload.url).exists?
# We're bailing, so clean up the just uploaded file
Discourse.store.remove_upload(upload)
end
log "⏩ Skipping an invalid upload"
return false
end
if existing_upload
begin
PostUpload.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
rescue ActiveRecord::RecordNotUnique, PG::UniqueViolation
end
User.where(uploaded_avatar_id: original_upload.id).update_all(uploaded_avatar_id: upload.id)
UserAvatar.where(gravatar_upload_id: original_upload.id).update_all(gravatar_upload_id: upload.id)
UserAvatar.where(custom_upload_id: original_upload.id).update_all(custom_upload_id: upload.id)
UserProfile.where(profile_background_upload_id: original_upload.id).update_all(profile_background_upload_id: upload.id)
UserProfile.where(card_background_upload_id: original_upload.id).update_all(card_background_upload_id: upload.id)
Category.where(uploaded_logo_id: original_upload.id).update_all(uploaded_logo_id: upload.id)
Category.where(uploaded_background_id: original_upload.id).update_all(uploaded_background_id: upload.id)
CustomEmoji.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
ThemeField.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
else
upload.optimized_images.each(&:destroy!)
end
posts.each do |post|
DistributedMutex.synchronize("process_post_#{post.id}") do
current_post = Post.unscoped.find(post.id)
# If the post became outdated, reapply changes
if current_post.updated_at != post.updated_at
transform_post(current_post, original_upload, upload)
post = current_post
end
if post.raw_changed?
post.update_columns(
raw: post.raw,
updated_at: Time.zone.now
)
end
if existing_upload && post.custom_fields[Post::DOWNLOADED_IMAGES].present?
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
downloaded_images.transform_values! do |upload_id|
upload_id == original_upload.id ? upload.id : upload_id
end
post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present?
post.save_custom_fields
end
post.rebake!
end
end
if existing_upload
original_upload.reload.destroy!
else
Discourse.store.remove_upload(original_upload)
end
true
end
private
def transform_post(post, upload_before, upload_after)
post.raw.gsub!(/upload:\/\/#{upload_before.base62_sha1}(\.#{upload_before.extension})?/i, upload_after.short_url)
post.raw.gsub!(Discourse.store.cdn_url(upload_before.url), Discourse.store.cdn_url(upload_after.url))
post.raw.gsub!("#{Discourse.base_url}#{upload_before.short_path}", "#{Discourse.base_url}#{upload_after.short_path}")
if SiteSetting.enable_s3_uploads
post.raw.gsub!(Discourse.store.url_for(upload_before), Discourse.store.url_for(upload_after))
path = SiteSetting.Upload.s3_upload_bucket.split("/", 2)[1]
post.raw.gsub!(/<img src=\"https:\/\/.+?\/#{path}\/uploads\/default\/optimized\/.+?\/#{upload_before.sha1}_\d_(?<width>\d+)x(?<height>\d+).*?\" alt=\"(?<alt>.*?)\"\/?>/i) do
"![#{$~[:alt]}|#{$~[:width]}x#{$~[:height]}](#{upload_after.short_url})"
end
end
post.raw.gsub!(/!\[(.*?)\]\(\/uploads\/.+?\/#{upload_before.sha1}(\.#{upload_before.extension})?\)/i, "![\\1](#{upload_after.short_url})")
end
def log(*args)
puts(*args) if @verbose
end
end

View File

@ -29,225 +29,13 @@ def log(*args)
puts(*args) if ENV["VERBOSE"] puts(*args) if ENV["VERBOSE"]
end end
def transform_post(post, upload_before, upload_after)
post.raw.gsub!(/upload:\/\/#{upload_before.base62_sha1}(\.#{upload_before.extension})?/i, upload_after.short_url)
post.raw.gsub!(Discourse.store.cdn_url(upload_before.url), Discourse.store.cdn_url(upload_after.url))
post.raw.gsub!(Discourse.store.url_for(upload_before), Discourse.store.url_for(upload_after))
post.raw.gsub!("#{Discourse.base_url}#{upload_before.short_path}", "#{Discourse.base_url}#{upload_after.short_path}")
path = SiteSetting.Upload.s3_upload_bucket.split("/", 2)[1]
post.raw.gsub!(/<img src=\"https:\/\/.+?\/#{path}\/uploads\/default\/optimized\/.+?\/#{upload_before.sha1}_\d_(?<width>\d+)x(?<height>\d+).*?\" alt=\"(?<alt>.*?)\"\/?>/i) do
"![#{$~[:alt]}|#{$~[:width]}x#{$~[:height]}](#{upload_after.short_url})"
end
post.raw.gsub!(/!\[(.*?)\]\(\/uploads\/.+?\/#{upload_before.sha1}(\.#{upload_before.extension})?\)/i, "![\\1](#{upload_after.short_url})")
end
def downsize_upload(upload, path)
# Make sure the filesize is up to date
upload.filesize = File.size(path)
OptimizedImage.downsize(path, path, "#{MAX_IMAGE_PIXELS}@", filename: upload.original_filename)
sha1 = Upload.generate_digest(path)
if sha1 == upload.sha1
log "No sha1 change"
return
end
w, h = FastImage.size(path, timeout: 15, raise_on_failure: true)
if !w || !h
log "Invalid image dimensions after resizing"
return
end
# Neither #dup or #clone provide a complete copy
original_upload = Upload.find(upload.id)
ww, hh = ImageSizer.resize(w, h)
# A different upload record that matches the sha1 of the downsized image
existing_upload = Upload.find_by(sha1: sha1)
upload = existing_upload if existing_upload
upload.attributes = {
sha1: sha1,
width: w,
height: h,
thumbnail_width: ww,
thumbnail_height: hh,
filesize: File.size(path)
}
if upload.filesize > upload.filesize_was
log "No filesize reduction"
return
end
unless existing_upload
url = Discourse.store.store_upload(File.new(path), upload)
unless url
log "Couldn't store the upload"
return
end
upload.url = url
end
log "base62: #{original_upload.base62_sha1} -> #{Upload.base62_sha1(sha1)}"
log "sha: #{original_upload.sha1} -> #{sha1}"
log "(an exisiting upload)" if existing_upload
success = true
posts = Post.unscoped.joins(:post_uploads).where(post_uploads: { upload_id: original_upload.id }).uniq.sort_by(&:created_at)
posts.each do |post|
transform_post(post, original_upload, upload)
if post.custom_fields[Post::DOWNLOADED_IMAGES].present?
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
end
if post.raw_changed?
log "Updating post"
elsif downloaded_images&.has_value?(original_upload.id)
log "A hotlinked, unreferenced image"
elsif post.raw.include?(upload.short_url)
log "Already processed"
elsif post.trashed?
log "A deleted post"
elsif !post.topic || post.topic.trashed?
log "A deleted topic"
elsif post.cooked.include?(original_upload.sha1)
if post.raw.include?("#{Discourse.base_url.sub(/^https?:\/\//i, "")}/t/")
log "Updating a topic onebox"
else
log "Updating an external onebox"
end
else
log "Could not find the upload URL"
success = false
end
log "#{Discourse.base_url}/p/#{post.id}"
end
if posts.empty?
log "Upload not used in any posts"
if User.where(uploaded_avatar_id: original_upload.id).count
log "Used as a User avatar"
elsif UserAvatar.where(gravatar_upload_id: original_upload.id).count
log "Used as a UserAvatar gravatar"
elsif UserAvatar.where(custom_upload_id: original_upload.id).count
log "Used as a UserAvatar custom upload"
elsif UserProfile.where(profile_background_upload_id: original_upload.id).count
log "Used as a UserProfile profile background"
elsif UserProfile.where(card_background_upload_id: original_upload.id).count
log "Used as a UserProfile card background"
elsif Category.where(uploaded_logo_id: original_upload.id).count
log "Used as a Category logo"
elsif Category.where(uploaded_background_id: original_upload.id).count
log "Used as a Category background"
elsif CustomEmoji.where(upload_id: original_upload.id).count
log "Used as a CustomEmoji"
elsif ThemeField.where(upload_id: original_upload.id).count
log "Used as a ThemeField"
else
success = false
end
end
unless success
if ENV["INTERACTIVE"]
print "Press any key to continue with the upload"
STDIN.beep
STDIN.getch
puts " k"
elsif !existing_upload && !Upload.where(url: upload.url).exist?
# We're bailing, so clean up the just uploaded file
Discourse.store.remove_upload(upload)
log "⏩ Skipping"
return
end
end
upload.save!
if existing_upload
begin
PostUpload.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
rescue ActiveRecord::RecordNotUnique, PG::UniqueViolation
end
User.where(uploaded_avatar_id: original_upload.id).update_all(uploaded_avatar_id: upload.id)
UserAvatar.where(gravatar_upload_id: original_upload.id).update_all(gravatar_upload_id: upload.id)
UserAvatar.where(custom_upload_id: original_upload.id).update_all(custom_upload_id: upload.id)
UserProfile.where(profile_background_upload_id: original_upload.id).update_all(profile_background_upload_id: upload.id)
UserProfile.where(card_background_upload_id: original_upload.id).update_all(card_background_upload_id: upload.id)
Category.where(uploaded_logo_id: original_upload.id).update_all(uploaded_logo_id: upload.id)
Category.where(uploaded_background_id: original_upload.id).update_all(uploaded_background_id: upload.id)
CustomEmoji.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
ThemeField.where(upload_id: original_upload.id).update_all(upload_id: upload.id)
else
upload.optimized_images.each(&:destroy!)
end
posts.each do |post|
DistributedMutex.synchronize("process_post_#{post.id}") do
current_post = Post.unscoped.find(post.id)
# If the post became outdated, reapply changes
if current_post.updated_at != post.updated_at
transform_post(current_post, original_upload, upload)
post = current_post
end
if post.raw_changed?
post.update_columns(
raw: post.raw,
updated_at: Time.zone.now
)
end
if existing_upload && post.custom_fields[Post::DOWNLOADED_IMAGES].present?
downloaded_images = JSON.parse(post.custom_fields[Post::DOWNLOADED_IMAGES])
downloaded_images.transform_values! do |upload_id|
upload_id == original_upload.id ? upload.id : upload_id
end
post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_images.to_json if downloaded_images.present?
post.save_custom_fields
end
post.rebake!
end
end
if existing_upload
original_upload.reload.destroy!
else
Discourse.store.remove_upload(original_upload)
end
true
end
def process_uploads def process_uploads
unless SiteSetting.Upload.enable_s3_uploads
puts "This script supports only S3 uploads"
return
end
puts "", "Downsizing images to no more than #{MAX_IMAGE_PIXELS} pixels" puts "", "Downsizing images to no more than #{MAX_IMAGE_PIXELS} pixels"
dimensions_count = 0 dimensions_count = 0
downsized_count = 0 downsized_count = 0
scope = Upload.where("LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')") scope = Upload.by_users.where("LOWER(extension) IN ('jpg', 'jpeg', 'gif', 'png')")
scope = scope.where(<<-SQL, MAX_IMAGE_PIXELS) scope = scope.where(<<-SQL, MAX_IMAGE_PIXELS)
COALESCE(width, 0) = 0 OR COALESCE(width, 0) = 0 OR
COALESCE(height, 0) = 0 OR COALESCE(height, 0) = 0 OR
@ -271,19 +59,20 @@ def process_uploads
print "\r#{progress}% Fixed dimensions: #{dimensions_count} Downsized: #{downsized_count} Skipped: #{skipped} (upload id: #{upload.id})" print "\r#{progress}% Fixed dimensions: #{dimensions_count} Downsized: #{downsized_count} Skipped: #{skipped} (upload id: #{upload.id})"
log "\n" log "\n"
source = upload.local? ? Discourse.store.path_for(upload) : "https:#{upload.url}" path = if upload.local?
Discourse.store.path_for(upload)
else
(Discourse.store.download(upload, max_file_size_kb: 100.megabytes) rescue nil)&.path
end
unless source unless path
log "No path or URL" log "No image path"
skipped += 1 skipped += 1
next next
end end
begin begin
w, h = FastImage.size(source, timeout: 15, raise_on_failure: true) w, h = FastImage.size(path, raise_on_failure: true)
rescue FastImage::ImageFetchFailure
log "Retrying image resizing"
w, h = FastImage.size(source, timeout: 15)
rescue FastImage::UnknownImageType rescue FastImage::UnknownImageType
log "Unknown image type" log "Unknown image type"
skipped += 1 skipped += 1
@ -312,13 +101,14 @@ def process_uploads
width: w, width: w,
height: h, height: h,
thumbnail_width: ww, thumbnail_width: ww,
thumbnail_height: hh thumbnail_height: hh,
filesize: File.size(path)
} }
if upload.changed? if upload.changed?
log "Correcting the upload dimensions" log "Correcting the upload dimensions"
log "Before: #{upload.width_was}x#{upload.height_was} #{upload.thumbnail_width_was}x#{upload.thumbnail_height_was}" log "Before: #{upload.width_was}x#{upload.height_was} #{upload.thumbnail_width_was}x#{upload.thumbnail_height_was} (#{upload.filesize_was})"
log "After: #{w}x#{h} #{ww}x#{hh}" log "After: #{w}x#{h} #{ww}x#{hh} (#{upload.filesize})"
dimensions_count += 1 dimensions_count += 1
upload.save! upload.save!
@ -330,15 +120,15 @@ def process_uploads
next next
end end
path = upload.local? ? source : (Discourse.store.download(upload) rescue nil)&.path result = ShrinkUploadedImage.new(
upload: upload,
path: path,
max_pixels: MAX_IMAGE_PIXELS,
verbose: ENV["VERBOSE"],
interactive: ENV["INTERACTIVE"]
).perform
unless path if result
log "No image path"
skipped += 1
next
end
if downsize_upload(upload, path)
downsized_count += 1 downsized_count += 1
else else
skipped += 1 skipped += 1

View File

@ -66,6 +66,20 @@ Fabricator(:upload_s3, from: :upload) do
end end
end end
Fabricator(:s3_image_upload, from: :upload_s3) do
after_create do |upload|
file = Tempfile.new(['fabricated', '.png'])
`convert -size #{upload.width}x#{upload.height} xc:white "#{file.path}"`
Discourse.store.store_upload(file, upload)
upload.sha1 = Upload.generate_digest(file.path)
WebMock
.stub_request(:get, upload.url)
.to_return(status: 200, body: File.new(file.path))
end
end
Fabricator(:secure_upload_s3, from: :upload_s3) do Fabricator(:secure_upload_s3, from: :upload_s3) do
secure true secure true
sha1 { SecureRandom.hex(20) } sha1 { SecureRandom.hex(20) }

View File

@ -0,0 +1,103 @@
# frozen_string_literal: true
require 'rails_helper'
describe ShrinkUploadedImage do
context "when local uploads are enabled" do
let(:upload) { Fabricate(:image_upload, width: 200, height: 200) }
it "resizes the image" do
filesize_before = upload.filesize
post = Fabricate(:post, raw: "<img src='#{upload.url}'>")
post.link_post_uploads
result = ShrinkUploadedImage.new(
upload: upload,
path: Discourse.store.path_for(upload),
max_pixels: 10_000
).perform
expect(result).to be(true)
expect(upload.width).to eq(100)
expect(upload.height).to eq(100)
expect(upload.filesize).to be < filesize_before
end
it "returns false if the image is not used by any models" do
result = ShrinkUploadedImage.new(
upload: upload,
path: Discourse.store.path_for(upload),
max_pixels: 10_000
).perform
expect(result).to be(false)
end
it "returns false if the image cannot be shrunk more" do
post = Fabricate(:post, raw: "<img src='#{upload.url}'>")
post.link_post_uploads
ShrinkUploadedImage.new(
upload: upload,
path: Discourse.store.path_for(upload),
max_pixels: 10_000
).perform
upload.reload
result = ShrinkUploadedImage.new(
upload: upload,
path: Discourse.store.path_for(upload),
max_pixels: 10_000
).perform
expect(result).to be(false)
end
it "returns false when the upload is above the size limit" do
post = Fabricate(:post, raw: "<img src='#{upload.url}'>")
post.link_post_uploads
SiteSetting.max_image_size_kb = 0.001 # 1 byte
result = ShrinkUploadedImage.new(
upload: upload,
path: Discourse.store.path_for(upload),
max_pixels: 10_000
).perform
expect(result).to be(false)
end
end
context "when S3 uploads are enabled" do
let(:upload) { Fabricate(:s3_image_upload, width: 200, height: 200) }
before do
SiteSetting.enable_s3_uploads = true
SiteSetting.s3_access_key_id = "fakeid7974664"
SiteSetting.s3_secret_access_key = "fakesecretid7974664"
store = FileStore::S3Store.new
s3_helper = store.instance_variable_get(:@s3_helper)
client = Aws::S3::Client.new(stub_responses: true)
s3_helper.stubs(:s3_client).returns(client)
Discourse.stubs(:store).returns(store)
end
it "resizes the image" do
filesize_before = upload.filesize
post = Fabricate(:post, raw: "<img src='#{upload.url}'>")
post.link_post_uploads
result = ShrinkUploadedImage.new(
upload: upload,
path: Discourse.store.download(upload).path,
max_pixels: 10_000
).perform
expect(result).to be(true)
expect(upload.width).to eq(100)
expect(upload.height).to eq(100)
expect(upload.filesize).to be < filesize_before
end
end
end