From 3a73028a70309fd2597a397d05e2ba5c542b298c Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 2 Dec 2021 15:08:22 +1100 Subject: [PATCH] PERF: migrate normalized_emails in a migrations (#15166) Old OnceOff job could perform pretty slowly on sites with millions of emails New implementation operates in batches in a migration, minimizing locking. --- app/jobs/onceoff/migrate_normalized_emails.rb | 11 ---- ...21028_migrate_email_to_normalized_email.rb | 50 +++++++++++++++++++ 2 files changed, 50 insertions(+), 11 deletions(-) delete mode 100644 app/jobs/onceoff/migrate_normalized_emails.rb create mode 100644 db/post_migrate/20211201221028_migrate_email_to_normalized_email.rb diff --git a/app/jobs/onceoff/migrate_normalized_emails.rb b/app/jobs/onceoff/migrate_normalized_emails.rb deleted file mode 100644 index 4b1ff23c0fe..00000000000 --- a/app/jobs/onceoff/migrate_normalized_emails.rb +++ /dev/null @@ -1,11 +0,0 @@ -# frozen_string_literal: true - -module Jobs - class MigrateNormalizedEmails < ::Jobs::Onceoff - def execute_onceoff(args) - ::UserEmail.find_each do |user_email| - user_email.update(normalized_email: user_email.normalize_email) - end - end - end -end diff --git a/db/post_migrate/20211201221028_migrate_email_to_normalized_email.rb b/db/post_migrate/20211201221028_migrate_email_to_normalized_email.rb new file mode 100644 index 00000000000..2839c8f2fd0 --- /dev/null +++ b/db/post_migrate/20211201221028_migrate_email_to_normalized_email.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +class MigrateEmailToNormalizedEmail < ActiveRecord::Migration[6.1] + + # run outside the wrapping DDL transaction to minimize locking on the user_emails table + disable_ddl_transaction! + + def up + + min, max = DB.query_single "SELECT MIN(id), MAX(id) FROM user_emails" + # batch bounds are picked by row position (see bounds query), not fixed id steps, to compensate for "holes" where records were deleted + # and pathological cases where for some reason id 100_000_000 and 0 exist + + # min is nil when user_emails has no rows; avoid doing any work on empty dbs + return if min.nil? 
+ + bounds = DB.query_single <<~SQL + SELECT t.id + FROM ( + SELECT *, row_number() OVER(ORDER BY id ASC) AS row + FROM user_emails + ) t + WHERE t.row % 100000 = 0 + SQL + + # subtle: each batch filters on id >= low AND id < high (not <=), + # so the final bound must be one past max for the last row to be included + bounds << (max + 1) + + low_id = min + bounds.each do |high_id| + + # normalized email: part before the first plus sign or @ with dots removed, then everything from the first @ onward; execute is used because MiniSQL is not logging at the moment + # to_i is not needed, but specified so it is explicit there is no SQL injection + execute <<~SQL + UPDATE user_emails + SET normalized_email = REPLACE(REGEXP_REPLACE(email,'([+@].*)',''),'.','') || REGEXP_REPLACE(email, '[^@]*', '') + WHERE (normalized_email IS NULL OR normalized_email <> (REPLACE(REGEXP_REPLACE(email,'([+@].*)',''),'.','') || REGEXP_REPLACE(email, '[^@]*', ''))) + AND (id >= #{low_id.to_i} AND id < #{high_id.to_i}) + SQL + + low_id = high_id + end + + end + + def down + execute "UPDATE user_emails SET normalized_email = null" + end +end