DEV: Bulk imports should find existing users by email (#14468)

Without this change, bulk imports unconditionally create new user records even when a user with the same email address exists.
This commit is contained in:
Gerhard Schlager 2021-09-29 00:20:06 +02:00 committed by GitHub
parent 574cb28b0e
commit a4d0d866aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 30 additions and 9 deletions

View File

@@ -153,6 +153,7 @@ class BulkImport::Base
puts "Loading imported user ids..."
@users, imported_user_ids = imported_ids("user")
@last_imported_user_id = imported_user_ids.max || -1
@pre_existing_user_ids = Set.new
puts "Loading imported category ids..."
@categories, imported_category_ids = imported_ids("category")
@@ -197,7 +198,7 @@ class BulkImport::Base
puts "Loading users indexes..."
@last_user_id = last_id(User)
@last_user_email_id = last_id(UserEmail)
@emails = User.unscoped.joins(:user_emails).pluck(:"user_emails.email").to_set
@emails = User.unscoped.joins(:user_emails).pluck(:"user_emails.email", :"user_emails.user_id").to_h
@usernames_lower = User.unscoped.pluck(:username_lower).to_set
@mapped_usernames = UserCustomField.joins(:user).where(name: "import_username").pluck("user_custom_fields.value", "users.username").to_h
@@ -393,6 +394,17 @@ class BulkImport::Base
end
def process_user(user)
if user[:email].present?
user[:email].downcase!
if existing_user_id = @emails[user[:email]]
@pre_existing_user_ids << existing_user_id
@users[user[:imported_id].to_i] = existing_user_id
user[:skip] = true
return user
end
end
@users[user[:imported_id].to_i] = user[:id] = @last_user_id += 1
imported_username = user[:username].dup
@@ -412,11 +424,6 @@ class BulkImport::Base
end
user[:username_lower] = user[:username].downcase
user[:email] ||= random_email
user[:email].downcase!
# unique email
user[:email] = random_email until user[:email] =~ EmailValidator.email_regex && @emails.add?(user[:email])
user[:trust_level] ||= TrustLevel[1]
user[:active] = true unless user.has_key?(:active)
user[:admin] ||= false
@@ -428,18 +435,28 @@ class BulkImport::Base
end
def process_user_email(user_email)
user_id = @users[user_email[:imported_user_id].to_i]
return { skip: true } if @pre_existing_user_ids.include?(user_id)
user_email[:id] = @last_user_email_id += 1
user_email[:user_id] = @users[user_email[:imported_user_id].to_i]
user_email[:user_id] = user_id
user_email[:primary] = true
user_email[:created_at] ||= NOW
user_email[:updated_at] ||= user_email[:created_at]
user_email[:email] ||= random_email
user_email[:email].downcase!
# unique email
user_email[:email] = random_email until user_email[:email] =~ EmailValidator.email_regex && !@emails.has_key?(user_email[:email])
user_email
end
def process_user_stat(user_stat)
user_stat[:user_id] = @users[user_stat[:imported_user_id].to_i]
user_id = @users[user_stat[:imported_user_id].to_i]
return { skip: true } if @pre_existing_user_ids.include?(user_id)
user_stat[:user_id] = user_id
user_stat[:topics_entered] ||= 0
user_stat[:time_read] ||= 0
user_stat[:days_visited] ||= 0
@@ -455,6 +472,8 @@ class BulkImport::Base
end
def process_user_profile(user_profile)
return { skip: true } if @pre_existing_user_ids.include?(user_profile[:user_id])
user_profile[:bio_raw] = (user_profile[:bio_raw].presence || "").scrub.strip.presence
user_profile[:bio_cooked] = pre_cook(user_profile[:bio_raw]) if user_profile[:bio_raw].present?
user_profile[:views] ||= 0
@@ -697,7 +716,7 @@ class BulkImport::Base
processed = send(process_method_name, mapped)
imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
@raw_connection.put_copy_data columns.map { |c| processed[c] }
@raw_connection.put_copy_data columns.map { |c| processed[c] } unless processed[:skip]
print "\r%7d - %6d/sec" % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
rescue => e
puts "\n"

View File

@@ -83,6 +83,7 @@ class BulkImport::PhpBB &lt; BulkImport::Base
u = {
imported_id: row["user_id"],
username: normalize_text(row["username"]),
email: row["user_email"],
created_at: Time.zone.at(row["user_regdate"].to_i),
last_seen_at: row["user_lastvisit"] == 0 ? Time.zone.at(row["user_regdate"].to_i) : Time.zone.at(row["user_lastvisit"].to_i),
trust_level: row["user_posts"] == 0 ? TrustLevel[0] : TrustLevel[1],

View File

@@ -118,6 +118,7 @@ class BulkImport::VBulletin &lt; BulkImport::Base
imported_id: row[0],
username: normalize_text(row[1]),
name: normalize_text(row[1]),
email: row[2],
created_at: Time.zone.at(row[3]),
date_of_birth: parse_birthday(row[4]),
primary_group_id: group_id_from_imported_id(row[6]),