discourse/script/import_scripts/bespoke_1.rb
Leonardo Mosquera c83914e2e5
FIX: fix normalize_raw method for nil inputs in migration scripts (#22304)
Various migration scripts define a normalize_raw method to do custom processing of post contents before storing it in the Post.raw and other fields.

They normally do not handle nil inputs, but it's a relatively common occurrence in data dumps.

Since this method is used from various points in the migration script, as it stands, the experience of using a migration script is that it will fail multiple times at different points, forcing you to fix the data or apply logic hacks every time then restarting.

This PR generalizes handling of nil input by returning a <missing> string.

Pros:

    no more messy repeated crashes + restarts
    consistency

Cons:

    it might hide data issues
        OTOH we can't print a warning on that method because it will flood the console since it's called from inside loops.

* FIX: zendesk import script: support nil inputs in normalize_raw
* FIX: return '<missing>' instead of empty string; do it for all methods
2023-06-29 13:22:47 -03:00

266 lines
6.2 KiB
Ruby

# frozen_string_literal: true
# bespoke importer for a customer, feel free to borrow ideas
require "csv"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/bespoke_1.rb
class ImportScripts::Bespoke < ImportScripts::Base
BATCH_SIZE = 1000
def initialize(path)
@path = path
super()
@bbcode_to_md = true
puts "loading post mappings..."
@post_number_map = {}
Post
.pluck(:id, :post_number)
.each { |post_id, post_number| @post_number_map[post_id] = post_number }
end
def created_post(post)
@post_number_map[post.id] = post.post_number
super
end
def execute
import_users
import_categories
import_posts
end
class RowResolver
def load(row)
@row = row
end
def self.create(cols)
Class.new(RowResolver).new(cols)
end
def initialize(cols)
cols.each_with_index { |col, idx| self.class.public_send(:define_method, col) { @row[idx] } }
end
end
def load_user_batch!(users, offset, total)
if users.length > 0
create_users(users, offset: offset, total: total) { |user| user }
users.clear
end
end
def csv_parse(name)
filename = "#{@path}/#{name}.csv"
first = true
row = nil
current_row = +""
double_quote_count = 0
File
.open(filename)
.each_line do |line|
# escaping is mental here
line.gsub!(/\\(.{1})/) { |m| m[-1] == '"' ? '""' : m[-1] }
line.strip!
current_row << "\n" unless current_row.empty?
current_row << line
double_quote_count += line.scan('"').count
next if double_quote_count % 2 == 1
raw =
begin
CSV.parse(current_row)
rescue CSV::MalformedCSVError => e
puts e.message
puts "*" * 100
puts "Bad row skipped, line is: #{line}"
puts
puts current_row
puts
puts "double quote count is : #{double_quote_count}"
puts "*" * 100
current_row = ""
double_quote_count = 0
next
end[
0
]
if first
row = RowResolver.create(raw)
current_row = ""
double_quote_count = 0
first = false
next
end
row.load(raw)
yield row
current_row = ""
double_quote_count = 0
end
end
def total_rows(table)
File.foreach("#{@path}/#{table}.csv").inject(0) { |c, line| c + 1 } - 1
end
def import_users
puts "", "creating users"
count = 0
users = []
total = total_rows("users")
csv_parse("users") do |row|
id = row.id
email = row.email
# fake it
email = fake_email if row.email.blank? || row.email !~ /@/
name = row.display_name
username = row.key_custom
created_at = DateTime.parse(row.dcreate)
username = name if username == "NULL"
username = email.split("@")[0] if username.blank?
name = email.split("@")[0] if name.blank?
users << { id: id, email: email, name: name, username: username, created_at: created_at }
count += 1
load_user_batch! users, count - users.length, total if count % BATCH_SIZE == 0
end
load_user_batch! users, count, total
end
def import_categories
rows = []
csv_parse("categories") do |row|
rows << { id: row.id, name: row.name, description: row.description }
end
create_categories(rows) { |row| row }
end
def normalize_raw!(raw)
return "<missing>" if raw.blank?
# purple and #1223f3
raw.gsub!(/\[color=[#a-z0-9]+\]/i, "")
raw.gsub!(%r{\[/color\]}i, "")
raw.gsub!(%r{\[signature\].+\[/signature\]}im, "")
raw
end
def import_post_batch!(posts, topics, offset, total)
create_posts(posts, total: total, offset: offset) do |post|
mapped = {}
mapped[:id] = post[:id]
mapped[:user_id] = user_id_from_imported_user_id(post[:user_id]) || -1
mapped[:raw] = post[:body]
mapped[:created_at] = post[:created_at]
topic = topics[post[:topic_id]]
if topic[:post_id]
parent = topic_lookup_from_imported_post_id(topic[:post_id])
next unless parent
mapped[:topic_id] = parent[:topic_id]
reply_to_post_id = post_id_from_imported_post_id(post[:reply_id])
if reply_to_post_id
reply_to_post_number = @post_number_map[reply_to_post_id]
if reply_to_post_number && reply_to_post_number > 1
mapped[:reply_to_post_number] = reply_to_post_number
end
end
else
mapped[:category] = category_id_from_imported_category_id(topic[:category_id])
mapped[:title] = post[:title]
topic[:post_id] = post[:id]
end
next if topic[:deleted] || post[:deleted]
mapped
end
posts.clear
end
def import_posts
puts "", "creating topics and posts"
topic_map = {}
csv_parse("topics") do |topic|
topic_map[topic.id] = {
id: topic.id,
category_id: topic.forum_category_id,
deleted: topic.is_deleted.to_i == 1,
locked: topic.is_locked.to_i == 1,
pinned: topic.is_pinned.to_i == 1,
}
end
total = total_rows("posts")
posts = []
count = 0
csv_parse("posts") do |row|
unless row.dcreate
puts "NO CREATION DATE FOR POST"
p row
next
end
row = {
id: row.id,
topic_id: row.forum_topic_id,
reply_id: row.reply_id,
user_id: row.user_id,
title: row.title,
body: normalize_raw!(row.body),
deleted: row.is_deleted.to_i == 1,
created_at: DateTime.parse(row.dcreate),
}
posts << row
count += 1
if posts.length > 0 && posts.length % BATCH_SIZE == 0
import_post_batch!(posts, topic_map, count - posts.length, total)
end
end
import_post_batch!(posts, topic_map, count - posts.length, total) if posts.length > 0
exit
end
end
unless ARGV[0] && Dir.exist?(ARGV[0])
puts "", "Usage:", "", "bundle exec ruby script/import_scripts/bespoke_1.rb DIRNAME", ""
exit 1
end
ImportScripts::Bespoke.new(ARGV[0]).perform