mirror of
https://github.com/discourse/discourse.git
synced 2024-11-23 22:26:26 +08:00
c83914e2e5
Various migration scripts define a normalize_raw method to do custom processing of post contents before storing them in Post.raw and other fields. These methods normally do not handle nil inputs, but nil is a relatively common occurrence in data dumps. Since the method is called from various points in a migration script, as it stands the script will fail multiple times at different points, forcing you to fix the data or apply logic hacks each time and then restart. This PR generalizes handling of nil input by returning a "<missing>" string. Pros: no more messy repeated crashes and restarts; consistency. Cons: it might hide data issues. OTOH, we can't print a warning in that method because it would flood the console, since it's called from inside loops. * FIX: zendesk import script: support nil inputs in normalize_raw * FIX: return '<missing>' instead of empty string; do it for all methods
266 lines
6.2 KiB
Ruby
266 lines
6.2 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# bespoke importer for a customer, feel free to borrow ideas
|
|
|
|
require "csv"
|
|
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
|
|
|
# Call it like this:
|
|
# RAILS_ENV=production bundle exec ruby script/import_scripts/bespoke_1.rb DIRNAME
|
|
class ImportScripts::Bespoke < ImportScripts::Base
|
|
BATCH_SIZE = 1000
|
|
|
|
# Prepares the importer for the CSV files found under +path+.
#
# Stores the directory in @path, runs the superclass initializer without
# arguments, sets the @bbcode_to_md flag and fills @post_number_map with a
# Post id => post_number lookup built from Post.pluck.
#
# @param path [String] directory containing the CSV files to import
def initialize(path)
  @path = path
  super()
  @bbcode_to_md = true

  puts "loading post mappings..."
  # pluck returns [id, post_number] pairs, which convert straight into the lookup hash
  @post_number_map = Post.pluck(:id, :post_number).to_h
end
|
|
|
|
# Records the post_number of +post+ under its id in @post_number_map, then
# hands over to the superclass implementation (bare +super+ forwards +post+).
#
# @param post [#id, #post_number] the post to register
def created_post(post)
  @post_number_map.store(post.id, post.post_number)
  super
end
|
|
|
|
# Runs the import steps in order: users, then categories, then posts.
# The value of the last step (import_posts) is the return value.
def execute
  %i[import_users import_categories].each { |step| send(step) }
  import_posts
end
|
|
|
|
# Exposes the fields of a CSV data row as reader methods named after the
# header columns (for example +row.email+).
#
# Build instances with RowResolver.create so the generated readers are
# defined on a fresh anonymous subclass instead of on RowResolver itself,
# then call #load with a data row before reading its columns.
class RowResolver
  # Builds a resolver for the given header on a new anonymous subclass.
  #
  # @param cols [Array<String, nil>] header cells, in column order
  # @return [RowResolver]
  def self.create(cols)
    Class.new(RowResolver).new(cols)
  end

  # Defines one reader per header cell; each reader returns the value found
  # at the same index of the currently loaded row.
  #
  # @param cols [Array<String, nil>] header cells, in column order
  def initialize(cols)
    # Look the class up once: a column literally named "class" replaces
    # #class on this resolver, so asking for self.class again afterwards
    # would run the column reader instead of returning the class.
    klass = self.class

    cols.each_with_index do |col, idx|
      # A nil or empty header cell cannot be used as a method name.
      next if col.nil? || col.to_s.strip.empty?

      klass.define_method(col) { @row[idx] }
    end
  end

  # Sets the row whose values the column readers return.
  #
  # @param row [Array] values in the same order as the header
  # @return [Array] the row that was passed in
  def load(row)
    @row = row
  end
end
|
|
|
|
# Passes the accumulated +users+ to create_users and then empties the array
# in place, so the caller can keep appending to the same object.
# Does nothing (and returns nil) when +users+ is empty.
#
# @param users [Array<Hash>] pending user attribute hashes; cleared afterwards
# @param offset [Integer] forwarded to create_users as +offset:+
# @param total [Integer] forwarded to create_users as +total:+
def load_user_batch!(users, offset, total)
  return if users.empty?

  create_users(users, total: total, offset: offset) { |attrs| attrs }
  users.clear
end
|
|
|
|
# Streams "#{@path}/#{name}.csv" and yields a RowResolver for each data row.
#
# The first complete record is treated as the header and is used to build
# the resolver; it is not yielded. Physical lines are accumulated until the
# running count of double quotes is even, so quoted fields may span several
# lines. Records that CSV cannot parse are reported on stdout and skipped,
# and blank lines are ignored.
#
# The same resolver instance is reloaded and yielded for every row, so
# callers must copy the values they need before the next iteration.
#
# @param name [String] file name without the ".csv" extension
# @yieldparam row [RowResolver] resolver loaded with the current data row
# @return [nil]
def csv_parse(name)
  filename = "#{@path}/#{name}.csv"
  resolver = nil

  # Unary plus keeps the buffer mutable even when string literals are frozen.
  current_row = +""
  double_quote_count = 0

  # File.foreach closes the file once the block finishes or raises.
  File.foreach(filename) do |line|
    # escaping is mental here: a backslash-escaped quote becomes the CSV
    # style doubled quote, any other escaped character loses its backslash
    line.gsub!(/\\(.{1})/) { |m| m[-1] == '"' ? '""' : m[-1] }
    line.strip!

    current_row << "\n" unless current_row.empty?
    current_row << line
    double_quote_count += line.count('"')

    # An odd number of quotes means a quoted field is still open and the
    # record continues on the next physical line.
    next if double_quote_count.odd?

    raw =
      begin
        CSV.parse(current_row).first
      rescue CSV::MalformedCSVError => e
        puts e.message
        puts "*" * 100
        puts "Bad row skipped, line is: #{line}"
        puts
        puts current_row
        puts
        puts "double quote count is : #{double_quote_count}"
        puts "*" * 100
        nil
      end

    # Start the next record with a fresh, unfrozen buffer; assigning a plain
    # "" literal here would make the following << raise FrozenError.
    current_row = +""
    double_quote_count = 0

    # nil means the record was malformed or the line was blank.
    next if raw.nil?

    if resolver.nil?
      resolver = RowResolver.create(raw)
      next
    end

    resolver.load(raw)
    yield resolver
  end
end
|
|
|
|
# Counts the rows of "#{@path}/#{table}.csv", excluding the header line.
#
# Physical lines are counted, so a quoted field that spans several lines
# contributes one count per line; the callers in this file only pass the
# result on as the +total+ of a batch import.
#
# @param table [String] file name without the ".csv" extension
# @return [Integer] number of lines after the header, never negative
def total_rows(table)
  line_count = File.foreach("#{@path}/#{table}.csv").count

  # An empty file has no header to subtract; never report a negative total.
  [line_count - 1, 0].max
end
|
|
|
|
# Reads users.csv and imports the users in batches of BATCH_SIZE.
#
# For every row a hash with :id, :email, :name, :username and :created_at is
# queued. Rows whose email is blank or lacks an "@" get the result of
# fake_email instead; a username of "NULL" falls back to the display name,
# and a blank username or name falls back to the part of the email before
# the "@".
def import_users
  puts "", "creating users"

  count = 0
  users = []

  total = total_rows("users")

  csv_parse("users") do |row|
    id = row.id
    email = row.email

    # fake it
    email = fake_email if row.email.blank? || row.email !~ /@/

    name = row.display_name
    username = row.key_custom
    created_at = DateTime.parse(row.dcreate)

    username = name if username == "NULL"
    username = email.split("@")[0] if username.blank?
    name = email.split("@")[0] if name.blank?

    users << { id: id, email: email, name: name, username: username, created_at: created_at }

    count += 1
    load_user_batch! users, count - users.length, total if count % BATCH_SIZE == 0
  end

  # Flush the final partial batch. The offset is the number of rows handed
  # over in earlier batches, matching the in-loop call above; passing the
  # full count would overstate the offset by the size of this batch.
  load_user_batch! users, count - users.length, total
end
|
|
|
|
# Reads categories.csv into hashes with :id, :name and :description and
# hands the whole list to create_categories, whose block returns each hash
# unchanged.
def import_categories
  categories = []

  csv_parse("categories") do |csv_row|
    category = { id: csv_row.id, name: csv_row.name, description: csv_row.description }
    categories.push(category)
  end

  create_categories(categories) { |category| category }
end
|
|
|
|
# Strips forum markup from a post body, modifying +raw+ in place.
#
# Removes [color=...] / [/color] tags (keeping the text between them) and
# removes every [signature]...[/signature] block together with its content.
#
# @param raw [String, nil] post body; mutated when present
# @return [String] the cleaned +raw+, or "<missing>" when +raw+ is blank
def normalize_raw!(raw)
  return "<missing>" if raw.blank?

  # purple and #1223f3
  raw.gsub!(/\[color=[#a-z0-9]+\]/i, "")
  raw.gsub!(%r{\[/color\]}i, "")

  # Non-greedy on purpose: with a greedy match and the /m flag, a body that
  # holds two signature blocks would lose all the text between them.
  # `*` rather than `+` also removes empty signature blocks.
  raw.gsub!(%r{\[signature\].*?\[/signature\]}im, "")
  raw
end
|
|
|
|
# Hands +posts+ to create_posts, mapping each queued post hash to the
# attributes to create, and empties +posts+ in place afterwards.
#
# The first post seen for a topic becomes the topic starter: it gets
# :category and :title, and its id is remembered in the topic entry as
# :post_id. Later posts of that topic are attached to the topic found via
# that first post. The block returns nil (via +next+) for posts that should
# not be created; presumably create_posts skips those — confirm against the
# base class.
#
# @param posts [Array<Hash>] queued post hashes; cleared afterwards
# @param topics [Hash] topic id => topic hash; entries gain :post_id here
# @param offset [Integer] forwarded to create_posts as +offset:+
# @param total [Integer] forwarded to create_posts as +total:+
# @return [Array] the emptied +posts+ array
def import_post_batch!(posts, topics, offset, total)
  create_posts(posts, total: total, offset: offset) do |post|
    topic = topics[post[:topic_id]]

    # A post that points at a topic missing from the topic map cannot be
    # placed anywhere; skip it instead of failing on nil and aborting the run.
    unless topic
      puts "Skipping post #{post[:id]}: unknown topic #{post[:topic_id]}"
      next
    end

    mapped = {
      id: post[:id],
      user_id: user_id_from_imported_user_id(post[:user_id]) || -1,
      raw: post[:body],
      created_at: post[:created_at],
    }

    if topic[:post_id]
      # Reply: attach it to the topic created for the topic's first post.
      parent = topic_lookup_from_imported_post_id(topic[:post_id])
      next unless parent

      mapped[:topic_id] = parent[:topic_id]

      reply_to_post_id = post_id_from_imported_post_id(post[:reply_id])
      if reply_to_post_id
        reply_to_post_number = @post_number_map[reply_to_post_id]
        # Only post numbers above 1 are recorded as a reply target.
        if reply_to_post_number && reply_to_post_number > 1
          mapped[:reply_to_post_number] = reply_to_post_number
        end
      end
    else
      # First post seen for this topic: it carries the category and title.
      mapped[:category] = category_id_from_imported_category_id(topic[:category_id])
      mapped[:title] = post[:title]
      topic[:post_id] = post[:id]
    end

    # Checked last so that a deleted first post is still remembered as the
    # topic's :post_id above.
    next if topic[:deleted] || post[:deleted]

    mapped
  end

  posts.clear
end
|
|
|
|
# Reads topics.csv and posts.csv and imports the posts in batches of
# BATCH_SIZE through import_post_batch!.
#
# Topics are first collected into a hash keyed by topic id. Each post row is
# turned into a hash whose body has been passed through normalize_raw!; rows
# without a creation date are reported on stdout and skipped.
#
# NOTE(review): the method ends with +exit+, which terminates the process as
# soon as the posts are imported, so nothing that the caller would run after
# this method is executed. It is kept as is — confirm whether it is intended.
def import_posts
  puts "", "creating topics and posts"

  topic_map = {}

  csv_parse("topics") do |topic|
    topic_map[topic.id] = {
      id: topic.id,
      category_id: topic.forum_category_id,
      deleted: topic.is_deleted.to_i == 1,
      locked: topic.is_locked.to_i == 1,
      pinned: topic.is_pinned.to_i == 1,
    }
  end

  total = total_rows("posts")

  posts = []
  count = 0

  csv_parse("posts") do |row|
    # blank? rather than a nil check: an empty-string date would otherwise
    # reach DateTime.parse below and raise.
    if row.dcreate.blank?
      puts "NO CREATION DATE FOR POST"
      p row
      next
    end

    posts << {
      id: row.id,
      topic_id: row.forum_topic_id,
      reply_id: row.reply_id,
      user_id: row.user_id,
      title: row.title,
      body: normalize_raw!(row.body),
      deleted: row.is_deleted.to_i == 1,
      created_at: DateTime.parse(row.dcreate),
    }
    count += 1

    # posts is never empty here, so only the batch-size check is needed.
    if posts.length % BATCH_SIZE == 0
      import_post_batch!(posts, topic_map, count - posts.length, total)
    end
  end

  # Flush the final partial batch.
  import_post_batch!(posts, topic_map, count - posts.length, total) if posts.length > 0

  exit
end
|
|
end
|
|
|
|
# Script entry point: the first command-line argument must be an existing
# directory; otherwise the usage text is printed and the process exits with
# status 1.
import_dir = ARGV[0]

if import_dir.nil? || !Dir.exist?(import_dir)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/bespoke_1.rb DIRNAME", ""
  exit 1
end

ImportScripts::Bespoke.new(import_dir).perform
|