discourse/script/import_scripts/jive.rb
Leonardo Mosquera c83914e2e5
FIX: fix normalize_raw method for nil inputs in migration scripts (#22304)
Various migration scripts define a normalize_raw method to do custom processing of post contents before storing them in Post.raw and other fields.

They normally do not handle nil inputs, even though nil values are a relatively common occurrence in data dumps.

Since this method is called from several points in a migration script, the current experience is that the script fails multiple times at different points, forcing you to fix the data or apply logic hacks each time and then restart.

This PR generalizes handling of nil input by returning a <missing> string.

Pros:

    no more messy repeated crashes + restarts
    consistency

Cons:

    it might hide data issues
        On the other hand, we can't print a warning in that method, because it is called from inside loops and would flood the console.

* FIX: zendesk import script: support nil inputs in normalize_raw
* FIX: return '<missing>' instead of empty string; do it for all methods
2023-06-29 13:22:47 -03:00
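
The guard itself is small; the snippet below is a minimal sketch of the shape of the change applied to each importer's normalize_raw-style method. The method name and the "<missing>" placeholder come from the scripts themselves, the cleanup step is elided, and blank? assumes ActiveSupport, which the import scripts load through the Discourse environment.

    # Illustrative sketch only, not part of the file below.
    def normalize_raw!(raw)
      # Data dumps frequently contain nil/blank bodies; return a visible
      # placeholder instead of crashing partway through the import.
      return "<missing>" if raw.blank?

      raw = raw.dup
      # ... importer-specific cleanup of the exported markup goes here ...
      raw
    end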


# frozen_string_literal: true
# Jive importer
require "nokogiri"
require "csv"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
class ImportScripts::Jive < ImportScripts::Base
  BATCH_SIZE = 1000
  CATEGORY_IDS = [2023, 2003, 2004, 2042, 2036, 2029] # categories that should be imported

  def initialize(path)
    @path = path
    super()
    @bbcode_to_md = true

    puts "loading post mappings..."
    @post_number_map = {}
    Post
      .pluck(:id, :post_number)
      .each { |post_id, post_number| @post_number_map[post_id] = post_number }
  end

  def created_post(post)
    @post_number_map[post.id] = post.post_number
    super
  end
  def execute
    import_users
    import_groups
    import_group_members
    import_categories
    import_posts
    # Topic.update_all(closed: true)
  end
  # Wraps a parsed CSV row and exposes each column as a method,
  # using the header row of the file being imported as the column names.
  class RowResolver
    def load(row)
      @row = row
    end

    def self.create(cols)
      Class.new(RowResolver).new(cols)
    end

    def initialize(cols)
      cols.each_with_index { |col, idx| self.class.public_send(:define_method, col) { @row[idx] } }
    end
  end
  def load_user_batch!(users, offset, total)
    if users.length > 0
      create_users(users, offset: offset, total: total) { |user| user }
      users.clear
    end
  end
  # Streams <path>/<name>.csv line by line. Values may contain embedded
  # newlines, so lines are accumulated until the double quotes balance out,
  # then each data row is yielded as a RowResolver.
  def csv_parse(name)
    filename = "#{@path}/#{name}.csv"
    first = true
    row = nil
    current_row = +""
    double_quote_count = 0

    File
      .open(filename)
      .each_line do |line|
        # convert backslash-escaped quotes into CSV-style doubled quotes
        line.gsub!(/\\(.{1})/) { |m| m[-1] == '"' ? '""' : m[-1] }
        line.strip!

        current_row << "\n" unless current_row.empty?
        current_row << line
        double_quote_count += line.scan('"').count

        # an odd number of quotes means the row continues on the next line
        next if double_quote_count % 2 == 1

        raw =
          begin
            CSV.parse(current_row)
          rescue CSV::MalformedCSVError => e
            puts e.message
            puts "*" * 100
            puts "Bad row skipped, line is: #{line}"
            puts
            puts current_row
            puts
            puts "double quote count is : #{double_quote_count}"
            puts "*" * 100
            current_row = ""
            double_quote_count = 0
            next
          end[0]

        # the first row is the header: build the column accessors from it
        if first
          row = RowResolver.create(raw)
          current_row = ""
          double_quote_count = 0
          first = false
          next
        end

        row.load(raw)
        yield row

        current_row = ""
        double_quote_count = 0
      end
  end
  # Number of data rows in the CSV (total line count minus the header).
  def total_rows(table)
    File.foreach("#{@path}/#{table}.csv").inject(0) { |c, line| c + 1 } - 1
  end
  def import_groups
    puts "", "importing groups..."

    rows = []
    csv_parse("groups") { |row| rows << { id: row.groupid, name: row.name } }

    create_groups(rows) { |row| row }
  end
  def import_users
    puts "", "creating users"

    count = 0
    users = []
    total = total_rows("users")

    csv_parse("users") do |row|
      id = row.userid
      email = "#{row.email}"

      # fake it
      email = fake_email if row.email.blank? || row.email !~ /@/

      name = "#{row.firstname} #{row.lastname}"
      username = row.username
      created_at = DateTime.parse(row.creationdate)
      last_seen_at = DateTime.parse(row.lastloggedin)
      is_activated = row.userenabled

      username = name if username == "NULL"
      username = email.split("@")[0] if username.blank?
      name = email.split("@")[0] if name.blank?

      users << {
        id: id,
        email: email,
        name: name,
        username: username,
        created_at: created_at,
        last_seen_at: last_seen_at,
        active: is_activated.to_i == 1,
        approved: true,
      }

      count += 1
      load_user_batch! users, count - users.length, total if count % BATCH_SIZE == 0
    end

    load_user_batch! users, count, total
  end
  def import_group_members
    puts "", "importing group members..."

    csv_parse("group_members") do |row|
      user_id = user_id_from_imported_user_id(row.userid)
      group_id = group_id_from_imported_group_id(row.groupid)

      GroupUser.find_or_create_by(user_id: user_id, group_id: group_id) if user_id && group_id
    end
  end
  def import_categories
    rows = []
    csv_parse("communities") do |row|
      # only import the selected communities
      next unless CATEGORY_IDS.include?(row.communityid.to_i)
      rows << { id: row.communityid, name: "#{row.name} (#{row.communityid})" }
    end

    create_categories(rows) { |row| row }
  end
  # Cleans up an exported Jive body before it is stored as the post's raw
  # content. Returns "<missing>" for nil/blank input (see the commit above).
  def normalize_raw!(raw)
    return "<missing>" if raw.blank?
    raw = raw.dup

    raw = raw[5..-6]

    doc = Nokogiri::HTML5.fragment(raw)
    # drop <img> tags carrying Jive's own image class
    doc.css("img").each { |img| img.remove if img["class"] == "jive-image" }
    raw = doc.to_html
    raw = raw[4..-1]

    raw
  end
  def import_post_batch!(posts, topics, offset, total)
    create_posts(posts, total: total, offset: offset) do |post|
      mapped = {}

      mapped[:id] = post[:id]
      mapped[:user_id] = user_id_from_imported_user_id(post[:user_id]) || -1
      mapped[:raw] = post[:body]
      mapped[:created_at] = post[:created_at]

      topic = topics[post[:topic_id]]

      unless topic
        p "MISSING TOPIC #{post[:topic_id]}"
        p post
        next
      end

      if topic[:post_id]
        parent = topic_lookup_from_imported_post_id(topic[:post_id])
        next unless parent

        mapped[:topic_id] = parent[:topic_id]

        reply_to_post_id = post_id_from_imported_post_id(post[:reply_id])
        if reply_to_post_id
          reply_to_post_number = @post_number_map[reply_to_post_id]
          if reply_to_post_number && reply_to_post_number > 1
            mapped[:reply_to_post_number] = reply_to_post_number
          end
        end
      else
        mapped[:category] = category_id_from_imported_category_id(topic[:category_id])
        mapped[:title] = post[:title]
        topic[:post_id] = post[:id]
      end

      next if topic[:deleted] || post[:deleted]

      mapped
    end

    posts.clear
  end
  def import_posts
    puts "", "creating topics and posts"

    topic_map = {}
    thread_map = {}

    # first pass: rows without a parent message become topics
    csv_parse("messages") do |thread|
      next unless CATEGORY_IDS.include?(thread.containerid.to_i)

      if !thread.parentmessageid
        # topic
        thread_map[thread.threadid] = thread.messageid

        # IMAGE UPLOADER
        if thread.imagecount
          Dir.foreach(
            "/var/www/discourse/script/import_scripts/jive/img/#{thread.messageid}",
          ) do |item|
            next if item == "." || item == ".." || item == ".DS_Store"

            photo_path =
              "/var/www/discourse/script/import_scripts/jive/img/#{thread.messageid}/#{item}"
            upload = create_upload(thread.userid, photo_path, File.basename(photo_path))
            if upload.persisted?
              puts "Image upload is successful for #{photo_path}, new path is #{upload.url}!"
              thread.body.gsub!(item, upload.url)
            else
              puts "Error: Image upload is not successful for #{photo_path}!"
            end
          end
        end

        # ATTACHMENT UPLOADER
        if thread.attachmentcount
          Dir.foreach(
            "/var/www/discourse/script/import_scripts/jive/attach/#{thread.messageid}",
          ) do |item|
            next if item == "." || item == ".." || item == ".DS_Store"

            attach_path =
              "/var/www/discourse/script/import_scripts/jive/attach/#{thread.messageid}/#{item}"
            upload = create_upload(thread.userid, attach_path, File.basename(attach_path))
            if upload.persisted?
              puts "Attachment upload is successful for #{attach_path}, new path is #{upload.url}!"
              thread.body.gsub!(item, upload.url)
              thread.body << "<br/><br/> #{attachment_html(upload, item)}"
            else
              puts "Error: Attachment upload is not successful for #{attach_path}!"
            end
          end
        end

        topic_map[thread.messageid] = {
          id: thread.messageid,
          topic_id: thread.messageid,
          category_id: thread.containerid,
          user_id: thread.userid,
          title: thread.subject,
          body: normalize_raw!(thread.body || thread.subject || "<missing>"),
          created_at: DateTime.parse(thread.creationdate),
        }
      end
    end

    total = total_rows("messages")

    posts = []
    count = 0

    topic_map.each do |_, topic|
      posts << topic if topic[:body]
      count += 1
    end

    # second pass: rows with a parent message become replies
    csv_parse("messages") do |thread|
      # post
      next unless CATEGORY_IDS.include?(thread.containerid.to_i)

      if thread.parentmessageid
        # IMAGE UPLOADER
        if thread.imagecount
          Dir.foreach(
            "/var/www/discourse/script/import_scripts/jive/img/#{thread.messageid}",
          ) do |item|
            next if item == "." || item == ".." || item == ".DS_Store"

            photo_path =
              "/var/www/discourse/script/import_scripts/jive/img/#{thread.messageid}/#{item}"
            upload = create_upload(thread.userid, photo_path, File.basename(photo_path))
            if upload.persisted?
              puts "Image upload is successful for #{photo_path}, new path is #{upload.url}!"
              thread.body.gsub!(item, upload.url)
            else
              puts "Error: Image upload is not successful for #{photo_path}!"
            end
          end
        end

        # ATTACHMENT UPLOADER
        if thread.attachmentcount
          Dir.foreach(
            "/var/www/discourse/script/import_scripts/jive/attach/#{thread.messageid}",
          ) do |item|
            next if item == "." || item == ".." || item == ".DS_Store"

            attach_path =
              "/var/www/discourse/script/import_scripts/jive/attach/#{thread.messageid}/#{item}"
            upload = create_upload(thread.userid, attach_path, File.basename(attach_path))
            if upload.persisted?
              puts "Attachment upload is successful for #{attach_path}, new path is #{upload.url}!"
              thread.body.gsub!(item, upload.url)
              thread.body << "<br/><br/> #{attachment_html(upload, item)}"
            else
              puts "Error: Attachment upload is not successful for #{attach_path}!"
            end
          end
        end

        row = {
          id: thread.messageid,
          topic_id: thread_map["#{thread.threadid}"],
          user_id: thread.userid,
          title: thread.subject,
          body: normalize_raw!(thread.body),
          created_at: DateTime.parse(thread.creationdate),
        }
        posts << row
        count += 1

        if posts.length > 0 && posts.length % BATCH_SIZE == 0
          import_post_batch!(posts, topic_map, count - posts.length, total)
        end
      end
    end

    import_post_batch!(posts, topic_map, count - posts.length, total) if posts.length > 0
  end
end
unless ARGV[0] && Dir.exist?(ARGV[0])
puts "", "Usage:", "", "bundle exec ruby script/import_scripts/jive.rb DIRNAME", ""
exit 1
end
ImportScripts::Jive.new(ARGV[0]).perform