discourse/script/import_scripts/zendesk.rb
Leonardo Mosquera c83914e2e5
FIX: fix normalize_raw method for nil inputs in migration scripts (#22304)
Various migration scripts define a normalize_raw method to do custom processing of post contents before storing them in Post.raw and other fields.

They normally do not handle nil inputs, but it's a relatively common occurrence in data dumps.

Since this method is used from various points in the migration script, as it stands, the experience of using a migration script is that it will fail multiple times at different points, forcing you to fix the data or apply logic hacks each time and then restart.

This PR generalizes handling of nil input by returning a <missing> string.

Pros:

    no more messy repeated crashes + restarts
    consistency

Cons:

    it might hide data issues
        OTOH we can't print a warning on that method because it will flood the console since it's called from inside loops.

* FIX: zendesk import script: support nil inputs in normalize_raw
* FIX: return '<missing>' instead of empty string; do it for all methods
2023-06-29 13:22:47 -03:00

246 lines
6.1 KiB
Ruby

# frozen_string_literal: true
# Zendesk importer
#
# You will need a bunch of CSV files:
#
# - users.csv
# - topics.csv (topics in Zendesk are categories in Discourse)
# - posts.csv (posts in Zendesk are topics in Discourse)
# - comments.csv (comments in Zendesk are posts in Discourse)
require "csv"
require "reverse_markdown"
require_relative "base"
require_relative "base/generic_database"
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk.rb DIRNAME
class ImportScripts::Zendesk < ImportScripts::Base
OLD_DOMAIN = "https://support.example.com"
BATCH_SIZE = 1000
# Prepares the importer for one directory of Zendesk CSV exports.
#
# @param path [String] directory that contains the CSV files; it is kept in
#   @path and also handed to the GenericDatabase, which is created with
#   recreate: true
def initialize(path)
  super()

  @path = path
  @db =
    ImportScripts::GenericDatabase.new(
      path,
      batch_size: BATCH_SIZE,
      recreate: true,
    )
end
# Runs every import stage in order: CSV loading first, then categories,
# users, topics and finally posts. Returns whatever the last stage returns.
def execute
  stages = %i[read_csv_files import_categories import_users import_topics import_posts]
  stages.map { |stage| send(stage) }.last
end
# Loads the four CSV exports into the intermediate database.
#
# The Zendesk naming is shifted by one level compared to Discourse:
# topics.csv rows become categories, posts.csv rows become topics and
# comments.csv rows become posts. Once everything is inserted, the database
# is asked to delete unused users and to sort posts by creation time.
def read_csv_files
  puts "", "reading CSV files"

  csv_parse("topics") do |csv_row|
    category = {
      id: csv_row[:id],
      name: csv_row[:name],
      description: csv_row[:description],
      position: csv_row[:position],
      url: csv_row[:htmlurl],
    }
    @db.insert_category(**category)
  end

  csv_parse("users") do |csv_row|
    user = {
      id: csv_row[:id],
      email: csv_row[:email],
      name: csv_row[:name],
      created_at: parse_datetime(csv_row[:createdat]),
      last_seen_at: parse_datetime(csv_row[:lastloginat]),
      active: true,
    }
    @db.insert_user(**user)
  end

  csv_parse("posts") do |csv_row|
    topic = {
      id: csv_row[:id],
      title: csv_row[:title],
      raw: csv_row[:details],
      category_id: csv_row[:topicid],
      # The export is compared against the exact upper-case string "TRUE".
      closed: csv_row[:closed] == "TRUE",
      user_id: csv_row[:authorid],
      created_at: parse_datetime(csv_row[:createdat]),
      url: csv_row[:htmlurl],
    }
    @db.insert_topic(**topic)
  end

  csv_parse("comments") do |csv_row|
    post = {
      id: csv_row[:id],
      raw: csv_row[:body],
      topic_id: csv_row[:postid],
      user_id: csv_row[:authorid],
      created_at: parse_datetime(csv_row[:createdat]),
      url: csv_row[:htmlurl],
    }
    @db.insert_post(**post)
  end

  @db.delete_unused_users
  @db.sort_posts_by_created_at
end
# Converts a timestamp cell from the CSV export into a DateTime.
#
# @param text [String, nil] raw cell value
# @return [DateTime, nil] nil when the cell is blank or holds the literal
#   string "null"; otherwise the result of DateTime.parse
def parse_datetime(text)
  no_value = text.blank? || text == "null"
  no_value ? nil : DateTime.parse(text)
end
# Creates one Discourse category per staged category row and, after each
# category is created, registers a permalink for its old URL unless a
# permalink with that URL already exists.
def import_categories
  puts "", "creating categories"

  create_categories(@db.fetch_categories) do |row|
    # Runs after the category exists, so its id is available.
    add_permalink =
      proc do |category|
        old_path = remove_domain(row["url"])
        next if permalink_exists?(old_path)

        Permalink.create(url: old_path, category_id: category.id)
      end

    {
      id: row["id"],
      name: row["name"],
      description: row["description"],
      position: row["position"],
      post_create_action: add_permalink,
    }
  end
end
# Calls the inherited +batches+ with this script's BATCH_SIZE, so callers in
# this file do not have to pass a size. The block given by the caller is
# forwarded to +super+ implicitly.
def batches
super(BATCH_SIZE)
end
# Creates Discourse users from the staged user rows, one batch at a time.
#
# Each fetch returns the rows plus the id to continue from; the loop stops
# on the first empty batch and skips batches whose ids were all imported
# already.
def import_users
  puts "", "creating users"

  user_count = @db.count_users
  cursor = ""

  batches do |offset|
    rows, cursor = @db.fetch_users(cursor)
    break if rows.empty?

    imported_ids = rows.map { |row| row["id"] }
    next if all_records_exist?(:users, imported_ids)

    create_users(rows, total: user_count, offset: offset) do |row|
      {
        id: row["id"],
        email: row["email"],
        name: row["name"],
        created_at: row["created_at"],
        last_seen_at: row["last_seen_at"],
        # The staged flag is compared against the integer 1.
        active: row["active"] == 1,
      }
    end
  end
end
# Creates Discourse topics (Zendesk "posts") from the staged topic rows,
# one batch at a time, and registers a permalink for each old URL.
#
# Import ids go through import_topic_id. Authors that cannot be resolved
# fall back to the system user, and a missing title is replaced with a
# placeholder.
def import_topics
  puts "", "creating topics"

  topic_count = @db.count_topics
  cursor = ""

  batches do |offset|
    rows, cursor = @db.fetch_topics(cursor)
    break if rows.empty?

    imported_ids = rows.map { |row| import_topic_id(row["id"]) }
    next if all_records_exist?(:posts, imported_ids)

    create_posts(rows, total: topic_count, offset: offset) do |row|
      title =
        if row["title"].present?
          # Titles are trimmed and cut to at most 255 characters.
          row["title"].strip[0...255]
        else
          "Topic title missing"
        end

      # Runs after the first post exists, so post.topic is available.
      add_permalink =
        proc do |post|
          old_path = remove_domain(row["url"])
          next if permalink_exists?(old_path)

          Permalink.create(url: old_path, topic_id: post.topic.id)
        end

      {
        id: import_topic_id(row["id"]),
        title: title,
        raw: normalize_raw(row["raw"]),
        category: category_id_from_imported_category_id(row["category_id"]),
        user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
        created_at: row["created_at"],
        closed: row["closed"] == 1,
        post_create_action: add_permalink,
      }
    end
  end
end
# Builds the import id used for a Zendesk "post" (a Discourse topic) by
# prefixing the original id with "T".
#
# NOTE(review): topics and comments are both tracked under :posts in this
# script, so the prefix presumably keeps their ids from colliding - confirm.
#
# @param topic_id [#to_s] id from the source data
# @return [String]
def import_topic_id(topic_id)
  "T" + topic_id.to_s
end
# Creates Discourse posts (Zendesk "comments") from the staged post rows,
# one batch at a time, in the order produced by fetch_sorted_posts.
#
# A row whose parent topic cannot be looked up is printed and skipped.
# Authors that cannot be resolved fall back to the system user.
def import_posts
  puts "", "creating posts"

  post_count = @db.count_posts
  cursor = 0

  batches do |offset|
    rows, cursor = @db.fetch_sorted_posts(cursor)
    break if rows.empty?

    imported_ids = rows.map { |row| row["id"] }
    next if all_records_exist?(:posts, imported_ids)

    create_posts(rows, total: post_count, offset: offset) do |row|
      parent_import_id = import_topic_id(row["topic_id"])
      topic = topic_lookup_from_imported_post_id(parent_import_id)

      if topic.nil?
        p "MISSING TOPIC #{row["topic_id"]}"
        p row
        next
      end

      # Runs after the post exists, so its id is available.
      add_permalink =
        proc do |post|
          old_path = remove_domain(row["url"])
          next if permalink_exists?(old_path)

          Permalink.create(url: old_path, post_id: post.id)
        end

      {
        id: row["id"],
        raw: normalize_raw(row["raw"]),
        user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
        topic_id: topic[:topic_id],
        created_at: row["created_at"],
        post_create_action: add_permalink,
      }
    end
  end
end
# Prepares imported content for use as the raw text of a post.
#
# Blank input yields the "<missing>" placeholder. Otherwise every literal
# backslash-n sequence (the two characters, not a newline) is removed and
# the result is passed through ReverseMarkdown.
#
# @param raw [String, nil] content from the source data
# @return [String]
def normalize_raw(raw)
  return "<missing>" if raw.blank?

  ReverseMarkdown.convert(raw.gsub('\n', ""))
end
# Strips the old Zendesk host from a URL so the remainder can be used as a
# permalink path.
#
# @param url [String, nil] URL taken from the staged "url" column
# @return [String, nil] +url+ with the first occurrence of OLD_DOMAIN
#   removed. Returns nil when +url+ is nil (for example when the row had no
#   URL) instead of raising NoMethodError in the middle of an import.
def remove_domain(url)
  return if url.nil?

  url.sub(OLD_DOMAIN, "")
end
# Tells whether a Permalink with exactly this url is already stored.
#
# Uses +exists?+ so the predicate returns a boolean and does not load a
# Permalink record only to test its presence.
#
# @param url [String, nil] permalink url to look for
# @return [Boolean]
def permalink_exists?(url)
  Permalink.exists?(url: url)
end
# Streams "<table_name>.csv" from the import directory and yields each row.
#
# The file is read as UTF-8 with an optional byte-order mark, the first line
# is treated as the header, header names are converted to symbols and blank
# lines are skipped.
#
# @param table_name [String] file name without the ".csv" extension
# @yieldparam row [CSV::Row] one data row
def csv_parse(table_name)
  csv_file = File.join(@path, "#{table_name}.csv")
  options = {
    headers: true,
    header_converters: :symbol,
    skip_blanks: true,
    encoding: "bom|utf-8",
  }

  CSV.foreach(csv_file, **options) { |row| yield row }
end
end
# Script entry point: the first command line argument must name an existing
# directory; otherwise the usage text is printed and the process exits with
# status 1.
import_dir = ARGV[0]

if import_dir.nil? || !Dir.exist?(import_dir)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/zendesk.rb DIRNAME", ""
  exit 1
end

ImportScripts::Zendesk.new(import_dir).perform