discourse/script/import_scripts/getsatisfaction.rb
Régis Hanol 7a6d772ad2 DEV: couple bug fixes in getsatisfaction importer
- Ensure we don't modify a frozen string
- Ensure we have a slug before trying to create a permalink
2020-07-06 17:41:28 +02:00

384 lines
10 KiB
Ruby

# frozen_string_literal: true
# getsatisfaction importer
#
# pre-req: You will either get an Excel or a bunch of CSV files. Be sure to rename them all so that
#
# - users.csv is the users table export
# - replies.csv is the reply table export
# - topics.csv is the topics table export
# - categories.csv is the categories table export
# - topics_categories.csv is the mapping between the topics and categories table
#
# Make sure that the CSV files use UTF-8 encoding, have consistent line endings and use comma as column separator.
# That's usually the case when you export Excel sheets as CSV.
# When you get MalformedCSVError during the import, try converting the line endings of the CSV into the Unix format.
# Mixed line endings in CSV files can create weird errors!
#
# You need to call fix_quotes_in_csv() for CSV files that use \" to escape quotes within quoted fields.
# The import script expects quotes to be escaped with "".
#
# It's likely that some posts in replies.csv aren't in the correct order. Currently the import script doesn't handle
# that correctly and will import the replies in the wrong order.
# You should run `rake posts:reorder_posts` after the import.
require 'csv'
require 'set'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'reverse_markdown' # gem 'reverse_markdown'
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME
class ImportScripts::GetSatisfaction < ImportScripts::Base
# Set to true to also import topics that have a value in their "archived_at" column.
IMPORT_ARCHIVED_TOPICS = false
# The script classifies each topic as private when at least one associated category
# in "topics_categories.csv" is unknown (not included in "categories.csv").
# Private topics are skipped unless this is set to true.
IMPORT_PRIVATE_TOPICS = false
# Should permalinks be created for the imported topics? Make sure you configure OLD_DOMAIN if you
# leave this enabled.
CREATE_PERMALINKS = true
# Replace "http://community.example.com/" with the URL of your community for permalinks
OLD_DOMAIN = "http://community.example.com/"
# Number of CSV rows that are collected before they are handed over to the importer as one batch.
BATCH_SIZE = 1000
# Sets up the importer state.
#
# path - directory that contains the exported CSV files (see csv_filename).
def initialize(path)
  @path = path
  super()

  # Import topic ids ("T<id>") of topics that were skipped; their replies are skipped too.
  @skipped_topics = Set.new
  # Raw topic_id => { category_ids: [...], has_unknown_category: true/false }
  @topic_categories = {}
  # Import topic id ("T<id>") => original topic URL, used for permalinks.
  @topic_slug = {}
  # NOTE(review): not read anywhere in this file; presumably consumed by the base importer — confirm.
  @bbcode_to_md = true
end
# Runs all import steps in order: users, categories, topics, replies and
# (when CREATE_PERMALINKS is enabled) permalinks.
def execute
  # TODO Remove the call to fix_quotes_in_csv() if your replies.csv uses the double quotes ("").
  # That's usually the case when you exported the file from Excel.
  fix_quotes_in_csv("replies")

  import_users
  import_categories
  import_topics
  import_posts

  return unless CREATE_PERMALINKS
  create_permalinks
end
# Returns the path of the CSV file for the given table inside @path.
#
# table_name - base name of the CSV file (without the ".csv" extension)
# use_fixed  - when true, "<table_name>_fixed.csv" (as written by fix_quotes_in_csv)
#              is preferred if that file exists
#
# Falls back to "<table_name>.csv"; that path is returned without checking that it exists.
def csv_filename(table_name, use_fixed: true)
  if use_fixed
    fixed_filename = File.join(@path, "#{table_name}_fixed.csv")
    # File.exists? is deprecated and no longer available in Ruby 3.2+
    return fixed_filename if File.exist?(fixed_filename)
  end

  File.join(@path, "#{table_name}.csv")
end
# Rewrites "<table>.csv" into "<table>_fixed.csv" for every given table name so
# that the result uses "" instead of \" to escape quotes inside quoted fields.
#
# In addition to the quote fix, the following is done per line:
# - escaped backslashes (\\) are collapsed into a single backslash
# - a line ending in `,+1\` followed by a line break gets its last field
#   rewritten to `,"+1"` and is joined with the line that follows it
#
# table_names - base names of the CSV files (without the ".csv" extension)
def fix_quotes_in_csv(*table_names)
  puts "", "fixing CSV files"

  table_names.each do |table_name|
    source_filename = csv_filename(table_name, use_fixed: false)
    target_filename = csv_filename("#{table_name}_fixed", use_fixed: false)

    File.open(target_filename, "w") do |file|
      # Holds a line that has to be merged with the one following it.
      previous_line = nil

      # File.foreach closes the source file when done (File.open(...).each_line leaked the handle).
      File.foreach(source_filename) do |line|
        line.gsub!(/(?<![^\\]\\)\\"/, '""')
        line.gsub!(/\\\\/, '\\')

        if previous_line
          # The line break is only restored when the next line doesn't continue the record.
          previous_line << "\n" unless line.start_with?(",")
          # String#+ always returns a new, unfrozen string, so the gsub! below is safe.
          line = previous_line + line
          previous_line = nil
        end

        # gsub! returns nil when nothing was replaced, so this branch is only
        # taken for lines that ended in `,+1\`.
        if line.gsub!(/,\+1\\\R$/m, ',"+1"')
          previous_line = line
        else
          file.puts(line)
        end
      end

      file.puts(previous_line) if previous_line
    end
  end
end
# Yields every row of the CSV file for the given table.
#
# The first line is treated as header, header names are converted to symbols
# and blank lines are skipped. A UTF-8 BOM at the start of the file is tolerated.
#
# table_name - base name of the CSV file, resolved by csv_filename
def csv_parse(table_name)
  options = {
    headers: true,
    header_converters: :symbol,
    skip_blanks: true,
    encoding: 'bom|utf-8'
  }

  CSV.foreach(csv_filename(table_name), **options) do |row|
    yield row
  end
end
# Counts the data rows (header and blank lines excluded) of the CSV file
# for the given table by reading the whole file once.
#
# table_name - base name of the CSV file, resolved by csv_filename
def total_rows(table_name)
  rows = CSV.foreach(
    csv_filename(table_name),
    headers: true,
    skip_blanks: true,
    encoding: 'bom|utf-8'
  )

  rows.count
end
# Reads users.csv and creates the users in batches of BATCH_SIZE rows.
def import_users
  puts "", "creating users"

  total = total_rows("users")
  processed = 0
  batch = []

  csv_parse("users") do |row|
    user = {
      id: row[:user_id],
      email: row[:email],
      name: row[:realname],
      username: row[:nickname],
      created_at: DateTime.parse(row[:joined_date]),
      active: true
    }
    batch << user
    processed += 1

    next unless processed % BATCH_SIZE == 0
    # offset is the number of rows handed over in previous batches
    import_users_batch!(batch, processed - batch.length, total)
  end

  # import whatever is left after the last full batch
  import_users_batch!(batch, processed - batch.length, total)
end
# Hands one batch of user hashes over to create_users and empties the batch afterwards.
# Does nothing when the batch is empty.
#
# users  - array of user hashes; cleared in place after the import
# offset - number of rows that were already processed before this batch
# total  - total number of rows
def import_users_batch!(users, offset, total)
  unless users.empty?
    create_users(users, offset: offset, total: total) { |user| user }
    users.clear
  end
end
# Reads categories.csv and creates all categories in a single call.
def import_categories
  puts "", "creating categories"

  categories = []

  csv_parse("categories") do |row|
    description = row[:description]

    categories << {
      id: row[:category_id],
      name: row[:name],
      # blank descriptions stay nil instead of becoming the "<missing>" placeholder
      description: (normalize_raw!(description) if description.present?)
    }
  end

  create_categories(categories) { |category| category }
end
# Builds the import id of a topic by prefixing the original topic id with "T".
# Replies are imported with their plain reply id (see import_posts).
def import_topic_id(topic_id)
  "T" + topic_id.to_s
end
# Reads topics.csv and creates the topics (first posts) in batches of BATCH_SIZE rows.
# Skipped topics are remembered in @skipped_topics so that import_posts can skip their replies.
def import_topics
  read_topic_categories

  puts "", "creating topics"

  total = total_rows("topics")
  processed = 0
  batch = []

  csv_parse("topics") do |row|
    imported_id = import_topic_id(row[:topic_id])

    if skip_topic?(row)
      @skipped_topics.add(imported_id)
      # a nil entry keeps the batch length in sync with the number of rows read
      batch << nil
    else
      subject = row[:subject]

      topic = map_post(row)
      topic[:id] = imported_id
      topic[:title] = subject.present? ? subject.strip[0...255] : "Topic title missing"
      topic[:category] = category_id(row)
      topic[:archived] = row[:archived_at].present?

      # remember the original URL for create_permalinks
      @topic_slug[imported_id] = row[:url] if CREATE_PERMALINKS

      batch << topic
    end

    processed += 1
    next unless processed % BATCH_SIZE == 0
    import_topics_batch!(batch, processed - batch.length, total)
  end

  # import whatever is left after the last full batch
  import_topics_batch!(batch, processed - batch.length, total)
end
# Decides whether a row of topics.csv should be left out of the import.
#
# A topic is skipped when
# - its "removed" column is "1", or
# - it has an "archived_at" value and IMPORT_ARCHIVED_TOPICS is disabled, or
# - one of its categories is unknown and IMPORT_PRIVATE_TOPICS is disabled.
#
# Returns true or false.
def skip_topic?(row)
  return true if row[:removed] == "1"
  return true if !IMPORT_ARCHIVED_TOPICS && !row[:archived_at].blank?
  return false if IMPORT_PRIVATE_TOPICS

  categories = @topic_categories[row[:topic_id]]
  (categories && categories[:has_unknown_category]) ? true : false
end
# Returns the category id that should be used for a row of topics.csv.
#
# The last known category that read_topic_categories collected for the topic wins.
# The uncategorized category is used when the topic has no entry in
# topics_categories.csv, and also when none of its categories is known (empty
# category_ids; previously nil was returned in that case).
def category_id(row)
  categories = @topic_categories[row[:topic_id]]
  mapped_id = categories && categories[:category_ids].last

  mapped_id || SiteSetting.uncategorized_category_id
end
# Reads topics_categories.csv into @topic_categories.
#
# For every topic id the already imported category ids are collected in
# :category_ids; :has_unknown_category is set as soon as one of the topic's
# categories can't be resolved to an imported category.
def read_topic_categories
  puts "", "reading topic_categories"

  total = total_rows("topics_categories")
  processed = 0

  csv_parse("topics_categories") do |row|
    topic_id = row[:topic_id]
    mapped_category_id = category_id_from_imported_category_id(row[:category_id])

    entry = (@topic_categories[topic_id] ||= { category_ids: [], has_unknown_category: false })

    if mapped_category_id.nil?
      entry[:has_unknown_category] = true
    else
      entry[:category_ids] << mapped_category_id
    end

    processed += 1
    print_status(processed, total)
  end
end
# Hands one batch of topic hashes over to create_posts and empties the batch afterwards.
# Does nothing when the batch is empty. The batch may contain nil entries for skipped topics.
#
# topics - array of topic hashes (or nil); cleared in place after the import
# offset - number of rows that were already processed before this batch
# total  - total number of rows
def import_topics_batch!(topics, offset, total)
  unless topics.empty?
    create_posts(topics, total: total, offset: offset) do |topic|
      topic
    end
    topics.clear
  end
end
# Reads replies.csv and creates the replies in batches of BATCH_SIZE rows.
# Rows with a "removed" value of "1" are not imported.
def import_posts
  puts "", "creating posts"

  total = total_rows("replies")
  processed = 0
  batch = []

  csv_parse("replies") do |row|
    post = nil

    unless row[:removed] == "1"
      parent_id = row[:parent_id]
      # the export uses the literal string "NULL" for replies without a parent
      parent = parent_id == "NULL" ? nil : topic_lookup_from_imported_post_id(parent_id)

      post = map_post(row)
      post[:id] = row[:reply_id]
      post[:topic_id] = import_topic_id(row[:topic_id])
      post[:reply_to_post_number] = parent[:post_number] if parent
    end

    # removed replies are added as nil so the batch length matches the rows read
    batch << post
    processed += 1

    next unless processed % BATCH_SIZE == 0
    import_posts_batch!(batch, processed - batch.length, total)
  end

  # import whatever is left after the last full batch
  import_posts_batch!(batch, processed - batch.length, total)
end
# Hands one batch of reply hashes over to create_posts and empties the batch afterwards.
# Does nothing when the batch is empty.
#
# Replies are left out when they are nil, when they belong to a skipped topic
# or when their topic can't be found among the imported posts.
#
# posts  - array of reply hashes (or nil); cleared in place after the import
# offset - number of rows that were already processed before this batch
# total  - total number of rows
def import_posts_batch!(posts, offset, total)
  return if posts.empty?

  create_posts(posts, total: total, offset: offset) do |post|
    next if post.nil? || @skipped_topics.include?(post[:topic_id])

    topic = topic_lookup_from_imported_post_id(post[:topic_id])

    unless topic
      p "MISSING TOPIC #{post[:topic_id]}"
      p post
      next
    end

    # swap the import id of the topic for the id found by the lookup
    post[:topic_id] = topic[:topic_id]
    post
  end

  posts.clear
end
# Builds the attributes shared by topics and replies from a CSV row:
# author, creation date and normalized content.
# The system user is used as author when the original user wasn't imported.
def map_post(row)
  author_id = user_id_from_imported_user_id(row[:user_id]) || Discourse.system_user.id
  created_at = DateTime.parse(row[:created_at])
  raw = normalize_raw!(row[:formatted_content])

  { user_id: author_id, created_at: created_at, raw: raw }
end
# Converts the content of a post or category description into Markdown.
#
# The given string is not modified; a copy is processed and returned.
# Returns "<missing>" when raw is blank.
#
# Steps:
# 1. <code> blocks (optionally wrapped in <pre>) are tidied and replaced by
#    random placeholders ("hoisted") so the later steps don't touch them
# 2. whitespace is turned into paragraph breaks
# 3. the placeholders are replaced by fenced code blocks
# 4. HTML entities are unescaped and the result is run through ReverseMarkdown
def normalize_raw!(raw)
  return "<missing>" if raw.blank?
  raw = raw.dup

  # hoist code
  hoisted = {}
  raw.gsub!(/(<pre>\s*)?<code>(.*?)<\/code>(\s*<\/pre>)?/mi) do
    code = Regexp.last_match(2)
    hoist = SecureRandom.hex
    # tidy code, wow, this is impressively crazy
    # NOTE(review): the whitespace inside this pattern may have been collapsed
    # in this copy of the file — confirm against the upstream script.
    code.gsub!(/ (\s*)/, "\n\\1")
    code.gsub!(/^\s*\n$/, "\n")
    code.gsub!(/\n+/m, "\n")
    code.strip!
    hoisted[hoist] = code
    hoist
  end

  # impressive seems to be using tripple space as a <p> unless hoisted
  # in this case double space works best ... so odd
  # NOTE(review): the literal below is a single space here, which doesn't match
  # the comment above — whitespace may have been collapsed; confirm.
  raw.gsub!(" ", "\n\n")

  hoisted.each do |hoist, code|
    # Block form on purpose: with a replacement string, backslash sequences in
    # the code (\\, \0, \1, ...) would be interpreted by gsub!.
    raw.gsub!(hoist) { "\n```\n#{code}\n```\n" }
  end

  raw = CGI.unescapeHTML(raw)
  ReverseMarkdown.convert(raw)
end
# Creates a permalink from the original topic URL (with OLD_DOMAIN removed)
# to the imported topic for every listable topic that has an "import_id"
# custom field on its first post and a URL recorded in @topic_slug.
def create_permalinks
  puts '', 'Creating Permalinks...', ''

  Topic.listable_topics.find_each do |topic|
    custom_fields = topic.first_post.custom_fields
    import_id = custom_fields && custom_fields["import_id"]
    next unless import_id

    slug = @topic_slug[import_id]
    next unless slug

    # strips the domain in place, so the entry in @topic_slug is changed as well
    slug.gsub!(OLD_DOMAIN, "")
    next unless slug.present?

    Permalink.find_or_create_by(url: slug, topic_id: topic.id)
  end
end
end
# Entry point: the first command line argument has to be an existing directory
# that contains the CSV files. Prints the usage and exits with status 1 otherwise.
import_directory = ARGV[0]

if import_directory.nil? || !Dir.exist?(import_directory)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME", ""
  exit 1
end

ImportScripts::GetSatisfaction.new(import_directory).perform