# frozen_string_literal: true
# getsatisfaction importer
#
# pre-req: You will get either an Excel file or a bunch of CSV files. Be sure to rename them all so that
#
# - users.csv is the users table export
# - replies.csv is the reply table export
# - topics.csv is the topics table export
# - categories.csv is the categories table export
# - topics_categories.csv is the mapping between the topics and categories table
#
# Make sure that the CSV files use UTF-8 encoding, have consistent line endings and use a comma as the column separator.
# That's usually the case when you export Excel sheets as CSV.
# If you get a MalformedCSVError during the import, try converting the line endings of the CSV files to Unix format.
# Mixed line endings in CSV files can create weird errors!
#
# You need to call fix_quotes_in_csv() for CSV files that use \" to escape quotes within quoted fields.
# The import script expects quotes to be escaped with "".
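#
# For example, a (hypothetical) exported row such as
#   42,"This workaround \"just works\" for me"
# has to be rewritten as
#   42,"This workaround ""just works"" for me"
# before Ruby's CSV parser will accept it.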
#
# It's likely that some posts in replies.csv aren't in the correct order. Currently the import script doesn't handle
# that correctly and will import the replies in the wrong order.
# You should run `rake posts:reorder_posts` after the import.
require 'csv'
require 'set'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'reverse_markdown' # gem 'reverse_markdown'
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME
class ImportScripts::GetSatisfaction < ImportScripts::Base
IMPORT_ARCHIVED_TOPICS = false
# The script classifies a topic as private when at least one of its associated categories
# in "topics_categories.csv" is unknown (not included in "categories.csv").
IMPORT_PRIVATE_TOPICS = false
# Should permalinks be created for imported topics? Make sure you configure OLD_DOMAIN if you set this to true.
CREATE_PERMALINKS = true
# Replace "http://community.example.com/" with the URL of your community for permalinks
OLD_DOMAIN = "http://community.example.com/"
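# For example (hypothetical data): a topic exported with the URL
# "http://community.example.com/topics/password-reset-not-working" will get a
# permalink for "topics/password-reset-not-working" pointing at the imported topic.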
BATCH_SIZE = 1000
def initialize(path)
@path = path
super()
@bbcode_to_md = true
@topic_slug = {}
@topic_categories = {}
@skipped_topics = Set.new
end
def execute
# TODO Remove the call to fix_quotes_in_csv() if your replies.csv already escapes quotes with double quotes ("").
# That's usually the case when you exported the file from Excel.
fix_quotes_in_csv("replies")
import_users
import_categories
import_topics
import_posts
create_permalinks if CREATE_PERMALINKS
end
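# Returns the path of the "fixed" CSV ("<table_name>_fixed.csv") if it exists and
# may be used, otherwise the path of the original export ("<table_name>.csv").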
def csv_filename(table_name, use_fixed: true)
if use_fixed
filename = File.join(@path, "#{table_name}_fixed.csv")
return filename if File.exist?(filename)
end
File.join(@path, "#{table_name}.csv")
end
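# Rewrites the given CSV exports so that quotes within quoted fields are escaped
# with "" instead of \", unescapes doubled backslashes and re-joins rows that were
# split across multiple lines by the export. The result is written to
# "<table_name>_fixed.csv" and picked up by csv_filename() afterwards.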
def fix_quotes_in_csv(*table_names)
puts "", "fixing CSV files"
table_names.each do |table_name|
source_filename = csv_filename(table_name, use_fixed: false)
target_filename = csv_filename("#{table_name}_fixed", use_fixed: false)
previous_line = nil
File.open(target_filename, "w") do |file|
File.open(source_filename).each_line do |line|
line.gsub!(/(?<![^\\]\\)\\"/, '""')
line.gsub!(/\\\\/, '\\')
if previous_line
previous_line << "\n" unless line.starts_with?(",")
line = "#{previous_line}#{line}"
previous_line = nil
end
if line.gsub!(/,\+1\\\R$/m, ',"+1"').present?
previous_line = line
else
file.puts(line)
end
end
file.puts(previous_line) if previous_line
end
end
end
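# Yields every data row of the given CSV with symbolized column names, skipping
# blank lines and tolerating a UTF-8 BOM.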
def csv_parse(table_name)
CSV.foreach(csv_filename(table_name),
headers: true,
header_converters: :symbol,
skip_blanks: true,
encoding: 'bom|utf-8') { |row| yield row }
end
def total_rows(table_name)
CSV.foreach(csv_filename(table_name),
headers: true,
skip_blanks: true,
encoding: 'bom|utf-8')
.inject(0) { |c, _| c + 1 }
end
def import_users
puts "", "creating users"
count = 0
users = []
total = total_rows("users")
csv_parse("users") do |row|
users << {
id: row[:user_id],
email: row[:email],
name: row[:realname],
username: row[:nickname],
created_at: DateTime.parse(row[:joined_date]),
active: true
}
count += 1
if count % BATCH_SIZE == 0
import_users_batch!(users, count - users.length, total)
end
end
import_users_batch!(users, count - users.length, total)
end
def import_users_batch!(users, offset, total)
return if users.empty?
create_users(users, offset: offset, total: total) do |user|
user
end
users.clear
end
def import_categories
puts "", "creating categories"
rows = []
csv_parse("categories") do |row|
rows << {
id: row[:category_id],
name: row[:name],
description: row[:description].present? ? normalize_raw!(row[:description]) : nil
}
end
create_categories(rows) do |row|
row
end
end
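# Topic IDs get a "T" prefix so they can't collide with reply IDs; topics and
# replies share the same import ID namespace because both are created via create_posts.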
def import_topic_id(topic_id)
"T#{topic_id}"
end
def import_topics
read_topic_categories
puts "", "creating topics"
count = 0
topics = []
total = total_rows("topics")
csv_parse("topics") do |row|
topic = nil
topic_id = import_topic_id(row[:topic_id])
if skip_topic?(row)
@skipped_topics.add(topic_id)
else
topic = map_post(row)
topic[:id] = topic_id
topic[:title] = row[:subject].present? ? row[:subject].strip[0...255] : "Topic title missing"
topic[:category] = category_id(row)
topic[:archived] = row[:archived_at].present?
@topic_slug[topic[:id]] = row[:url] if CREATE_PERMALINKS
end
topics << topic
count += 1
if count % BATCH_SIZE == 0
import_topics_batch!(topics, count - topics.length, total)
end
end
import_topics_batch!(topics, count - topics.length, total)
end
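# A topic is skipped when it is marked as removed, when it is archived (unless
# IMPORT_ARCHIVED_TOPICS is enabled) or when it belongs to an unknown category,
# which usually means it was private (unless IMPORT_PRIVATE_TOPICS is enabled).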
def skip_topic?(row)
return true if row[:removed] == "1"
return true unless IMPORT_ARCHIVED_TOPICS || row[:archived_at].blank?
unless IMPORT_PRIVATE_TOPICS
categories = @topic_categories[row[:topic_id]]
return true if categories && categories[:has_unknown_category]
end
false
end
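# Uses the last known category of the topic and falls back to "Uncategorized"
# for topics without any known category.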
def category_id(row)
categories = @topic_categories[row[:topic_id]]
return categories[:category_ids].last if categories
SiteSetting.uncategorized_category_id
end
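# Builds the mapping from topic ID to Discourse category IDs from
# topics_categories.csv and remembers whether a topic references a category
# that is missing from categories.csv.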
def read_topic_categories
puts "", "reading topic_categories"
count = 0
total = total_rows("topics_categories")
csv_parse("topics_categories") do |row|
topic_id = row[:topic_id]
category_id = category_id_from_imported_category_id(row[:category_id])
@topic_categories[topic_id] ||= { category_ids: [], has_unknown_category: false }
if category_id.nil?
@topic_categories[topic_id][:has_unknown_category] = true
else
@topic_categories[topic_id][:category_ids] << category_id
end
count += 1
print_status(count, total)
end
end
def import_topics_batch!(topics, offset, total)
return if topics.empty?
create_posts(topics, total: total, offset: offset) { |topic| topic }
topics.clear
end
def import_posts
puts "", "creating posts"
count = 0
posts = []
total = total_rows("replies")
csv_parse("replies") do |row|
post = nil
if row[:removed] != "1"
parent = topic_lookup_from_imported_post_id(row[:parent_id]) if row[:parent_id] != "NULL"
post = map_post(row)
post[:id] = row[:reply_id]
post[:topic_id] = import_topic_id(row[:topic_id])
post[:reply_to_post_number] = parent[:post_number] if parent
end
posts << post
count += 1
if count % BATCH_SIZE == 0
import_posts_batch!(posts, count - posts.length, total)
end
end
import_posts_batch!(posts, count - posts.length, total)
end
def import_posts_batch!(posts, offset, total)
return if posts.empty?
create_posts(posts, total: total, offset: offset) do |post|
next if post.nil? || @skipped_topics.include?(post[:topic_id])
topic = topic_lookup_from_imported_post_id(post[:topic_id])
if topic
post[:topic_id] = topic[:topic_id]
else
p "MISSING TOPIC #{post[:topic_id]}"
p post
next
end
post
end
posts.clear
end
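# Attributes shared by topics and replies. Posts whose author is unknown are
# attributed to the system user.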
def map_post(row)
{
user_id: user_id_from_imported_user_id(row[:user_id]) || Discourse.system_user.id,
created_at: DateTime.parse(row[:created_at]),
raw: normalize_raw!(row[:formatted_content])
}
end
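# Converts the exported HTML into Markdown: <code> blocks are hoisted out,
# cleaned up and re-inserted as fenced code blocks, GetSatisfaction's
# space-based paragraph breaks are replaced with newlines, HTML entities are
# unescaped and the rest is converted via ReverseMarkdown.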
def normalize_raw!(raw)
return "<missing>" if raw.blank?
raw = raw.dup
# hoist code
hoisted = {}
raw.gsub!(/(<pre>\s*)?<code>(.*?)<\/code>(\s*<\/pre>)?/mi) do
code = $2
hoist = SecureRandom.hex
# tidy code, wow, this is impressively crazy
code.gsub!(/  (\s*)/, "\n\\1")
code.gsub!(/^\s*\n$/, "\n")
code.gsub!(/\n+/m, "\n")
code.strip!
hoisted[hoist] = code
hoist
end
# GetSatisfaction seems to use a triple space as a <p>, unless hoisted;
# replacing it with two newlines works best ... so odd
raw.gsub!("   ", "\n\n")
hoisted.each do |hoist, code|
raw.gsub!(hoist, "\n```\n" << code << "\n```\n")
end
raw = CGI.unescapeHTML(raw)
raw = ReverseMarkdown.convert(raw)
raw
end
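# Creates a permalink for every imported topic, mapping the path of the old
# GetSatisfaction URL (OLD_DOMAIN stripped) to the new topic.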
def create_permalinks
puts '', 'Creating Permalinks...', ''
Topic.listable_topics.find_each do |topic|
tcf = topic.first_post.custom_fields
if tcf && tcf["import_id"]
slug = @topic_slug[tcf["import_id"]]
slug = slug.gsub(OLD_DOMAIN, "")
Permalink.create(url: slug, topic_id: topic.id)
end
end
end
end
unless ARGV[0] && Dir.exist?(ARGV[0])
puts "", "Usage:", "", "bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME", ""
exit 1
end
ImportScripts::GetSatisfaction.new(ARGV[0]).perform