mirror of
https://github.com/discourse/discourse.git
synced 2024-12-13 00:23:43 +08:00
c5fd8c42db
* File.exists? is deprecated and removed in Ruby 3.2 in favor of File.exist? * Dir.exists? is deprecated and removed in Ruby 3.2 in favor of Dir.exist?
384 lines
10 KiB
Ruby
384 lines
10 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# getsatisfaction importer
|
|
#
|
|
# pre-req: You will either get an Excel or a bunch of CSV files. Be sure to rename them all so that
|
|
#
|
|
# - users.csv is the users table export
|
|
# - replies.csv is the reply table export
|
|
# - topics.csv is the topics table export
|
|
# - categories.csv is the categories table export
|
|
# - topics_categories.csv is the mapping between the topics and categories table
|
|
#
|
|
# Make sure that the CSV files use UTF-8 encoding, have consistent line endings and use comma as column separator.
|
|
# That's usually the case when you export Excel sheets as CSV.
|
|
# When you get MalformedCSVError during the import, try converting the line endings of the CSV into the Unix format.
|
|
# Mixed line endings in CSV files can create weird errors!
|
|
#
|
|
# You need to call fix_quotes_in_csv() for CSV files that use \" to escape quotes within quoted fields.
|
|
# The import script expects quotes to be escaped with "".
|
|
#
|
|
# It's likely that some posts in replies.csv aren't in the correct order. Currently the import script doesn't handle
|
|
# that correctly and will import the replies in the wrong order.
|
|
# You should run `rake posts:reorder_posts` after the import.
|
|
|
|
require 'csv'
|
|
require 'set'
|
|
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
|
require 'reverse_markdown' # gem 'reverse_markdown'
|
|
|
|
# Call it like this:
|
|
# RAILS_ENV=production bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME
|
|
class ImportScripts::GetSatisfaction < ImportScripts::Base
|
|
|
|
IMPORT_ARCHIVED_TOPICS = false
|
|
|
|
# The script classifies each topic as private when at least one associated category
|
|
# in "topics_categories.csv" is unknown (not included in "categories.csv").
|
|
IMPORT_PRIVATE_TOPICS = false
|
|
|
|
# Should permalinks be created? Make sure you configure OLD_DOMAIN if you enable this.
|
|
CREATE_PERMALINKS = true
|
|
|
|
# Replace "http://community.example.com/" with the URL of your community for permalinks
|
|
OLD_DOMAIN = "http://community.example.com/"
|
|
BATCH_SIZE = 1000
|
|
|
|
# Prepares the importer for one export directory.
#
# @param path [String] directory that contains the exported CSV files
def initialize(path)
  @path = path
  super()

  # NOTE(review): presumably assigned after super() so the base class cannot
  # overwrite it - confirm against ImportScripts::Base.
  @bbcode_to_md = true

  # Import ids of topics that import_topics decided not to import.
  @skipped_topics = Set[]
  # Rows of "topics_categories.csv", grouped per topic by read_topic_categories.
  @topic_categories = {}
  # Import id of a topic => value of its "url" column (used for permalinks).
  @topic_slug = {}
end
|
|
|
|
# Runs the complete import: fixes the CSV quoting, imports users, categories,
# topics and replies (in that order) and finally creates permalinks when
# CREATE_PERMALINKS is enabled.
def execute
  # TODO Remove the call to fix_quotes_in_csv() if your replies.csv uses the double quotes ("").
  # That's usually the case when you exported the file from Excel.
  fix_quotes_in_csv("replies")

  import_users
  import_categories
  import_topics
  import_posts

  return unless CREATE_PERMALINKS

  create_permalinks
end
|
|
|
|
# Returns the path of the CSV file for a table inside the import directory.
#
# @param table_name [String] base name of the CSV file, without extension
# @param use_fixed [Boolean] when true, "<table_name>_fixed.csv" is preferred
#   if that file exists
# @return [String] path of the "_fixed" file or of "<table_name>.csv"
def csv_filename(table_name, use_fixed: true)
  plain_path = File.join(@path, "#{table_name}.csv")
  return plain_path unless use_fixed

  fixed_path = File.join(@path, "#{table_name}_fixed.csv")
  File.exist?(fixed_path) ? fixed_path : plain_path
end
|
|
|
|
# Writes a "<table>_fixed.csv" copy of each given CSV file in which quotes
# escaped as \" are replaced by "" and doubled backslashes are collapsed.
#
# A line ending in `,+1\` is merged with the line that follows it: the field is
# rewritten as `,"+1"` and the line break is only kept when the following line
# does not start with a comma.
#
# @param table_names [Array<String>] base names of the CSV files to fix
def fix_quotes_in_csv(*table_names)
  puts "", "fixing CSV files"

  table_names.each do |table_name|
    source_filename = csv_filename(table_name, use_fixed: false)
    target_filename = csv_filename("#{table_name}_fixed", use_fixed: false)

    File.open(target_filename, "w") do |file|
      # Holds a line whose trailing `,+1\` field still has to be joined with the next line.
      previous_line = nil

      # File.foreach closes the source file once all lines have been read;
      # a bare File.open(...).each_line would leave the handle open.
      File.foreach(source_filename) do |line|
        # \" => "" unless the backslash itself is escaped
        line.gsub!(/(?<![^\\]\\)\\"/, '""')
        # \\ => \
        line.gsub!(/\\\\/, '\\')

        if previous_line
          previous_line << "\n" unless line.starts_with?(",")
          line = "#{previous_line}#{line}"
          previous_line = nil
        end

        # gsub! returns nil when nothing matched, so only matching lines are held back.
        if line.gsub!(/,\+1\\\R$/m, ',"+1"').present?
          previous_line = line
        else
          file.puts(line)
        end
      end

      file.puts(previous_line) if previous_line
    end
  end
end
|
|
|
|
# Yields every row of a table's CSV file.
#
# The first line is used as header, header names are converted to symbols and
# blank lines are skipped.
#
# @param table_name [String] base name of the CSV file
# @yieldparam row [CSV::Row] one data row
def csv_parse(table_name)
  options = {
    headers: true,
    header_converters: :symbol,
    skip_blanks: true,
    encoding: 'bom|utf-8'
  }

  CSV.foreach(csv_filename(table_name), **options) { |row| yield row }
end
|
|
|
|
# Counts the data rows of a table's CSV file (header and blank lines excluded).
#
# @param table_name [String] base name of the CSV file
# @return [Integer] number of rows
def total_rows(table_name)
  rows = CSV.foreach(
    csv_filename(table_name),
    headers: true,
    skip_blanks: true,
    encoding: 'bom|utf-8'
  )

  rows.count
end
|
|
|
|
# Reads "users.csv" and creates the users in batches of BATCH_SIZE rows.
def import_users
  puts "", "creating users"

  total = total_rows("users")
  batch = []
  processed = 0

  csv_parse("users") do |row|
    user = {
      id: row[:user_id],
      email: row[:email],
      name: row[:realname],
      username: row[:nickname],
      created_at: DateTime.parse(row[:joined_date]),
      active: true
    }
    batch << user
    processed += 1

    next unless (processed % BATCH_SIZE).zero?

    import_users_batch!(batch, processed - batch.length, total)
  end

  # Flush the rows left over after the last full batch.
  import_users_batch!(batch, processed - batch.length, total)
end
|
|
|
|
# Passes one batch of user hashes to create_users and empties the batch.
#
# @param users [Array<Hash>] collected user attributes; cleared afterwards
# @param offset [Integer] number of rows handled before this batch
# @param total [Integer] total number of rows
# @return [Array, nil] the emptied array, or nil when the batch was empty
def import_users_batch!(users, offset, total)
  unless users.empty?
    create_users(users, offset: offset, total: total) { |user| user }
    users.clear
  end
end
|
|
|
|
# Reads "categories.csv" and creates all categories in one go.
def import_categories
  puts "", "creating categories"

  categories = []

  csv_parse("categories") do |row|
    description = row[:description]

    categories << {
      id: row[:category_id],
      name: row[:name],
      # Only non-blank descriptions are converted; otherwise nil is stored.
      description: (normalize_raw!(description) if description.present?)
    }
  end

  create_categories(categories) { |category| category }
end
|
|
|
|
# Builds the import id used for a topic by prefixing its id with "T".
#
# @param topic_id [#to_s] value of the "topic_id" column
# @return [String]
def import_topic_id(topic_id)
  "T" + topic_id.to_s
end
|
|
|
|
# Reads "topics.csv" and creates the topics in batches of BATCH_SIZE rows.
#
# Skipped topics are remembered in @skipped_topics and contribute a nil entry
# to the batch, so every CSV row is still counted.
def import_topics
  read_topic_categories

  puts "", "creating topics"

  total = total_rows("topics")
  batch = []
  processed = 0

  csv_parse("topics") do |row|
    import_id = import_topic_id(row[:topic_id])

    if skip_topic?(row)
      @skipped_topics.add(import_id)
      batch << nil
    else
      subject = row[:subject]

      topic = map_post(row)
      topic[:id] = import_id
      topic[:title] = subject.present? ? subject.strip[0...255] : "Topic title missing"
      topic[:category] = category_id(row)
      topic[:archived] = row[:archived_at].present?

      # Remember the old URL so create_permalinks can use it later.
      @topic_slug[import_id] = row[:url] if CREATE_PERMALINKS

      batch << topic
    end

    processed += 1
    import_topics_batch!(batch, processed - batch.length, total) if (processed % BATCH_SIZE).zero?
  end

  # Flush the rows left over after the last full batch.
  import_topics_batch!(batch, processed - batch.length, total)
end
|
|
|
|
# Decides whether a row of "topics.csv" should be left out of the import.
#
# A topic is skipped when it is marked as removed, when it is archived and
# IMPORT_ARCHIVED_TOPICS is off, or when IMPORT_PRIVATE_TOPICS is off and one of
# its categories was flagged as unknown by read_topic_categories.
#
# @param row [#[]] topic row with :removed, :archived_at and :topic_id
# @return [Boolean]
def skip_topic?(row)
  return true if row[:removed] == "1"

  archived_but_excluded = !IMPORT_ARCHIVED_TOPICS && !row[:archived_at].blank?
  return true if archived_but_excluded

  return false if IMPORT_PRIVATE_TOPICS

  mapping = @topic_categories[row[:topic_id]]
  mapping ? !!mapping[:has_unknown_category] : false
end
|
|
|
|
# Returns the category a topic should be created in.
#
# The last known category recorded for the topic by read_topic_categories is
# used. The uncategorized category is the fallback when the topic has no entry
# at all, and also when it has an entry without any known category (all of its
# categories were unknown), so the method never returns nil for that case.
#
# @param row [#[]] topic row providing :topic_id
# @return [Object] category id
def category_id(row)
  known_ids = @topic_categories.dig(row[:topic_id], :category_ids)

  known_ids&.last || SiteSetting.uncategorized_category_id
end
|
|
|
|
# Reads "topics_categories.csv" into @topic_categories.
#
# For every topic id the hash stores the ids returned by
# category_id_from_imported_category_id and a flag telling whether at least
# one of the topic's categories could not be resolved.
def read_topic_categories
  puts "", "reading topic_categories"

  total = total_rows("topics_categories")
  processed = 0

  csv_parse("topics_categories") do |row|
    resolved_id = category_id_from_imported_category_id(row[:category_id])

    entry = (@topic_categories[row[:topic_id]] ||= { category_ids: [], has_unknown_category: false })

    if resolved_id.nil?
      entry[:has_unknown_category] = true
    else
      entry[:category_ids] << resolved_id
    end

    processed += 1
    print_status(processed, total)
  end
end
|
|
|
|
# Passes one batch of topic hashes to create_posts and empties the batch.
#
# @param topics [Array<Hash, nil>] collected topics (nil for skipped rows);
#   cleared afterwards
# @param offset [Integer] number of rows handled before this batch
# @param total [Integer] total number of rows
# @return [Array, nil] the emptied array, or nil when the batch was empty
def import_topics_batch!(topics, offset, total)
  unless topics.empty?
    create_posts(topics, total: total, offset: offset) do |topic|
      topic
    end
    topics.clear
  end
end
|
|
|
|
# Reads "replies.csv" and creates the replies in batches of BATCH_SIZE rows.
#
# Removed replies contribute a nil entry to the batch so every CSV row is still
# counted. The parent of a reply is NOT looked up here: only its import id is
# stored under :parent_id and import_posts_batch! resolves it right before the
# reply is created. Looking it up while the batch is still being collected
# cannot find a parent that is part of the same, not yet created, batch.
def import_posts
  puts "", "creating posts"

  count = 0
  posts = []

  total = total_rows("replies")

  csv_parse("replies") do |row|
    post = nil

    if row[:removed] != "1"
      post = map_post(row)
      post[:id] = row[:reply_id]
      post[:topic_id] = import_topic_id(row[:topic_id])
      post[:parent_id] = row[:parent_id] if row[:parent_id] != "NULL"
    end

    posts << post
    count += 1

    import_posts_batch!(posts, count - posts.length, total) if count % BATCH_SIZE == 0
  end

  # Flush the rows left over after the last full batch.
  import_posts_batch!(posts, count - posts.length, total)
end

# Passes one batch of reply hashes to create_posts and empties the batch.
#
# Entries that are nil or belong to a skipped topic are ignored, as are replies
# whose topic cannot be found (those are printed). An optional :parent_id entry
# is removed from the hash and, when the parent can be found, replaced by
# :reply_to_post_number. Hashes that already carry :reply_to_post_number and no
# :parent_id are passed on unchanged.
#
# NOTE(review): resolving the parent here assumes create_posts makes each
# created post available to topic_lookup_from_imported_post_id before it yields
# the next entry - confirm against the base class.
#
# @param posts [Array<Hash, nil>] collected replies; cleared afterwards
# @param offset [Integer] number of rows handled before this batch
# @param total [Integer] total number of rows
# @return [Array, nil] the emptied array, or nil when the batch was empty
def import_posts_batch!(posts, offset, total)
  return if posts.empty?

  create_posts(posts, total: total, offset: offset) do |post|
    next if post.nil? || @skipped_topics.include?(post[:topic_id])

    topic = topic_lookup_from_imported_post_id(post[:topic_id])

    unless topic
      p "MISSING TOPIC #{post[:topic_id]}"
      p post
      next
    end

    post[:topic_id] = topic[:topic_id]

    parent_id = post.delete(:parent_id)
    parent = parent_id && topic_lookup_from_imported_post_id(parent_id)
    post[:reply_to_post_number] = parent[:post_number] if parent

    post
  end

  posts.clear
end
|
|
|
|
# Builds the attributes shared by topics and replies from a CSV row.
#
# @param row [#[]] row providing :user_id, :created_at and :formatted_content
# @return [Hash] hash with :user_id, :created_at and :raw; the system user is
#   used when the author cannot be resolved
def map_post(row)
  author_id = user_id_from_imported_user_id(row[:user_id]) || Discourse.system_user.id
  posted_at = DateTime.parse(row[:created_at])
  body = normalize_raw!(row[:formatted_content])

  { user_id: author_id, created_at: posted_at, raw: body }
end
|
|
|
|
# Converts the HTML-ish content of a post into the raw text stored in Discourse.
#
# <code> sections (optionally wrapped in <pre>) are taken out first, tidied and
# put back as fenced code blocks; the result is HTML-unescaped and passed
# through ReverseMarkdown. The argument itself is not modified (a copy is used).
#
# @param raw [String, nil] content of a post or description
# @return [String] converted text, or "<missing>" when the input is blank
def normalize_raw!(raw)
  return "<missing>" if raw.blank?

  raw = raw.dup

  # Hoist code: replace every code section by a random placeholder so the
  # whitespace handling below does not touch it.
  hoisted = {}
  raw.gsub!(/(<pre>\s*)?<code>(.*?)<\/code>(\s*<\/pre>)?/mi) do
    code = $2
    hoist = SecureRandom.hex

    # tidy code, wow, this is impressively crazy
    # NOTE(review): the whitespace runs in the next pattern and in the `" "`
    # literal further down may have been collapsed in this copy of the file
    # (the comment below talks about triple/double spaces) - confirm upstream.
    code.gsub!(/ (\s*)/, "\n\\1")
    code.gsub!(/^\s*\n$/, "\n")
    code.gsub!(/\n+/m, "\n")
    code.strip!

    hoisted[hoist] = code
    hoist
  end

  # impressive seems to be using triple space as a <p> unless hoisted
  # in this case double space works best ... so odd
  raw.gsub!(" ", "\n\n")

  hoisted.each do |hoist, code|
    # Block form on purpose: with a replacement *string*, backslash sequences
    # inside the code (\0, \1, \\ ...) would be treated as backreferences.
    raw.gsub!(hoist) { "\n```\n#{code}\n```\n" }
  end

  ReverseMarkdown.convert(CGI.unescapeHTML(raw))
end
|
|
|
|
# Creates a permalink from the old URL of every imported topic.
#
# For each listable topic the "import_id" custom field of its first post is
# used to find the old URL recorded in @topic_slug. OLD_DOMAIN is removed from
# that URL and, when something is left, a Permalink pointing to the topic is
# looked up or created. Topics without a first post, without an import id or
# without a recorded URL are skipped.
def create_permalinks
  puts '', 'Creating Permalinks...', ''

  Topic.listable_topics.find_each do |topic|
    # A topic without a first post must not abort the whole run.
    custom_fields = topic.first_post&.custom_fields
    import_id = custom_fields && custom_fields["import_id"]
    next unless import_id

    old_url = @topic_slug[import_id]
    next unless old_url

    # Non-destructive on purpose: the string stored in @topic_slug stays intact.
    slug = old_url.gsub(OLD_DOMAIN, "")
    Permalink.find_or_create_by(url: slug, topic_id: topic.id) if slug.present?
  end
end
|
|
|
|
end
|
|
|
|
# Script entry point: the first command line argument must be an existing
# directory; otherwise the usage text is printed and the process exits with 1.
import_dir = ARGV[0]

if import_dir.nil? || !Dir.exist?(import_dir)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/getsatisfaction.rb DIRNAME", ""
  exit 1
end

ImportScripts::GetSatisfaction.new(import_dir).perform
|