discourse/script/import_scripts/answerbase.rb
Sam Saffron 30990006a9 DEV: enable frozen string literal on all files
This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging
2019-05-13 09:31:32 +08:00

344 lines
9.5 KiB
Ruby

# frozen_string_literal: true
require 'csv'
require 'reverse_markdown'
require_relative 'base'
require_relative 'base/generic_database'
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
class ImportScripts::Answerbase < ImportScripts::Base
OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
NEW_DOMAIN = "https://discourse.example.com"
AVATAR_DIRECTORY = "User Images"
ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
ANSWER_IMAGE_DIRECTORY = "Answer Images"
QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
QUESTION_IMAGE_DIRECTORY = "Question Images"
EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i
QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i
TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'
BATCH_SIZE = 1000
def initialize(path)
super()
@path = path
@db = ImportScripts::GenericDatabase.new(
@path,
batch_size: BATCH_SIZE,
recreate: true,
numeric_keys: true
)
end
def execute
read_csv_files
add_permalink_normalizations
import_categories
import_users
import_topics
import_posts
end
def read_csv_files
puts "", "reading CSV files..."
category_position = 0
csv_parse("categories") do |row|
@db.insert_category(
id: row[:id],
name: row[:name],
position: category_position += 1
)
end
csv_parse("users") do |row|
@db.insert_user(
id: row[:id],
email: row[:email],
username: row[:username],
bio: row[:description],
avatar_path: row[:profile_image],
created_at: parse_date(row[:createtime]),
active: true
)
end
last_topic_id = nil
csv_parse("questions-answers-comments") do |row|
next if row[:published] == "No"
user_id = @db.get_user_id(row[:username])
created_at = parse_datetime(row[:createtime])
begin
if row[:type] == "Question"
attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)
@db.insert_topic(
id: row[:id],
title: row[:title],
raw: row[:text],
category_id: row[:categorylist],
user_id: user_id,
created_at: created_at,
attachments: attachments
)
last_topic_id = row[:id]
else
attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)
@db.insert_post(
id: row[:id],
raw: row[:text],
topic_id: last_topic_id,
user_id: user_id,
created_at: created_at,
attachments: attachments
)
end
rescue
p row
raise
end
end
end
def parse_filenames(text, directory)
return [] if text.blank?
text
.split(';')
.map { |filename| File.join(@path, directory, filename.strip) }
end
def parse_date(text)
return nil if text.blank?
DateTime.strptime(text, "%m/%d/%y")
end
def parse_datetime(text)
return nil if text.blank?
# DateTime.strptime(text, "%m/%d/%Y %H:%M")
DateTime.parse(text).utc.to_datetime
end
def import_categories
puts "", "creating categories"
rows = @db.fetch_categories
create_categories(rows) do |row|
{
id: row['id'],
name: row['name'],
description: row['description'],
position: row['position']
}
end
end
def batches
super(BATCH_SIZE)
end
def import_users
puts "", "creating users"
total_count = @db.count_users
last_id = 0
batches do |offset|
rows, last_id = @db.fetch_users(last_id)
break if rows.empty?
next if all_records_exist?(:users, rows.map { |row| row['id'] })
create_users(rows, total: total_count, offset: offset) do |row|
{
id: row['id'],
email: row['email'],
username: row['username'],
bio_raw: row['bio'],
created_at: row['created_at'],
active: row['active'] == 1,
post_create_action: proc do |user|
create_avatar(user, row['avatar_path'])
end
}
end
end
end
def create_avatar(user, avatar_path)
return if avatar_path.blank?
avatar_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)
if File.exist?(avatar_path)
@uploader.create_avatar(user, avatar_path)
else
STDERR.puts "Could not find avatar: #{avatar_path}"
end
end
def import_topics
puts "", "creating topics"
total_count = @db.count_topics
last_id = 0
batches do |offset|
rows, last_id = @db.fetch_topics(last_id)
break if rows.empty?
next if all_records_exist?(:posts, rows.map { |row| row['id'] })
create_posts(rows, total: total_count, offset: offset) do |row|
attachments = @db.fetch_topic_attachments(row['id']) if row['upload_count'] > 0
user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
{
id: row['id'],
title: row['title'],
raw: raw_with_attachments(row['raw'].presence || row['title'], attachments, user_id),
category: category_id_from_imported_category_id(row['category_id']),
user_id: user_id,
created_at: row['created_at'],
closed: row['closed'] == 1,
post_create_action: proc do |post|
url = "q#{row['id']}"
Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
end
}
end
end
end
def import_posts
puts "", "creating posts"
total_count = @db.count_posts
last_row_id = 0
batches do |offset|
rows, last_row_id = @db.fetch_posts(last_row_id)
break if rows.empty?
next if all_records_exist?(:posts, rows.map { |row| row['id'] })
create_posts(rows, total: total_count, offset: offset) do |row|
topic = topic_lookup_from_imported_post_id(row['topic_id'])
attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0
user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
{
id: row['id'],
raw: raw_with_attachments(row['raw'], attachments, user_id),
user_id: user_id,
topic_id: topic[:topic_id],
created_at: row['created_at']
}
end
end
end
def raw_with_attachments(raw, attachments, user_id)
raw, embedded_paths, upload_ids = replace_embedded_attachments(raw, user_id)
raw = replace_question_links(raw)
raw = ReverseMarkdown.convert(raw) || ""
attachments&.each do |attachment|
path = attachment['path']
next if embedded_paths.include?(path)
if File.exist?(path)
filename = File.basename(path)
upload = @uploader.create_upload(user_id, path, filename)
if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
raw << "\n" << @uploader.html_for_upload(upload, filename)
end
else
STDERR.puts "Could not find file: #{path}"
end
end
raw
end
def replace_embedded_attachments(raw, user_id)
paths = []
upload_ids = []
raw = raw.gsub(EMBEDDED_IMAGE_REGEX) do
path = File.join(@path, Regexp.last_match['path'])
filename = File.basename(path)
path = find_image_path(filename)
if path
upload = @uploader.create_upload(user_id, path, filename)
if upload.present? && upload.persisted?
paths << path
upload_ids << upload.id
@uploader.html_for_upload(upload, filename)
end
else
STDERR.puts "Could not find file: #{path}"
end
end
[raw, paths, upload_ids]
end
def find_image_path(filename)
[QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY].each do |directory|
path = File.join(@path, directory, filename)
return path if File.exist?(path)
end
end
def replace_question_links(raw)
raw.gsub(QUESTION_LINK_REGEX) do
topic_id = Regexp.last_match("id")
topic = topic_lookup_from_imported_post_id(topic_id)
return Regexp.last_match.to_s unless topic
url = File.join(NEW_DOMAIN, topic[:url])
text = Regexp.last_match("text")
text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
end
end
def add_permalink_normalizations
normalizations = SiteSetting.permalink_normalizations
normalizations = normalizations.blank? ? [] : normalizations.split('|')
add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)
SiteSetting.permalink_normalizations = normalizations.join('|')
end
def add_normalization(normalizations, normalization)
normalizations << normalization unless normalizations.include?(normalization)
end
def permalink_exists?(url)
Permalink.find_by(url: url)
end
def csv_parse(table_name)
CSV.foreach(File.join(@path, "#{table_name}.csv"),
headers: true,
header_converters: :symbol,
skip_blanks: true,
encoding: 'bom|utf-8') { |row| yield row }
end
end
unless ARGV[0] && Dir.exist?(ARGV[0])
puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
exit 1
end
ImportScripts::Answerbase.new(ARGV[0]).perform