FEATURE: Import script for AnswerBase

Improves the generic database used by some import scripts:
* Adds additional columns for users
* Adds support for attachments
* Allows setting the data type for keys (numeric or string) to ensure correct sorting
This commit is contained in:
Gerhard Schlager 2019-02-28 21:59:36 +01:00
parent ff26b4ed9b
commit c36c9c2ee5
5 changed files with 471 additions and 40 deletions

View File

@ -202,10 +202,11 @@ gem 'rchardet', require: false
if ENV["IMPORT"] == "1" if ENV["IMPORT"] == "1"
gem 'mysql2' gem 'mysql2'
gem 'redcarpet' gem 'redcarpet'
gem 'sqlite3', '~> 1.3.13' gem 'sqlite3', '~> 1.3', '>= 1.3.13'
gem 'ruby-bbcode-to-md', git: 'https://github.com/nlalonde/ruby-bbcode-to-md' gem 'ruby-bbcode-to-md', git: 'https://github.com/nlalonde/ruby-bbcode-to-md'
gem 'reverse_markdown' gem 'reverse_markdown'
gem 'tiny_tds' gem 'tiny_tds'
gem 'csv', '~> 3.0'
end end
gem 'webpush', require: false gem 'webpush', require: false

View File

@ -0,0 +1,341 @@
require 'csv'
require 'reverse_markdown'
require_relative 'base'
require_relative 'base/generic_database'
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
class ImportScripts::Answerbase < ImportScripts::Base
# Base URL of the old AnswerBase site. It is interpolated into
# QUESTION_LINK_REGEX and used to detect links whose text is itself an old URL.
OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
# Base URL that rewritten question links are prefixed with.
NEW_DOMAIN = "https://discourse.example.com"
# Sub-directories of the import directory; each is joined with @path when
# building the full path of an avatar, attachment or image.
AVATAR_DIRECTORY = "User Images"
ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
ANSWER_IMAGE_DIRECTORY = "Answer Images"
QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
QUESTION_IMAGE_DIRECTORY = "Question Images"
# Matches an <a> wrapping an <img> whose href contains a `relativeUrl=`
# parameter; the parameter value (up to the next `&` or `"`) is captured as `path`.
EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i
# Matches an <a> pointing at OLD_DOMAIN whose URL contains `q<digits>` or
# `questionid=<digits>`; captures the digits as `id` and the link body as `text`.
QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i
# Normalization rule appended to SiteSetting.permalink_normalizations. It reduces
# a URL containing `-q<digits>` to `q<digits>`, the form import_topics stores as permalink URL.
TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'
# Batch size handed to the generic database and to `batches`.
BATCH_SIZE = 1000
# Prepares the importer for the export stored in +path+.
#
# @param path [String] directory containing the CSV files and the exported
#   attachment/image directories; the intermediate database is created for
#   this directory with `recreate: true` and numeric keys.
def initialize(path)
  super()
  @path = path

  database_options = {
    batch_size: BATCH_SIZE,
    recreate: true,
    numeric_keys: true
  }
  @db = ImportScripts::GenericDatabase.new(@path, **database_options)
end
# Runs every import step in order: the CSV files are read first, then the
# permalink normalization is registered, then categories, users, topics and
# posts are created.
#
# @return the value returned by the last step (import_posts)
def execute
  steps = %i[
    read_csv_files
    add_permalink_normalizations
    import_categories
    import_users
    import_topics
    import_posts
  ]

  # `map` + `last` keeps the return value of the final step.
  steps.map { |step| send(step) }.last
end
# Loads the three CSV exports ("categories", "users" and
# "questions-answers-comments") into the intermediate database.
#
# Categories get a 1-based position in file order. Rows of the combined
# questions/answers/comments file are processed in file order: a row of type
# "Question" becomes a topic, and every other row becomes a post of the most
# recently read question.
#
# Rows with `published == "No"` are skipped. When the skipped row is a question,
# the rows that follow it are skipped as well until the next question, so that
# they are not attached to the previous question's topic.
#
# Any error raised while inserting a row is re-raised after printing the row.
def read_csv_files
  puts "", "reading CSV files..."

  category_position = 0
  csv_parse("categories") do |row|
    category_position += 1
    @db.insert_category(
      id: row[:id],
      name: row[:name],
      position: category_position
    )
  end

  csv_parse("users") do |row|
    @db.insert_user(
      id: row[:id],
      email: row[:email],
      username: row[:username],
      bio: row[:description],
      avatar_path: row[:profile_image],
      created_at: parse_date(row[:createtime]),
      active: true
    )
  end

  # ID of the question the following answers/comments belong to;
  # nil while there is no imported question to attach them to.
  last_topic_id = nil

  csv_parse("questions-answers-comments") do |row|
    question = row[:type] == "Question"

    if row[:published] == "No"
      # Forget the parent so that replies to an unpublished question
      # do not end up in the topic of the previous question.
      last_topic_id = nil if question
      next
    end

    if !question && last_topic_id.nil?
      STDERR.puts "Skipping #{row[:type]} #{row[:id]}: its question was not imported"
      next
    end

    user_id = @db.get_user_id(row[:username])
    created_at = parse_datetime(row[:createtime])

    begin
      if question
        attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
          parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)

        @db.insert_topic(
          id: row[:id],
          title: row[:title],
          raw: row[:text],
          category_id: row[:categorylist],
          user_id: user_id,
          created_at: created_at,
          attachments: attachments
        )
        last_topic_id = row[:id]
      else
        attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
          parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)

        @db.insert_post(
          id: row[:id],
          raw: row[:text],
          topic_id: last_topic_id,
          user_id: user_id,
          created_at: created_at,
          attachments: attachments
        )
      end
    rescue
      # Show the offending row before propagating the error.
      p row
      raise
    end
  end
end
# Turns a semicolon-separated list of file names into full paths.
#
# @param text [String, nil] file names separated by ';'
# @param directory [String] sub-directory of the import directory
# @return [Array<String>] one path per name (whitespace around each name is
#   stripped), or an empty array when +text+ is blank
def parse_filenames(text, directory)
  return [] if text.blank?

  names = text.split(';')
  names.map do |name|
    File.join(@path, directory, name.strip)
  end
end
# Parses a date written as month/day/two-digit-year ("%m/%d/%y").
#
# @param text [String, nil]
# @return [DateTime, nil] nil when +text+ is blank
def parse_date(text)
  text.blank? ? nil : DateTime.strptime(text, "%m/%d/%y")
end
# Parses a timestamp with DateTime.parse and converts it to UTC.
#
# Alternative with an explicit format, kept for reference:
#   DateTime.strptime(text, "%m/%d/%Y %H:%M")
#
# @param text [String, nil]
# @return [DateTime, nil] nil when +text+ is blank
def parse_datetime(text)
  unless text.blank?
    parsed = DateTime.parse(text)
    parsed.utc.to_datetime
  end
end
# Creates the categories stored in the intermediate database, passing the
# id, name, description and position of each row to create_categories.
def import_categories
  puts "", "creating categories"

  create_categories(@db.fetch_categories) do |category|
    %w[id name description position].each_with_object({}) do |column, attributes|
      attributes[column.to_sym] = category[column]
    end
  end
end
# Delegates to the inherited `batches`, always using BATCH_SIZE, and forwards
# the given block to it.
def batches(&block)
  super(BATCH_SIZE, &block)
end
# Creates users from the intermediate database, one batch at a time.
#
# Batches are fetched by passing the last id returned by the previous fetch.
# A batch is skipped when all_records_exist? reports all of its ids as already
# imported. After a user has been created, its avatar is uploaded through
# create_avatar.
def import_users
  puts "", "creating users"
  user_count = @db.count_users
  last_id = 0

  batches do |offset|
    rows, last_id = @db.fetch_users(last_id)
    break if rows.empty?

    imported_ids = rows.map { |row| row['id'] }
    next if all_records_exist?(:users, imported_ids)

    create_users(rows, total: user_count, offset: offset) do |row|
      avatar_callback = proc { |user| create_avatar(user, row['avatar_path']) }

      {
        id: row['id'],
        email: row['email'],
        username: row['username'],
        bio_raw: row['bio'],
        created_at: row['created_at'],
        active: row['active'] == 1,
        post_create_action: avatar_callback
      }
    end
  end
end
# Uploads the avatar of +user+ from the avatar directory of the export.
#
# @param user the user that receives the avatar
# @param avatar_path [String, nil] file name relative to AVATAR_DIRECTORY;
#   nothing happens when it is blank
# @return the result of @uploader.create_avatar, or nil when the path is blank
#   or the file is missing (a message is written to STDERR in the latter case)
def create_avatar(user, avatar_path)
  return if avatar_path.blank?

  full_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)

  unless File.exist?(full_path)
    STDERR.puts "Could not find avatar: #{full_path}"
    return
  end

  @uploader.create_avatar(user, full_path)
end
# Creates topics from the intermediate database, one batch at a time.
#
# Batches are fetched by passing the last id returned by the previous fetch.
# A batch is skipped when all_records_exist? reports all of its ids as already
# imported. Topics without an imported author are assigned to the system user,
# and the title is used as body when the question has no text. After a topic
# has been created, a permalink "q<id>" pointing to it is added unless one
# already exists.
def import_topics
  puts "", "creating topics"
  topic_count = @db.count_topics
  last_id = 0

  batches do |offset|
    rows, last_id = @db.fetch_topics(last_id)
    break if rows.empty?

    topic_ids = rows.map { |row| row['id'] }
    next if all_records_exist?(:posts, topic_ids)

    create_posts(rows, total: topic_count, offset: offset) do |row|
      imported_id = row['id']
      attachments = row['upload_count'] > 0 ? @db.fetch_topic_attachments(imported_id) : nil
      author_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id
      body = raw_with_attachments(row['raw'].presence || row['title'], attachments, author_id)

      permalink_callback = proc do |post|
        url = "q#{imported_id}"
        Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
      end

      {
        id: imported_id,
        title: row['title'],
        raw: body,
        category: category_id_from_imported_category_id(row['category_id']),
        user_id: author_id,
        created_at: row['created_at'],
        closed: row['closed'] == 1,
        post_create_action: permalink_callback
      }
    end
  end
end
# Creates posts (answers and comments) from the intermediate database, one
# batch at a time.
#
# Batches are fetched by passing the last row id returned by the previous
# fetch. A batch is skipped when all_records_exist? reports all of its ids as
# already imported. Posts without an imported author are assigned to the
# system user.
#
# A post whose topic cannot be looked up is reported on STDERR and the block
# yields nil for it instead of raising NoMethodError on `topic[:topic_id]`.
# NOTE(review): this relies on create_posts treating a nil block result as
# "skip this row" -- confirm against ImportScripts::Base.
def import_posts
  puts "", "creating posts"
  total_count = @db.count_posts
  last_row_id = 0

  batches do |offset|
    rows, last_row_id = @db.fetch_posts(last_row_id)
    break if rows.empty?
    next if all_records_exist?(:posts, rows.map { |row| row['id'] })

    create_posts(rows, total: total_count, offset: offset) do |row|
      topic = topic_lookup_from_imported_post_id(row['topic_id'])

      unless topic
        STDERR.puts "Could not find topic #{row['topic_id']} for post #{row['id']}"
        next
      end

      attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0
      user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id

      {
        id: row['id'],
        raw: raw_with_attachments(row['raw'], attachments, user_id),
        user_id: user_id,
        topic_id: topic[:topic_id],
        created_at: row['created_at']
      }
    end
  end
end
# Builds the final post body.
#
# Embedded images are replaced first, then links to old questions are
# rewritten, and the result is converted with ReverseMarkdown (an empty string
# is used when the conversion returns nil). Finally each attachment is uploaded
# and its HTML is appended on a new line, unless
# * its path was already handled as an embedded image,
# * the file does not exist (reported on STDERR),
# * the upload failed, or
# * the resulting upload is already one of the embedded uploads.
#
# @param raw [String] original HTML of the post
# @param attachments [Array<Hash>, nil] rows with a 'path' entry
# @param user_id [Integer] owner of the created uploads
# @return [String]
def raw_with_attachments(raw, attachments, user_id)
  html, embedded_paths, embedded_upload_ids = replace_embedded_attachments(raw, user_id)
  markdown = ReverseMarkdown.convert(replace_question_links(html)) || ""

  (attachments || []).each do |attachment|
    file_path = attachment['path']
    next if embedded_paths.include?(file_path)

    unless File.exist?(file_path)
      STDERR.puts "Could not find file: #{file_path}"
      next
    end

    basename = File.basename(file_path)
    upload = @uploader.create_upload(user_id, file_path, basename)
    next unless upload.present? && upload.persisted?
    next if embedded_upload_ids.include?(upload.id)

    markdown << "\n" << @uploader.html_for_upload(upload, basename)
  end

  markdown
end
# Replaces embedded images (links matching EMBEDDED_IMAGE_REGEX) with the HTML
# of the corresponding upload.
#
# Only the file name of the linked path is used; the file itself is located
# with find_image_path. When the file cannot be found or the upload fails, the
# embedded image is removed from the text. A missing file is reported on
# STDERR together with the path that was requested by the link.
#
# @param raw [String] HTML of the post
# @param user_id [Integer] owner of the created uploads
# @return [Array] the rewritten HTML, the paths of the uploaded files and the
#   ids of the created uploads
def replace_embedded_attachments(raw, user_id)
  paths = []
  upload_ids = []

  raw = raw.gsub(EMBEDDED_IMAGE_REGEX) do
    # Keep the requested path in its own variable so the error message can
    # still name it when the lookup below yields nothing.
    requested_path = File.join(@path, Regexp.last_match['path'])
    filename = File.basename(requested_path)
    image_path = find_image_path(filename)

    if image_path
      upload = @uploader.create_upload(user_id, image_path, filename)

      # A failed upload yields nil, which gsub turns into an empty string.
      if upload.present? && upload.persisted?
        paths << image_path
        upload_ids << upload.id
        @uploader.html_for_upload(upload, filename)
      end
    else
      STDERR.puts "Could not find file: #{requested_path}"
      nil
    end
  end

  [raw, paths, upload_ids]
end
# Looks for +filename+ in the question image directory and then in the answer
# image directory of the export.
#
# @param filename [String] base name of the image
# @return [String, nil] path of the first existing file, or nil when the file
#   exists in neither directory
def find_image_path(filename)
  # `find` yields nil when nothing matches; iterating with `each` would
  # return the (truthy) directory list instead.
  [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY]
    .map { |directory| File.join(@path, directory, filename) }
    .find { |path| File.exist?(path) }
end
# Rewrites links to questions on the old site so that they point to the
# imported topics.
#
# A link whose question cannot be looked up is left untouched. When the link
# text itself contains OLD_DOMAIN, the whole link is replaced by the bare new
# URL; otherwise a new <a> with the original text is produced.
#
# @param raw [String] HTML of the post
# @return [String] the complete text with all known question links rewritten
def replace_question_links(raw)
  raw.gsub(QUESTION_LINK_REGEX) do
    match = Regexp.last_match
    topic = topic_lookup_from_imported_post_id(match[:id])

    # `next` keeps this one link as it is; `return` would leave the whole
    # method and hand back only the matched fragment instead of the text.
    next match.to_s unless topic

    url = File.join(NEW_DOMAIN, topic[:url])
    text = match[:text]
    text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
  end
end
# Makes sure TOPIC_LINK_NORMALIZATION is part of the pipe-separated
# SiteSetting.permalink_normalizations, keeping all existing entries.
def add_permalink_normalizations
  current = SiteSetting.permalink_normalizations

  entries = []
  entries = current.split('|') unless current.blank?

  add_normalization(entries, TOPIC_LINK_NORMALIZATION)
  SiteSetting.permalink_normalizations = entries.join('|')
end
# Appends +normalization+ to +normalizations+ (in place) unless the list
# already contains it.
#
# @return [Array, nil] the list when the entry was added, nil otherwise
def add_normalization(normalizations, normalization)
  return if normalizations.include?(normalization)

  normalizations << normalization
end
# Tells whether a permalink with the given URL already exists.
#
# Uses an existence check instead of loading the whole record, so the
# predicate returns true or false rather than a Permalink or nil.
#
# @param url [String]
# @return [Boolean]
def permalink_exists?(url)
  Permalink.exists?(url: url)
end
# Yields every row of "<table_name>.csv" in the import directory.
#
# The file is read with a header row (headers converted to symbols), blank
# lines are skipped and the encoding is 'bom|utf-8'.
#
# @param table_name [String] file name without the ".csv" extension
# @yield [row] each CSV row
def csv_parse(table_name)
  csv_file = File.join(@path, "#{table_name}.csv")
  options = {
    headers: true,
    header_converters: :symbol,
    skip_blanks: true,
    encoding: 'bom|utf-8'
  }

  CSV.foreach(csv_file, **options) { |row| yield row }
end
end
# Entry point: expects the export directory as the first argument, prints the
# usage and exits with status 1 when it is missing or not a directory.
import_directory = ARGV[0]

if import_directory.nil? || !Dir.exist?(import_directory)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
  exit 1
end

ImportScripts::Answerbase.new(import_directory).perform

View File

@ -2,12 +2,13 @@ require 'sqlite3'
module ImportScripts module ImportScripts
class GenericDatabase class GenericDatabase
def initialize(directory, batch_size:, recreate: false) def initialize(directory, batch_size:, recreate: false, numeric_keys: false)
filename = "#{directory}/index.db" filename = "#{directory}/index.db"
File.delete(filename) if recreate && File.exists?(filename) File.delete(filename) if recreate && File.exists?(filename)
@db = SQLite3::Database.new(filename, results_as_hash: true) @db = SQLite3::Database.new(filename, results_as_hash: true)
@batch_size = batch_size @batch_size = batch_size
@numeric_keys = numeric_keys
configure_database configure_database
create_category_table create_category_table
@ -25,36 +26,72 @@ module ImportScripts
def insert_user(user) def insert_user(user)
@db.execute(<<-SQL, prepare(user)) @db.execute(<<-SQL, prepare(user))
INSERT OR REPLACE INTO user (id, email, username, name, created_at, last_seen_at, active) INSERT OR REPLACE
VALUES (:id, :email, :username, :name, :created_at, :last_seen_at, :active) INTO user (id, email, username, name, bio, avatar_path, created_at, last_seen_at, active)
VALUES (:id, :email, :username, :name, :bio, :avatar_path, :created_at, :last_seen_at, :active)
SQL SQL
end end
def insert_topic(topic) def insert_topic(topic)
attachments = topic.delete(:attachments)
topic[:upload_count] = attachments&.size || 0
@db.execute(<<-SQL, prepare(topic)) @db.execute(<<-SQL, prepare(topic))
INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url) INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url, upload_count)
VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url) VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url, :upload_count)
SQL SQL
attachments&.each do |attachment|
@db.execute(<<-SQL, topic_id: topic[:id], path: attachment)
INSERT OR REPLACE INTO topic_upload (topic_id, path)
VALUES (:topic_id, :path)
SQL
end
end end
def insert_post(post) def insert_post(post)
attachments = post.delete(:attachments)
post[:upload_count] = attachments&.size || 0
@db.execute(<<-SQL, prepare(post)) @db.execute(<<-SQL, prepare(post))
INSERT OR REPLACE INTO post (id, raw, topic_id, user_id, created_at, reply_to_post_id, url) INSERT OR REPLACE INTO post (id, raw, topic_id, user_id, created_at, reply_to_post_id, url, upload_count)
VALUES (:id, :raw, :topic_id, :user_id, :created_at, :reply_to_post_id, :url) VALUES (:id, :raw, :topic_id, :user_id, :created_at, :reply_to_post_id, :url, :upload_count)
SQL SQL
attachments&.each do |attachment|
@db.execute(<<-SQL, post_id: post[:id], path: attachment)
INSERT OR REPLACE INTO post_upload (post_id, path)
VALUES (:post_id, :path)
SQL
end
end end
def sort_posts_by_created_at def sort_posts_by_created_at
@db.execute 'DELETE FROM post_order' @db.execute 'DELETE FROM post_order'
@db.execute <<-SQL @db.execute <<-SQL
INSERT INTO post_order (id) INSERT INTO post_order (post_id)
SELECT id SELECT id
FROM post FROM post
ORDER BY created_at, topic_id, id ORDER BY created_at, topic_id, id
SQL SQL
end end
def delete_unused_users
@db.execute <<~SQL
DELETE FROM user
WHERE NOT EXISTS(
SELECT 1
FROM topic
WHERE topic.user_id = user.id
) AND NOT EXISTS(
SELECT 1
FROM post
WHERE post.user_id = user.id
)
SQL
end
def fetch_categories def fetch_categories
@db.execute(<<-SQL) @db.execute(<<-SQL)
SELECT * SELECT *
@ -82,6 +119,14 @@ module ImportScripts
add_last_column_value(rows, 'id') add_last_column_value(rows, 'id')
end end
def get_user_id(username)
@db.get_first_value(<<-SQL, username)
SELECT id
FROM user
WHERE username = :username
SQL
end
def count_topics def count_topics
@db.get_first_value(<<-SQL) @db.get_first_value(<<-SQL)
SELECT COUNT(*) SELECT COUNT(*)
@ -101,6 +146,14 @@ module ImportScripts
add_last_column_value(rows, 'id') add_last_column_value(rows, 'id')
end end
def fetch_topic_attachments(topic_id)
@db.execute(<<-SQL, topic_id)
SELECT path
FROM topic_upload
WHERE topic_id = :topic_id
SQL
end
def count_posts def count_posts
@db.get_first_value(<<-SQL) @db.get_first_value(<<-SQL)
SELECT COUNT(*) SELECT COUNT(*)
@ -110,9 +163,21 @@ module ImportScripts
def fetch_posts(last_row_id) def fetch_posts(last_row_id)
rows = @db.execute(<<-SQL, last_row_id) rows = @db.execute(<<-SQL, last_row_id)
SELECT o.ROWID, p.* SELECT ROWID AS rowid, *
FROM post
WHERE ROWID > :last_row_id
ORDER BY ROWID
LIMIT #{@batch_size}
SQL
add_last_column_value(rows, 'rowid')
end
def fetch_sorted_posts(last_row_id)
rows = @db.execute(<<-SQL, last_row_id)
SELECT o.ROWID AS rowid, p.*
FROM post p FROM post p
JOIN post_order o USING (id) JOIN post_order o ON (p.id = o.post_id)
WHERE o.ROWID > :last_row_id WHERE o.ROWID > :last_row_id
ORDER BY o.ROWID ORDER BY o.ROWID
LIMIT #{@batch_size} LIMIT #{@batch_size}
@ -121,6 +186,14 @@ module ImportScripts
add_last_column_value(rows, 'rowid') add_last_column_value(rows, 'rowid')
end end
def fetch_post_attachments(post_id)
@db.execute(<<-SQL, post_id)
SELECT path
FROM post_upload
WHERE post_id = :post_id
SQL
end
def execute_sql(sql) def execute_sql(sql)
@db.execute(sql) @db.execute(sql)
end end
@ -136,10 +209,14 @@ module ImportScripts
@db.execute 'PRAGMA locking_mode = EXCLUSIVE' @db.execute 'PRAGMA locking_mode = EXCLUSIVE'
end end
def key_data_type
@numeric_keys ? 'INTEGER' : 'TEXT'
end
def create_category_table def create_category_table
@db.execute <<-SQL @db.execute <<-SQL
CREATE TABLE IF NOT EXISTS category ( CREATE TABLE IF NOT EXISTS category (
id TEXT NOT NULL PRIMARY KEY, id #{key_data_type} NOT NULL PRIMARY KEY,
name TEXT NOT NULL, name TEXT NOT NULL,
description TEXT, description TEXT,
position INTEGER, position INTEGER,
@ -151,44 +228,59 @@ module ImportScripts
def create_user_table def create_user_table
@db.execute <<-SQL @db.execute <<-SQL
CREATE TABLE IF NOT EXISTS user ( CREATE TABLE IF NOT EXISTS user (
id TEXT NOT NULL PRIMARY KEY, id #{key_data_type} NOT NULL PRIMARY KEY,
email TEXT, email TEXT,
username TEXT, username TEXT,
name TEXT, name TEXT,
bio TEXT,
avatar_path TEXT,
created_at DATETIME, created_at DATETIME,
last_seen_at DATETIME, last_seen_at DATETIME,
active BOOLEAN NOT NULL DEFAULT true active BOOLEAN NOT NULL DEFAULT true
) )
SQL SQL
@db.execute 'CREATE INDEX IF NOT EXISTS user_by_username ON user (username)'
end end
def create_topic_table def create_topic_table
@db.execute <<-SQL @db.execute <<-SQL
CREATE TABLE IF NOT EXISTS topic ( CREATE TABLE IF NOT EXISTS topic (
id TEXT NOT NULL PRIMARY KEY, id #{key_data_type} NOT NULL PRIMARY KEY,
title TEXT, title TEXT,
raw TEXT, raw TEXT,
category_id TEXT NOT NULL, category_id #{key_data_type},
closed BOOLEAN NOT NULL DEFAULT false, closed BOOLEAN NOT NULL DEFAULT false,
user_id TEXT NOT NULL, user_id #{key_data_type} NOT NULL,
created_at DATETIME, created_at DATETIME,
url TEXT url TEXT,
upload_count INTEGER DEFAULT 0
) )
SQL SQL
@db.execute 'CREATE INDEX IF NOT EXISTS topic_by_user_id ON topic (user_id)' @db.execute 'CREATE INDEX IF NOT EXISTS topic_by_user_id ON topic (user_id)'
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS topic_upload (
topic_id #{key_data_type} NOT NULL,
path TEXT NOT NULL
)
SQL
@db.execute 'CREATE UNIQUE INDEX IF NOT EXISTS topic_upload_unique ON topic_upload(topic_id, path)'
end end
def create_post_table def create_post_table
@db.execute <<-SQL @db.execute <<-SQL
CREATE TABLE IF NOT EXISTS post ( CREATE TABLE IF NOT EXISTS post (
id TEXT NOT NULL PRIMARY KEY, id #{key_data_type} NOT NULL PRIMARY KEY,
raw TEXT, raw TEXT,
topic_id TEXT NOT NULL, topic_id #{key_data_type} NOT NULL,
user_id TEXT NOT NULL, user_id #{key_data_type} NOT NULL,
created_at DATETIME, created_at DATETIME,
reply_to_post_id TEXT, reply_to_post_id #{key_data_type},
url TEXT url TEXT,
upload_count INTEGER DEFAULT 0
) )
SQL SQL
@ -196,9 +288,18 @@ module ImportScripts
@db.execute <<-SQL @db.execute <<-SQL
CREATE TABLE IF NOT EXISTS post_order ( CREATE TABLE IF NOT EXISTS post_order (
id TEXT NOT NULL PRIMARY KEY post_id #{key_data_type} NOT NULL PRIMARY KEY
) )
SQL SQL
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS post_upload (
post_id #{key_data_type} NOT NULL,
path TEXT NOT NULL
)
SQL
@db.execute 'CREATE UNIQUE INDEX IF NOT EXISTS post_upload_unique ON post_upload(post_id, path)'
end end
def prepare(hash) def prepare(hash)

View File

@ -82,19 +82,7 @@ class ImportScripts::Zendesk < ImportScripts::Base
) )
end end
@db.execute_sql(<<~SQL) @db.delete_unused_users
DELETE FROM user
WHERE NOT EXISTS(
SELECT 1
FROM topic
WHERE topic.user_id = user.id
) AND NOT EXISTS(
SELECT 1
FROM post
WHERE post.user_id = user.id
)
SQL
@db.sort_posts_by_created_at @db.sort_posts_by_created_at
end end
@ -188,7 +176,7 @@ class ImportScripts::Zendesk < ImportScripts::Base
last_row_id = 0 last_row_id = 0
batches do |offset| batches do |offset|
rows, last_row_id = @db.fetch_posts(last_row_id) rows, last_row_id = @db.fetch_sorted_posts(last_row_id)
break if rows.empty? break if rows.empty?
next if all_records_exist?(:posts, rows.map { |row| row['id'] }) next if all_records_exist?(:posts, rows.map { |row| row['id'] })

View File

@ -8,7 +8,7 @@ require_relative 'base/generic_database'
# Call it like this: # Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN # RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN
class ImportScripts::Zendesk < ImportScripts::Base class ImportScripts::ZendeskApi < ImportScripts::Base
BATCH_SIZE = 1000 BATCH_SIZE = 1000
def initialize(source_url, path, auth_email, auth_token) def initialize(source_url, path, auth_email, auth_token)
@ -193,7 +193,7 @@ class ImportScripts::Zendesk < ImportScripts::Base
last_row_id = 0 last_row_id = 0
batches do |offset| batches do |offset|
rows, last_row_id = @db.fetch_posts(last_row_id) rows, last_row_id = @db.fetch_sorted_posts(last_row_id)
break if rows.empty? break if rows.empty?
create_posts(rows, total: total_count, offset: offset) do |row| create_posts(rows, total: total_count, offset: offset) do |row|
@ -303,4 +303,4 @@ unless ARGV.length == 4 && Dir.exist?(ARGV[1])
exit 1 exit 1
end end
ImportScripts::Zendesk.new(ARGV[0], ARGV[1], ARGV[2], ARGV[3]).perform ImportScripts::ZendeskApi.new(ARGV[0], ARGV[1], ARGV[2], ARGV[3]).perform