mirror of
https://github.com/discourse/discourse.git
synced 2024-11-24 20:51:50 +08:00
30990006a9
This reduces the chance of errors where consumers of strings mutate their inputs, and it reduces the memory usage of the app. The test suite passes now, but there may be some cases left, so we will run a few sites on a branch prior to merging.
344 lines
9.5 KiB
Ruby
344 lines
9.5 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require 'csv'
|
|
require 'reverse_markdown'
|
|
require_relative 'base'
|
|
require_relative 'base/generic_database'
|
|
|
|
# Call it like this:
|
|
# RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
|
|
class ImportScripts::Answerbase < ImportScripts::Base
|
|
# Base URLs: links pointing at OLD_DOMAIN are rewritten to NEW_DOMAIN.
OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
NEW_DOMAIN = "https://discourse.example.com"

# Sub-directories of the export directory that hold uploaded files.
AVATAR_DIRECTORY              = "User Images"
ANSWER_ATTACHMENT_DIRECTORY   = "Answer Attachments"
ANSWER_IMAGE_DIRECTORY        = "Answer Images"
QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
QUESTION_IMAGE_DIRECTORY      = "Question Images"

# Matches a linked thumbnail (<a ...relativeUrl=PATH...><img ...></a>) and
# captures PATH in the named group "path".
EMBEDDED_IMAGE_REGEX = /<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*<\/a>/i

# Matches a link to a question on OLD_DOMAIN; captures the question id ("id")
# and the link text ("text").
QUESTION_LINK_REGEX = /<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}\/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)<\/a>/i

# Permalink normalization rule registered in the site settings.
TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'

# Number of rows handled per batch (database reads and import batches).
BATCH_SIZE = 1000
|
|
|
|
# Prepares the importer for one export directory.
#
# path - directory containing the exported CSV files and upload folders.
#
# The intermediate database is created via ImportScripts::GenericDatabase
# with recreate: true and numeric_keys: true.
def initialize(path)
  super()

  @path = path

  db_options = { batch_size: BATCH_SIZE, recreate: true, numeric_keys: true }
  @db = ImportScripts::GenericDatabase.new(@path, **db_options)
end
|
|
|
|
# Runs the complete import, one stage after the other.
def execute
  # Stage 1: load the CSV export into the intermediate database.
  read_csv_files

  # Stage 2: register the permalink rule, then create the records in order:
  # categories, users, topics and finally posts.
  add_permalink_normalizations
  import_categories
  import_users
  import_topics
  import_posts
end
|
|
|
|
# Loads all exported CSV files into the intermediate database (@db).
def read_csv_files
  puts "", "reading CSV files..."

  read_categories_csv
  read_users_csv
  read_questions_answers_comments_csv
end

# Inserts every row of categories.csv; positions are numbered from 1 in file order.
def read_categories_csv
  category_position = 0

  csv_parse("categories") do |row|
    @db.insert_category(
      id: row[:id],
      name: row[:name],
      position: category_position += 1
    )
  end
end

# Inserts every row of users.csv; every user is stored as active.
def read_users_csv
  csv_parse("users") do |row|
    @db.insert_user(
      id: row[:id],
      email: row[:email],
      username: row[:username],
      bio: row[:description],
      avatar_path: row[:profile_image],
      created_at: parse_date(row[:createtime]),
      active: true
    )
  end
end

# Inserts the rows of questions-answers-comments.csv: rows of type "Question"
# become topics, every other row becomes a post of the most recently inserted
# question. Rows whose "published" column is "No" are skipped.
#
# NOTE(review): skipping an unpublished question does not reset last_topic_id,
# so published answers following it would be attached to the previous
# question - confirm whether the export can contain such rows.
def read_questions_answers_comments_csv
  last_topic_id = nil

  csv_parse("questions-answers-comments") do |row|
    next if row[:published] == "No"

    user_id = @db.get_user_id(row[:username])
    created_at = parse_datetime(row[:createtime])

    begin
      if row[:type] == "Question"
        attachments = parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
          parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)

        @db.insert_topic(
          id: row[:id],
          title: row[:title],
          raw: row[:text],
          category_id: row[:categorylist],
          user_id: user_id,
          created_at: created_at,
          attachments: attachments
        )
        last_topic_id = row[:id]
      else
        attachments = parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
          parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)

        @db.insert_post(
          id: row[:id],
          raw: row[:text],
          topic_id: last_topic_id,
          user_id: user_id,
          created_at: created_at,
          attachments: attachments
        )
      end
    rescue
      # Print the offending row before re-raising so the bad data can be found.
      p row
      raise
    end
  end
end
|
|
|
|
# Converts a semicolon-separated list of file names into paths.
#
# text      - String such as "a.png; b.pdf"; nil or blank yields [].
# directory - sub-directory of the export directory that holds the files.
#
# Returns an Array of paths below @path/directory. Empty entries (from a
# doubled or trailing ";" or whitespace between separators) are dropped:
# joining an empty name would yield the directory itself, which passes a
# File.exist? check and would then be treated as an attachment.
def parse_filenames(text, directory)
  text
    .to_s
    .split(';')
    .map(&:strip)
    .reject(&:empty?)
    .map { |filename| File.join(@path, directory, filename) }
end
|
|
|
|
# Parses a date written as month/day/two-digit-year ("%m/%d/%y").
#
# Returns a DateTime, or nil when text is blank.
def parse_date(text)
  DateTime.strptime(text, "%m/%d/%y") unless text.blank?
end
|
|
|
|
# Parses a free-form timestamp and converts it to UTC.
#
# Returns a DateTime, or nil when text is blank.
def parse_datetime(text)
  # DateTime.strptime(text, "%m/%d/%Y %H:%M")
  DateTime.parse(text).utc.to_datetime unless text.blank?
end
|
|
|
|
# Creates one category for every category row in the intermediate database.
def import_categories
  puts "", "creating categories"

  create_categories(@db.fetch_categories) do |category|
    {
      id: category['id'],
      name: category['name'],
      description: category['description'],
      position: category['position']
    }
  end
end
|
|
|
|
# Delegates to the superclass implementation with this script's BATCH_SIZE;
# the caller's block is forwarded implicitly by super.
def batches
  super(BATCH_SIZE)
end
|
|
|
|
# Creates users from the intermediate database, one batch at a time.
#
# A batch is skipped when all of its ids are already known as imported;
# the avatar is attached after each user has been created.
def import_users
  puts "", "creating users"

  total_count = @db.count_users
  last_id = 0

  batches do |offset|
    rows, last_id = @db.fetch_users(last_id)
    break if rows.empty?

    imported_ids = rows.map { |row| row['id'] }
    next if all_records_exist?(:users, imported_ids)

    create_users(rows, total: total_count, offset: offset) do |row|
      attach_avatar = proc { |user| create_avatar(user, row['avatar_path']) }

      {
        id: row['id'],
        email: row['email'],
        username: row['username'],
        bio_raw: row['bio'],
        created_at: row['created_at'],
        active: row['active'] == 1,
        post_create_action: attach_avatar
      }
    end
  end
end
|
|
|
|
# Uploads the avatar of a user from the AVATAR_DIRECTORY of the export.
#
# user        - the user that receives the avatar.
# avatar_path - file name relative to the avatar directory; blank means
#               there is no avatar and nothing happens.
#
# A missing file is reported on STDERR and otherwise ignored.
def create_avatar(user, avatar_path)
  return if avatar_path.blank?

  full_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)

  unless File.exist?(full_path)
    STDERR.puts "Could not find avatar: #{full_path}"
    return
  end

  @uploader.create_avatar(user, full_path)
end
|
|
|
|
# Creates a topic for every question stored in the intermediate database.
#
# Rows are processed in batches; a batch is skipped when all of its ids are
# already known as imported. After a topic has been created, a permalink
# "q<ID>" pointing at it is added unless one exists already.
def import_topics
  puts "", "creating topics"

  total_count = @db.count_topics
  last_id = 0

  batches do |offset|
    rows, last_id = @db.fetch_topics(last_id)
    break if rows.empty?

    topic_ids = rows.map { |row| row['id'] }
    next if all_records_exist?(:posts, topic_ids)

    create_posts(rows, total: total_count, offset: offset) do |row|
      # Attachments are only looked up when the row says it has any.
      attachments = row['upload_count'] > 0 ? @db.fetch_topic_attachments(row['id']) : nil
      user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id

      add_permalink = proc do |post|
        url = "q#{row['id']}"
        Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
      end

      {
        id: row['id'],
        title: row['title'],
        # A question without body text falls back to its title.
        raw: raw_with_attachments(row['raw'].presence || row['title'], attachments, user_id),
        category: category_id_from_imported_category_id(row['category_id']),
        user_id: user_id,
        created_at: row['created_at'],
        closed: row['closed'] == 1,
        post_create_action: add_permalink
      }
    end
  end
end
|
|
|
|
# Creates a post for every answer/comment stored in the intermediate database.
#
# Rows are processed in batches; a batch is skipped when all of its ids are
# already known as imported. A row whose parent topic cannot be resolved is
# reported on STDERR and skipped.
def import_posts
  puts "", "creating posts"
  total_count = @db.count_posts
  last_row_id = 0

  batches do |offset|
    rows, last_row_id = @db.fetch_posts(last_row_id)
    break if rows.empty?

    next if all_records_exist?(:posts, rows.map { |row| row['id'] })

    create_posts(rows, total: total_count, offset: offset) do |row|
      topic = topic_lookup_from_imported_post_id(row['topic_id'])

      # Without a parent topic there is nowhere to attach the post. Previously
      # this raised NoMethodError on nil[:topic_id] and aborted the import.
      # NOTE(review): relies on create_posts skipping rows whose block yields
      # nil - confirm against ImportScripts::Base.
      unless topic
        STDERR.puts "Could not find topic #{row['topic_id'].inspect} for post: #{row['id']}"
        next
      end

      # Attachments are only looked up when the row says it has any.
      attachments = @db.fetch_post_attachments(row['id']) if row['upload_count'] > 0
      user_id = user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id

      {
        id: row['id'],
        raw: raw_with_attachments(row['raw'], attachments, user_id),
        user_id: user_id,
        topic_id: topic[:topic_id],
        created_at: row['created_at']
      }
    end
  end
end
|
|
|
|
# Builds the final post text: embedded images and question links are
# rewritten, the HTML is converted with ReverseMarkdown, and every attachment
# that was not already embedded is uploaded and appended on its own line.
#
# raw         - HTML text of the question/answer.
# attachments - rows responding to ['path'], or nil when there are none.
# user_id     - id of the user the uploads are created for.
#
# Returns the converted text as a String.
def raw_with_attachments(raw, attachments, user_id)
  raw, embedded_paths, upload_ids = replace_embedded_attachments(raw, user_id)
  raw = replace_question_links(raw)

  # The text is appended to below, so it must be mutable. This file enables
  # frozen string literals, hence a bare "" fallback would raise FrozenError
  # on the first `<<`. Unary plus returns the string itself when it is not
  # frozen and an unfrozen copy otherwise.
  raw = +(ReverseMarkdown.convert(raw) || "")

  attachments&.each do |attachment|
    path = attachment['path']
    next if embedded_paths.include?(path)

    if File.exist?(path)
      filename = File.basename(path)
      upload = @uploader.create_upload(user_id, path, filename)

      # Skip uploads that were already inserted as an embedded image.
      if upload.present? && upload.persisted? && !upload_ids.include?(upload.id)
        raw << "\n" << @uploader.html_for_upload(upload, filename)
      end
    else
      STDERR.puts "Could not find file: #{path}"
    end
  end

  raw
end
|
|
|
|
# Replaces every embedded image (see EMBEDDED_IMAGE_REGEX) with the HTML of
# the corresponding upload.
#
# raw     - HTML text of the question/answer.
# user_id - id of the user the uploads are created for.
#
# Returns [raw, paths, upload_ids]: the rewritten text, the local paths of
# all images that were uploaded, and the ids of those uploads. An image that
# cannot be found or uploaded is removed from the text.
def replace_embedded_attachments(raw, user_id)
  paths = []
  upload_ids = []

  raw = raw.gsub(EMBEDDED_IMAGE_REGEX) do
    original_path = File.join(@path, Regexp.last_match['path'])
    filename = File.basename(original_path)
    path = find_image_path(filename)

    if path.nil?
      # Report the path from the link; the lookup result is nil here.
      STDERR.puts "Could not find file: #{original_path}"
      next ""
    end

    upload = @uploader.create_upload(user_id, path, filename)
    next "" unless upload.present? && upload.persisted?

    paths << path
    upload_ids << upload.id
    @uploader.html_for_upload(upload, filename)
  end

  [raw, paths, upload_ids]
end

# Looks for filename in the question and answer image directories (in that
# order).
#
# Returns the path of the first existing file, or nil when there is none.
# (Array#each returns its receiver, so falling out of an `each` loop would
# hand a truthy Array back to the caller instead of nil.)
def find_image_path(filename)
  [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY]
    .map { |directory| File.join(@path, directory, filename) }
    .find { |path| File.exist?(path) }
end
|
|
|
|
# Rewrites links to questions on OLD_DOMAIN so that they point at the
# imported topics on NEW_DOMAIN.
#
# raw - HTML text of the question/answer.
#
# Returns the rewritten text. A link whose question cannot be resolved is
# kept unchanged. When the link text itself contains OLD_DOMAIN, the whole
# link is replaced by the bare new URL.
def replace_question_links(raw)
  raw.gsub(QUESTION_LINK_REGEX) do
    match = Regexp.last_match
    topic = topic_lookup_from_imported_post_id(match[:id])

    # `next` supplies the replacement for this one match. A `return` here
    # would leave the whole method and hand back only the matched link,
    # dropping the rest of the text.
    next match.to_s unless topic

    url = File.join(NEW_DOMAIN, topic[:url])
    text = match[:text]
    text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
  end
end
|
|
|
|
# Makes sure TOPIC_LINK_NORMALIZATION is part of the "|"-separated
# permalink_normalizations site setting.
def add_permalink_normalizations
  current = SiteSetting.permalink_normalizations
  normalizations = current.blank? ? [] : current.split('|')

  add_normalization(normalizations, TOPIC_LINK_NORMALIZATION)

  SiteSetting.permalink_normalizations = normalizations.join('|')
end
|
|
|
|
# Appends normalization to the normalizations array unless it is already
# included.
#
# Returns the array after appending, or nil when nothing was added.
def add_normalization(normalizations, normalization)
  return if normalizations.include?(normalization)

  normalizations << normalization
end
|
|
|
|
# Tells whether a permalink with the given url exists already.
#
# Returns true or false. `exists?` is used instead of `find_by` so that no
# Permalink record is loaded just to test for presence.
def permalink_exists?(url)
  Permalink.exists?(url: url)
end
|
|
|
|
# Reads <table_name>.csv from the export directory and yields each row.
#
# The first line is treated as the header and its names are converted to
# symbols, blank lines are skipped, and the file is read as UTF-8 with an
# optional byte order mark.
def csv_parse(table_name)
  csv_path = File.join(@path, "#{table_name}.csv")
  options = {
    headers: true,
    header_converters: :symbol,
    skip_blanks: true,
    encoding: 'bom|utf-8'
  }

  CSV.foreach(csv_path, **options) { |row| yield row }
end
|
|
end
|
|
|
|
# Script entry point: the first argument must be an existing directory
# containing the export; otherwise the usage is printed and the script
# exits with status 1.
import_directory = ARGV[0]

if import_directory.nil? || !Dir.exist?(import_directory)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
  exit 1
end

ImportScripts::Answerbase.new(import_directory).perform
|