discourse/script/import_scripts/askbot.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

278 lines
7.4 KiB
Ruby
Raw Normal View History

# frozen_string_literal: true
2015-03-07 22:48:42 +08:00
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require "pg"
class ImportScripts::MyAskBot < ImportScripts::Base
# CHANGE THESE BEFORE RUNNING THE IMPORTER
BATCH_SIZE = 1000
OLD_SITE = "ask.cvxr.com"
DB_NAME = "cvxforum"
DB_USER = "cvxforum"
DB_PORT = 5432
DB_HOST = "ask.cvxr.com"
DB_PASS = "yeah, right"
# A list of categories to create. Any post with one of these tags will be
# assigned to that category. Ties are broken by list order.
2015-03-07 22:48:42 +08:00
CATEGORIES = %w[Nonconvex TFOCS MIDCP FAQ]
def initialize
super
@thread_parents = {}
@tagmap = []
@td = PG::TextDecoder::TimestampWithTimeZone.new
@client =
PG.connect(dbname: DB_NAME, host: DB_HOST, port: DB_PORT, user: DB_USER, password: DB_PASS)
end
def execute
create_cats
import_users
read_tags
import_posts
import_replies
post_process_posts
end
def create_cats
puts "", "creating categories"
CATEGORIES.each do |cat|
unless Category.where("LOWER(name) = ?", cat.downcase).first
Category.new(name: cat, user_id: -1).save!
end
end
end
def read_tags
puts "", "reading thread tags..."
tag_count = @client.exec(<<-SQL)[0]["count"]
SELECT COUNT(A.id)
FROM askbot_thread_tags A
JOIN tag B
ON A.tag_id = B.id
WHERE A.tag_id > 0
SQL
tags_done = 0
batches(BATCH_SIZE) do |offset|
tags = @client.exec(<<-SQL)
SELECT A.thread_id, B.name
FROM askbot_thread_tags A
JOIN tag B
ON A.tag_id = B.id
WHERE A.tag_id > 0
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if tags.ntuples() < 1
tags.each do |tag|
tid = tag["thread_id"].to_i
tnm = tag["name"].downcase
if @tagmap[tid]
@tagmap[tid].push(tnm)
else
@tagmap[tid] = [tnm]
end
tags_done += 1
print_status tags_done, tag_count
end
end
end
def import_users
puts "", "importing users"
total_count = @client.exec(<<-SQL)[0]["count"]
SELECT COUNT(id)
FROM auth_user
SQL
batches(BATCH_SIZE) do |offset|
users = @client.query(<<-SQL)
SELECT id, username, email, is_staff, date_joined, last_seen, real_name, website, location, about
FROM auth_user
ORDER BY date_joined
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if users.ntuples() < 1
next if all_records_exist? :users, users.map { |u| u["id"].to_i }
2015-03-07 22:48:42 +08:00
create_users(users, total: total_count, offset: offset) do |user|
{
2018-12-04 17:48:16 +08:00
id: user["id"],
username: user["username"],
email: user["email"] || fake_email,
2018-12-04 17:48:16 +08:00
admin: user["is_staff"],
created_at: Time.zone.at(@td.decode(user["date_joined"])),
2015-03-07 22:48:42 +08:00
last_seen_at: Time.zone.at(@td.decode(user["last_seen"])),
2018-12-04 17:48:16 +08:00
name: user["real_name"],
website: user["website"],
location: user["location"],
2015-03-07 22:48:42 +08:00
}
end
end
end
def import_posts
puts "", "importing questions..."
post_count = @client.exec(<<-SQL)[0]["count"]
SELECT COUNT(A.id)
FROM askbot_post A
JOIN askbot_thread B
ON A.thread_id = B.id
WHERE NOT B.closed AND A.post_type='question'
SQL
batches(BATCH_SIZE) do |offset|
posts = @client.exec(<<-SQL)
SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title
FROM askbot_post A
JOIN askbot_thread B
ON A.thread_id = B.id
WHERE NOT B.closed AND A.post_type = 'question'
ORDER BY A.added_at
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if posts.ntuples() < 1
next if all_records_exist? :posts, posts.map { |p| p["id"].to_i }
2015-03-07 22:48:42 +08:00
create_posts(posts, total: post_count, offset: offset) do |post|
pid = post["id"]
tid = post["thread_id"].to_i
tags = @tagmap[tid]
cat = nil
if tags
CATEGORIES.each do |cname|
next if tags.exclude?(cname.downcase)
2015-03-07 22:48:42 +08:00
cat = cname
break
end
end
@thread_parents[tid] = pid
{
id: pid,
title: post["title"],
category: cat,
custom_fields: {
import_id: pid,
import_thread_id: tid,
import_tags: tags,
},
user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID,
created_at: Time.zone.at(@td.decode(post["added_at"])),
raw: post["text"],
}
end
end
end
def import_replies
puts "", "importing answers and comments..."
post_count = @client.exec(<<-SQL)[0]["count"]
SELECT COUNT(A.id)
FROM askbot_post A
JOIN askbot_thread B
ON A.thread_id = B.id
WHERE NOT B.closed AND A.post_type<>'question'
SQL
batches(BATCH_SIZE) do |offset|
posts = @client.exec(<<-SQL)
SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title
FROM askbot_post A
JOIN askbot_thread B
ON A.thread_id = B.id
WHERE NOT B.closed AND A.post_type <> 'question'
ORDER BY A.added_at
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if posts.ntuples() < 1
next if all_records_exist? :posts, posts.map { |p| p["id"].to_i }
2015-03-07 22:48:42 +08:00
create_posts(posts, total: post_count, offset: offset) do |post|
tid = post["thread_id"].to_i
next unless thread = @thread_parents[tid]
next unless parent = topic_lookup_from_imported_post_id(thread)
pid = post["id"]
{
id: pid,
topic_id: parent[:topic_id],
custom_fields: {
import_id: pid,
},
user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID,
created_at: Time.zone.at(@td.decode(post["added_at"])),
raw: post["text"],
}
end
end
end
def post_process_posts
puts "", "Postprocessing posts..."
current = 0
max = Post.count
# Rewrite internal links; e.g.
# ask.cvxr.com/question/(\d+)/[^'"}]*
# I am sure this is incomplete, but we didn't make heavy use of internal
# links on our site.
tmp = Regexp.quote("http://#{OLD_SITE}")
2015-03-07 22:48:42 +08:00
r1 = %r{"(#{tmp})?/question/(\d+)/[a-zA-Z-]*/?"}
r2 = %r{\((#{tmp})?/question/(\d+)/[a-zA-Z-]*/?\)}
r3 = %r{<?#tmp/question/(\d+)/[a-zA-Z-]*/?>?}
Post.find_each do |post|
raw =
post
.raw
.gsub(r1) do
if topic = topic_lookup_from_imported_post_id($2)
"\"#{topic[:url]}\""
else
$&
2017-07-28 09:20:09 +08:00
end
end
2015-03-07 22:48:42 +08:00
raw =
raw.gsub(r2) do
if topic = topic_lookup_from_imported_post_id($2)
"(#{topic[:url]})"
else
$&
end
2015-03-07 22:48:42 +08:00
end
raw =
raw.gsub(r3) do
if topic = topic_lookup_from_imported_post_id($1)
trec = Topic.find_by(id: topic[:topic_id])
"[#{trec.title}](#{topic[:url]})"
else
2017-07-28 09:20:09 +08:00
$&
end
2015-03-07 22:48:42 +08:00
end
2017-07-28 09:20:09 +08:00
2015-03-07 22:48:42 +08:00
if raw != post.raw
post.raw = raw
2017-07-28 09:20:09 +08:00
post.save
end
2015-03-07 22:48:42 +08:00
print_status(current += 1, max)
end
end
2017-07-28 09:20:09 +08:00
end
2015-03-07 22:48:42 +08:00
ImportScripts::MyAskBot.new.perform