From e03a1f3532ea29d45773105cd8e1f5affec241d3 Mon Sep 17 00:00:00 2001 From: "Michael C. Grant" Date: Sat, 7 Mar 2015 08:48:42 -0600 Subject: [PATCH] Initial commit of askbot.rb --- script/import_scripts/askbot.rb | 270 ++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 script/import_scripts/askbot.rb diff --git a/script/import_scripts/askbot.rb b/script/import_scripts/askbot.rb new file mode 100644 index 00000000000..c6af92b4eae --- /dev/null +++ b/script/import_scripts/askbot.rb @@ -0,0 +1,270 @@ +require File.expand_path(File.dirname(__FILE__) + "/base.rb") +require 'pg' + +class ImportScripts::MyAskBot < ImportScripts::Base + # CHANGE THESE BEFORE RUNNING THE IMPORTER + + BATCH_SIZE = 1000 + + OLD_SITE = "ask.cvxr.com" + DB_NAME = "cvxforum" + DB_USER = "cvxforum" + DB_PORT = 5432 + DB_HOST = "ask.cvxr.com" + DB_PASS = 'yeah, right' + + # A list of categories to create. Any post with one of these tags will be + # assigned to that category. Ties are broken by list orer. + CATEGORIES = [ 'Nonconvex', 'TFOCS', 'MIDCP', 'FAQ' ] + + def initialize + super + + @thread_parents = {} + @tagmap = [] + @td = PG::TextDecoder::TimestampWithTimeZone.new + @client = PG.connect( + :dbname => DB_NAME, + :host => DB_HOST, + :port => DB_PORT, + :user => DB_USER, + :password => DB_PASS + ) + end + + def execute + create_cats + import_users + read_tags + import_posts + import_replies + post_process_posts + end + + def create_cats + puts "", "creating categories" + CATEGORIES.each do |cat| + unless Category.where("LOWER(name) = ?", cat.downcase).first + Category.new(name: cat, user_id: -1).save! + end + end + end + + def read_tags + puts "", "reading thread tags..." + + tag_count = @client.exec(<<-SQL + SELECT COUNT(A.id) + FROM askbot_thread_tags A + JOIN tag B + ON A.tag_id = B.id + WHERE A.tag_id > 0 + SQL + )[0]["count"] + + tags_done = 0 + batches(BATCH_SIZE) do |offset| + tags = @client.exec(<<-SQL + SELECT A.thread_id, B.name + FROM askbot_thread_tags A + JOIN tag B + ON A.tag_id = B.id + WHERE A.tag_id > 0 + LIMIT #{BATCH_SIZE} + OFFSET #{offset} + SQL + ) + break if tags.ntuples() < 1 + tags.each do |tag| + tid = tag["thread_id"].to_i + tnm = tag["name"].downcase + if @tagmap[tid] + @tagmap[tid].push( tnm ) + else + @tagmap[tid] = [ tnm ] + end + tags_done += 1 + print_status tags_done, tag_count + end + end + end + + def import_users + puts "", "importing users" + + total_count = @client.exec(<<-SQL + SELECT COUNT(id) + FROM auth_user + SQL + )[0]["count"] + + batches(BATCH_SIZE) do |offset| + users = @client.query(<<-SQL + SELECT id, username, email, is_staff, date_joined, last_seen, real_name, website, location, about + FROM auth_user + ORDER BY date_joined + LIMIT #{BATCH_SIZE} + OFFSET #{offset} + SQL + ) + + break if users.ntuples() < 1 + + create_users(users, total: total_count, offset: offset) do |user| + { + id: user["id"], + username: user["username"], + email: user["email"] || (SecureRandom.hex << "@domain.com"), + admin: user["is_staff"], + created_at: Time.zone.at(@td.decode(user["date_joined"])), + last_seen_at: Time.zone.at(@td.decode(user["last_seen"])), + name: user["real_name"], + website: user["website"], + location: user["location"], + } + end + end + end + + def import_posts + puts "", "importing questions..." + + post_count = @client.exec(<<-SQL + SELECT COUNT(A.id) + FROM askbot_post A + JOIN askbot_thread B + ON A.thread_id = B.id + WHERE NOT B.closed AND A.post_type='question' + SQL + )[0]["count"] + + batches(BATCH_SIZE) do |offset| + posts = @client.exec(<<-SQL + SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title + FROM askbot_post A + JOIN askbot_thread B + ON A.thread_id = B.id + WHERE NOT B.closed AND A.post_type = 'question' + ORDER BY A.added_at + LIMIT #{BATCH_SIZE} + OFFSET #{offset} + SQL + ) + + break if posts.ntuples() < 1 + + create_posts(posts, total: post_count, offset: offset) do |post| + pid = post["id"] + tid = post["thread_id"].to_i + tags = @tagmap[tid] + cat = nil + if tags + CATEGORIES.each do |cname| + next unless tags.include?(cname.downcase) + cat = cname + break + end + end + @thread_parents[tid] = pid + { + id: pid, + title: post["title"], + category: cat, + custom_fields: {import_id: pid, import_thread_id: tid, import_tags: tags}, + user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID, + created_at: Time.zone.at(@td.decode(post["added_at"])), + raw: post["text"], + } + end + end + end + + def import_replies + puts "", "importing answers and comments..." + + post_count = @client.exec(<<-SQL + SELECT COUNT(A.id) + FROM askbot_post A + JOIN askbot_thread B + ON A.thread_id = B.id + WHERE NOT B.closed AND A.post_type<>'question' + SQL + )[0]["count"] + + batches(BATCH_SIZE) do |offset| + posts = @client.exec(<<-SQL + SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title + FROM askbot_post A + JOIN askbot_thread B + ON A.thread_id = B.id + WHERE NOT B.closed AND A.post_type <> 'question' + ORDER BY A.added_at + LIMIT #{BATCH_SIZE} + OFFSET #{offset} + SQL + ) + + break if posts.ntuples() < 1 + + create_posts(posts, total: post_count, offset: offset) do |post| + tid = post["thread_id"].to_i + next unless thread = @thread_parents[tid] + next unless parent = topic_lookup_from_imported_post_id(thread) + pid = post["id"] + { + id: pid, + topic_id: parent[:topic_id], + custom_fields: {import_id: pid}, + user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID, + created_at: Time.zone.at(@td.decode(post["added_at"])), + raw: post["text"] + } + end + end + end + + def post_process_posts + puts "", "Postprocessing posts..." + current = 0 + max = Post.count + # Rewrite internal links; e.g. + # ask.cvxr.com/question/(\d+)/[^'"}]* + # I am sure this is incomplete, but we didn't make heavy use of internal + # links on our site. + tmp = Regexp.quote("http://" << OLD_SITE) + r1 = /"(#{tmp})?\/question\/(\d+)\/[a-zA-Z-]*\/?"/ + r2 = /\((#{tmp})?\/question\/(\d+)\/[a-zA-Z-]*\/?\)/ + r3 = /?/ + Post.find_each do |post| + raw = post.raw.gsub(r1) do + if topic = topic_lookup_from_imported_post_id($2) + "\"#{topic[:url]}\"" + else + $& + end + end + raw = raw.gsub(r2) do + if topic = topic_lookup_from_imported_post_id($2) + "(#{topic[:url]})" + else + $& + end + end + raw = raw.gsub(r3) do + if topic = topic_lookup_from_imported_post_id($1) + trec = Topic.find_by(id: topic[:topic_id]) + "[#{trec.title}](#{topic[:url]})" + else + $& + end + end + if raw != post.raw + post.raw = raw + post.save + end + print_status(current += 1, max) + end + end + end + +ImportScripts::MyAskBot.new.perform