discourse/script/import_scripts/askbot.rb

# frozen_string_literal: true

require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require "pg"

class ImportScripts::MyAskBot < ImportScripts::Base
  # CHANGE THESE BEFORE RUNNING THE IMPORTER

  BATCH_SIZE = 1000

  OLD_SITE = "ask.cvxr.com"
  DB_NAME = "cvxforum"
  DB_USER = "cvxforum"
  DB_PORT = 5432
  DB_HOST = "ask.cvxr.com"
  DB_PASS = "yeah, right"

  # A list of categories to create. Any post with one of these tags will be
  # assigned to that category. Ties are broken by list order.
  CATEGORIES = %w[Nonconvex TFOCS MIDCP FAQ]

  def initialize
    super

    @thread_parents = {}
    @tagmap = []
    @td = PG::TextDecoder::TimestampWithTimeZone.new
    @client =
      PG.connect(dbname: DB_NAME, host: DB_HOST, port: DB_PORT, user: DB_USER, password: DB_PASS)
  end

  def execute
    create_cats
    import_users
    read_tags
    import_posts
    import_replies
    post_process_posts
  end

  def create_cats
    puts "", "creating categories"
    CATEGORIES.each do |cat|
      unless Category.where("LOWER(name) = ?", cat.downcase).first
        Category.new(name: cat, user_id: -1).save!
      end
    end
  end

  def read_tags
    puts "", "reading thread tags..."

    tag_count = @client.exec(<<-SQL)[0]["count"]
          SELECT COUNT(A.id)
          FROM askbot_thread_tags A
          JOIN tag B
          ON A.tag_id = B.id
          WHERE A.tag_id > 0
      SQL

    tags_done = 0
    batches(BATCH_SIZE) do |offset|
      tags = @client.exec(<<-SQL)
        SELECT A.thread_id, B.name
        FROM askbot_thread_tags A
        JOIN tag B
        ON A.tag_id = B.id
        WHERE A.tag_id > 0
        LIMIT #{BATCH_SIZE}
        OFFSET #{offset}
      SQL
      break if tags.ntuples() < 1
      tags.each do |tag|
        tid = tag["thread_id"].to_i
        tnm = tag["name"].downcase
        if @tagmap[tid]
          @tagmap[tid].push(tnm)
        else
          @tagmap[tid] = [tnm]
        end
        tags_done += 1
        print_status tags_done, tag_count
      end
    end
  end

  def import_users
    puts "", "importing users"

    total_count = @client.exec(<<-SQL)[0]["count"]
          SELECT COUNT(id)
            FROM auth_user
      SQL

    batches(BATCH_SIZE) do |offset|
      users = @client.query(<<-SQL)
          SELECT id, username, email, is_staff, date_joined, last_seen, real_name, website, location, about
            FROM auth_user
        ORDER BY date_joined
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if users.ntuples() < 1

      next if all_records_exist? :users, users.map { |u| u["id"].to_i }

      create_users(users, total: total_count, offset: offset) do |user|
        {
          id: user["id"],
          username: user["username"],
          email: user["email"] || fake_email,
          admin: user["is_staff"],
          created_at: Time.zone.at(@td.decode(user["date_joined"])),
          last_seen_at: Time.zone.at(@td.decode(user["last_seen"])),
          name: user["real_name"],
          website: user["website"],
          location: user["location"],
        }
      end
    end
  end

  def import_posts
    puts "", "importing questions..."

    post_count = @client.exec(<<-SQL)[0]["count"]
          SELECT COUNT(A.id)
            FROM askbot_post A
            JOIN askbot_thread B
              ON A.thread_id = B.id
           WHERE NOT B.closed AND A.post_type='question'
      SQL

    batches(BATCH_SIZE) do |offset|
      posts = @client.exec(<<-SQL)
          SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title
            FROM askbot_post A
            JOIN askbot_thread B
              ON A.thread_id = B.id
           WHERE NOT B.closed AND A.post_type = 'question'
        ORDER BY A.added_at
          LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if posts.ntuples() < 1

      next if all_records_exist? :posts, posts.map { |p| p["id"].to_i }

      create_posts(posts, total: post_count, offset: offset) do |post|
        pid = post["id"]
        tid = post["thread_id"].to_i
        tags = @tagmap[tid]
        cat = nil
        if tags
          CATEGORIES.each do |cname|
            next if tags.exclude?(cname.downcase)
            cat = cname
            break
          end
        end
        @thread_parents[tid] = pid
        {
          id: pid,
          title: post["title"],
          category: cat,
          custom_fields: {
            import_id: pid,
            import_thread_id: tid,
            import_tags: tags,
          },
          user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID,
          created_at: Time.zone.at(@td.decode(post["added_at"])),
          raw: post["text"],
        }
      end
    end
  end

  def import_replies
    puts "", "importing answers and comments..."

    post_count = @client.exec(<<-SQL)[0]["count"]
          SELECT COUNT(A.id)
            FROM askbot_post A
            JOIN askbot_thread B
              ON A.thread_id = B.id
           WHERE NOT B.closed AND A.post_type<>'question'
      SQL

    batches(BATCH_SIZE) do |offset|
      posts = @client.exec(<<-SQL)
          SELECT A.id, A.author_id, A.added_at, A.text, A.thread_id, B.title
            FROM askbot_post A
            JOIN askbot_thread B
              ON A.thread_id = B.id
           WHERE NOT B.closed AND A.post_type <> 'question'
        ORDER BY A.added_at
          LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if posts.ntuples() < 1

      next if all_records_exist? :posts, posts.map { |p| p["id"].to_i }

      create_posts(posts, total: post_count, offset: offset) do |post|
        tid = post["thread_id"].to_i
        next unless thread = @thread_parents[tid]
        next unless parent = topic_lookup_from_imported_post_id(thread)
        pid = post["id"]
        {
          id: pid,
          topic_id: parent[:topic_id],
          custom_fields: {
            import_id: pid,
          },
          user_id: user_id_from_imported_user_id(post["author_id"]) || Discourse::SYSTEM_USER_ID,
          created_at: Time.zone.at(@td.decode(post["added_at"])),
          raw: post["text"],
        }
      end
    end
  end

  def post_process_posts
    puts "", "Postprocessing posts..."
    current = 0
    max = Post.count
    # Rewrite internal links; e.g.
    # ask.cvxr.com/question/(\d+)/[^'"}]*
    # I am sure this is incomplete, but we didn't make heavy use of internal
    # links on our site.
    tmp = Regexp.quote("http://#{OLD_SITE}")
    r1 = %r{"(#{tmp})?/question/(\d+)/[a-zA-Z-]*/?"}
    r2 = %r{\((#{tmp})?/question/(\d+)/[a-zA-Z-]*/?\)}
    r3 = %r{<?#tmp/question/(\d+)/[a-zA-Z-]*/?>?}
    Post.find_each do |post|
      raw =
        post
          .raw
          .gsub(r1) do
            if topic = topic_lookup_from_imported_post_id($2)
              "\"#{topic[:url]}\""
            else
              $&
            end
          end
      raw =
        raw.gsub(r2) do
          if topic = topic_lookup_from_imported_post_id($2)
            "(#{topic[:url]})"
          else
            $&
          end
        end
      raw =
        raw.gsub(r3) do
          if topic = topic_lookup_from_imported_post_id($1)
            trec = Topic.find_by(id: topic[:topic_id])
            "[#{trec.title}](#{topic[:url]})"
          else
            $&
          end
        end

      if raw != post.raw
        post.raw = raw
        post.save
      end
      print_status(current += 1, max)
    end
  end
end

ImportScripts::MyAskBot.new.perform