discourse/script/import_scripts/answerhub.rb

# frozen_string_literal: true

# AnswerHub Importer
#
# Based on having access to a mysql dump.
# Pass in the ENV variables listed below before running the script.

require_relative "base"
require "mysql2"
require "open-uri"

class ImportScripts::AnswerHub < ImportScripts::Base
  DB_NAME = ENV["DB_NAME"] || "answerhub"
  DB_PASS = ENV["DB_PASS"] || "answerhub"
  DB_USER = ENV["DB_USER"] || "answerhub"
  TABLE_PREFIX = ENV["TABLE_PREFIX"] || "network1"
  BATCH_SIZE = ENV["BATCH_SIZE"].to_i || 1000
  ATTACHMENT_DIR = ENV["ATTACHMENT_DIR"] || ""
  PROCESS_UPLOADS = ENV["PROCESS_UPLOADS"].to_i || 0
  ANSWERHUB_DOMAIN = ENV["ANSWERHUB_DOMAIN"]
  AVATAR_DIR = ENV["AVATAR_DIR"] || ""
  SITE_ID = ENV["SITE_ID"].to_i || 0
  CATEGORY_MAP_FROM = ENV["CATEGORY_MAP_FROM"].to_i || 0
  CATEGORY_MAP_TO = ENV["CATEGORY_MAP_TO"].to_i || 0
  SCRAPE_AVATARS = ENV["SCRAPE_AVATARS"].to_i || 0

  def initialize
    super
    @client =
      Mysql2::Client.new(host: "localhost", username: DB_USER, password: DB_PASS, database: DB_NAME)
    @skip_updates = true
    SiteSetting.tagging_enabled = true
    SiteSetting.max_tags_per_topic = 10
  end

  def execute
    puts "Now starting the AnswerHub Import"
    puts "DB Name: #{DB_NAME}"
    puts "Table Prefix: #{TABLE_PREFIX}"
    puts
    import_users
    import_categories
    import_topics
    import_posts
    import_groups
    add_users_to_groups
    add_moderators
    add_admins
    import_avatars
    create_permalinks
  end

  def import_users
    puts "", "creating users"

    query =
      "SELECT count(*) count
       FROM #{TABLE_PREFIX}_authoritables
       WHERE c_type = 'user'
       AND c_active = 1
       AND c_system <> 1;"
    total_count = @client.query(query).first["count"]
    puts "Total count: #{total_count}"
    @last_user_id = -1

    batches(BATCH_SIZE) do |offset|
      query =
        "SELECT c_id, c_creation_date, c_name, c_primaryEmail, c_last_seen, c_description
      FROM #{TABLE_PREFIX}_authoritables
      WHERE c_type = 'user'
      AND c_active = 1
      AND c_system <> 1
      AND c_id > #{@last_user_id}
      LIMIT #{BATCH_SIZE};"

      results = @client.query(query)
      break if results.size < 1
      @last_user_id = results.to_a.last["c_id"]

      create_users(results, total: total_count, offset: offset) do |user|
        # puts user['c_id'].to_s + ' ' + user['c_name']
        next if @lookup.user_id_from_imported_user_id(user["c_id"])
        {
          id: user["c_id"],
          email: "#{SecureRandom.hex}@invalid.invalid",
          username: user["c_name"],
          created_at: user["c_creation_date"],
          bio_raw: user["c_description"],
          last_seen_at: user["c_last_seen"],
        }
      end
    end
  end

  def import_categories
    puts "", "importing categories..."

    # Import parent categories first
    query =
      "SELECT c_id, c_name, c_plug, c_parent
    FROM containers
    WHERE c_type = 'space'
    AND c_active = 1
    AND c_parent = 7 OR c_parent IS NULL"
    results = @client.query(query)

    create_categories(results) do |c|
      { id: c["c_id"], name: c["c_name"], parent_category_id: check_parent_id(c["c_parent"]) }
    end

    # Import sub-categories
    query =
      "SELECT c_id, c_name, c_plug, c_parent
    FROM containers
    WHERE c_type = 'space'
    AND c_active = 1
    AND c_parent != 7 AND c_parent IS NOT NULL"
    results = @client.query(query)

    create_categories(results) do |c|
      # puts c.inspect
      {
        id: c["c_id"],
        name: c["c_name"],
        parent_category_id: category_id_from_imported_category_id(check_parent_id(c["c_parent"])),
      }
    end
  end

  def import_topics
    puts "", "importing topics..."

    count_query =
      "SELECT count(*) count
       FROM #{TABLE_PREFIX}_nodes
       WHERE c_visibility <> 'deleted'
       AND (c_type = 'question'
         OR c_type = 'kbentry');"
    total_count = @client.query(count_query).first["count"]

    @last_topic_id = -1

    batches(BATCH_SIZE) do |offset|
      # Let's start with just question types
      query =
        "SELECT *
         FROM #{TABLE_PREFIX}_nodes
         WHERE c_id > #{@last_topic_id}
         AND c_visibility <> 'deleted'
         AND (c_type = 'question'
           OR c_type = 'kbentry')
         ORDER BY c_id ASC
         LIMIT #{BATCH_SIZE};"
      topics = @client.query(query)

      break if topics.size < 1
      @last_topic_id = topics.to_a.last["c_id"]

      create_posts(topics, total: total_count, offset: offset) do |t|
        user_id = user_id_from_imported_user_id(t["c_author"]) || Discourse::SYSTEM_USER_ID
        body = process_mentions(t["c_body"])
        body = process_uploads(body, user_id) if PROCESS_UPLOADS == 1
        markdown_body = HtmlToMarkdown.new(body).to_markdown
        {
          id: t["c_id"],
          user_id: user_id,
          title: t["c_title"],
          category: category_id_from_imported_category_id(t["c_primaryContainer"]),
          raw: markdown_body,
          created_at: t["c_creation_date"],
          post_create_action:
            proc do |post|
              tag_names = t["c_topic_names"].split(",")
              DiscourseTagging.tag_topic_by_names(post.topic, staff_guardian, tag_names)
            end,
        }
      end
    end
  end

  def import_posts
    puts "", "importing posts..."

    count_query =
      "SELECT count(*) count
       FROM #{TABLE_PREFIX}_nodes
       WHERE c_visibility <> 'deleted'
       AND (c_type = 'answer'
         OR c_type = 'comment'
         OR c_type = 'kbentry');"
    total_count = @client.query(count_query).first["count"]

    @last_post_id = -1

    batches(BATCH_SIZE) do |offset|
      query =
        "SELECT *
         FROM #{TABLE_PREFIX}_nodes
         WHERE c_id > #{@last_post_id}
         AND c_visibility <> 'deleted'
         AND (c_type = 'answer'
           OR c_type = 'comment'
           OR c_type = 'kbentry')
         ORDER BY c_id ASC
         LIMIT #{BATCH_SIZE};"
      posts = @client.query(query)
      next if all_records_exist? :posts, posts.map { |p| p["c_id"] }

      break if posts.size < 1
      @last_post_id = posts.to_a.last["c_id"]

      create_posts(posts, total: total_count, offset: offset) do |p|
        t = topic_lookup_from_imported_post_id(p["c_originalParent"])
        next unless t

        reply_to_post_id = post_id_from_imported_post_id(p["c_parent"])
        reply_to_post = reply_to_post_id.present? ? Post.find(reply_to_post_id) : nil
        reply_to_post_number = reply_to_post.present? ? reply_to_post.post_number : nil

        user_id = user_id_from_imported_user_id(p["c_author"]) || Discourse::SYSTEM_USER_ID

        body = process_mentions(p["c_body"])
        body = process_uploads(body, user_id) if PROCESS_UPLOADS == 1

        markdown_body = HtmlToMarkdown.new(body).to_markdown
        {
          id: p["c_id"],
          user_id: user_id,
          topic_id: t[:topic_id],
          reply_to_post_number: reply_to_post_number,
          raw: markdown_body,
          created_at: p["c_creation_date"],
          post_create_action:
            proc do |post_info|
              begin
                if p["c_type"] == "answer" && p["c_marked"] == 1
                  post = Post.find(post_info[:id])
                  if post
                    user_id =
                      user_id_from_imported_user_id(p["c_author"]) || Discourse::SYSTEM_USER_ID
                    current_user = User.find(user_id)
                    solved = DiscourseSolved.accept_answer!(post, current_user)
                    # puts "SOLVED: #{solved}"
                  end
                end
              rescue ActiveRecord::RecordInvalid
                puts "SOLVED: Skipped post_id: #{post.id} because invalid"
              end
            end,
        }
      end
    end
  end

  def import_groups
    puts "", "importing groups..."

    query =
      "SELECT c_id, c_name
       FROM network6_authoritables
       WHERE c_type='group'
       AND c_id > 6;" # Ignore Anonymous, Users, Moderators, etc.
    groups = @client.query(query)

    create_groups(groups) do |group|
      { id: group["c_id"], name: group["c_name"], visibility_level: 1 }
    end
  end

  def add_users_to_groups
    puts "", "adding users to groups..."

    query =
      "SELECT c_id, c_name
       FROM network6_authoritables
       WHERE c_type='group'
       AND c_id > 6;" # Ignore Anonymous, Users, Moderators, etc.
    groups = @client.query(query)

    members_query =
      "SELECT *
       FROM network6_authoritable_groups;"
    group_members = @client.query(members_query)

    total_count = groups.count
    progress_count = 0
    start_time = Time.now

    group_members.map
    groups.each do |group|
      dgroup = find_group_by_import_id(group["c_id"])

      next if dgroup.custom_fields["import_users_added"]

      group_member_ids =
        group_members
          .map do |m|
            user_id_from_imported_user_id(m["c_members"]) if m["c_groups"] == group["c_id"]
          end
          .compact

      # add members
      dgroup.bulk_add(group_member_ids)

      # reload group
      dgroup.reload

      dgroup.custom_fields["import_users_added"] = true
      dgroup.save

      progress_count += 1
      print_status(progress_count, total_count, start_time)
    end
  end

  def add_moderators
    puts "", "adding moderators..."

    query =
      "SELECT *
       FROM network6_authoritable_groups
       WHERE c_groups = 4;"
    moderators = @client.query(query)

    moderator_ids = moderators.map { |m| user_id_from_imported_user_id(m["c_members"]) }.compact

    moderator_ids.each do |id|
      user = User.find(id)
      user.grant_moderation!
    end
  end

  def add_admins
    puts "", "adding admins..."

    query =
      "SELECT *
       FROM network6_authoritable_groups
       WHERE c_groups = 5 OR c_groups = 6;" # Super Users, Network Administrators
    admins = @client.query(query)

    admin_ids = admins.map { |a| user_id_from_imported_user_id(a["c_members"]) }.compact

    admin_ids.each do |id|
      user = User.find(id)
      user.grant_admin!
    end
  end

  def import_avatars
    puts "", "importing user avatars"
    query =
      "SELECT *
       FROM network6_user_preferences
       WHERE c_key = 'avatarImage'"
    avatars = @client.query(query)

    avatars.each do |a|
      begin
        user_id = user_id_from_imported_user_id(a["c_user"])
        user = User.find(user_id)
        if user
          filename = "avatar-#{user_id}.png"
          path = File.join(AVATAR_DIR, filename)
          next if !File.exist?(path)

          # Scrape Avatars - Avatars are saved in the db, but it might be easier to just scrape them
          if SCRAPE_AVATARS == 1
            File.open(path, "wb") do |f|
              f << open(
                "https://#{ANSWERHUB_DOMAIN}/forums/users/#{a["c_user"]}/photo/view.html?s=240",
              ).read
            end
          end

          upload = @uploader.create_upload(user.id, path, filename)

          if upload.persisted?
            user.import_mode = false
            user.create_user_avatar
            user.import_mode = true
            user.user_avatar.update(custom_upload_id: upload.id)
            user.update(uploaded_avatar_id: upload.id)
          else
            Rails.logger.error("Could not persist avatar for user #{user.username}")
          end
        end
      rescue ActiveRecord::RecordNotFound
        puts "Could not find User for user_id: #{a["c_user"]}"
      end
    end
  end

  def process_uploads(body, user_id)
    if body.match(/<img src="\/forums\/storage\/attachments\/[\w-]*.[a-z]{3,4}">/)
      # There could be multiple images in a post
      images = body.scan(/<img src="\/forums\/storage\/attachments\/[\w-]*.[a-z]{3,4}">/)

      images.each do |image|
        filepath = File.basename(image).split('"')[0]
        filepath = File.join(ATTACHMENT_DIR, filepath)

        if File.exist?(filepath)
          filename = File.basename(filepath)
          upload = create_upload(user_id, filepath, filename)
          image_html = html_for_upload(upload, filename)
          original_image_html = '<img src="/forums/storage/attachments/' + filename + '">'
          body.sub!(original_image_html, image_html)
        end
      end
    end
    # Non-images
    if body.match(/<a href="\/forums\/storage\/attachments\/[\w-]*.[a-z]{3,4}">/)
      # There could be multiple files in a post
      files = body.scan(/<a href="\/forums\/storage\/attachments\/[\w-]*.[a-z]{3,4}">/)

      files.each do |file|
        filepath = File.basename(file).split('"')[0]
        filepath = File.join(ATTACHMENT_DIR, filepath)

        if File.exist?(filepath)
          filename = File.basename(filepath)
          upload = create_upload(user_id, filepath, filename)
          file_html = html_for_upload(upload, filename)
          original_file_html = '<a href="/forums/storage/attachments/' + filename + '">'
          body.sub!(original_file_html, file_html)
        end
      end
    end

    body
  end

  def process_mentions(body)
    raw = body.dup

    # https://example.forum.com/forums/users/1469/XYZ_Rob.html
    raw.gsub!(%r{(https://example.forum.com/forums/users/\d+/[\w_%-.]*.html)}) do
      legacy_url = $1
      import_user_id =
        legacy_url.match(%r{https://example.forum.com/forums/users/(\d+)/[\w_%-.]*.html}).captures

      user = @lookup.find_user_by_import_id(import_user_id[0])
      if user.present?
        # puts "/users/#{user.username}"
        "/users/#{user.username}"
      else
        # puts legacy_url
        legacy_url
      end
    end

    # /forums/users/395/petrocket.html
    raw.gsub!(%r{(/forums/users/\d+/[\w_%-.]*.html)}) do
      legacy_url = $1
      import_user_id = legacy_url.match(%r{/forums/users/(\d+)/[\w_%-.]*.html}).captures

      # puts raw
      user = @lookup.find_user_by_import_id(import_user_id[0])
      if user.present?
        # puts "/users/#{user.username}"
        "/users/#{user.username}"
      else
        # puts legacy_url
        legacy_url
      end
    end

    raw
  end

  def create_permalinks
    puts "", "Creating redirects...", ""

    # https://example.forum.com/forums/questions/2005/missing-file.html
    Topic.find_each do |topic|
      pcf = topic.first_post.custom_fields
      if pcf && pcf["import_id"]
        id = pcf["import_id"]
        slug = Slug.for(topic.title)
        begin
          Permalink.create(url: "questions/#{id}/#{slug}.html", topic_id: topic.id)
        rescue StandardError
          nil
        end
        print "."
      end
    end
  end

  def staff_guardian
    @_staff_guardian ||= Guardian.new(Discourse.system_user)
  end

  # Some category parent id's need to be adjusted
  def check_parent_id(id)
    return nil if SITE_ID > 0 && id == SITE_ID
    return CATEGORY_MAP_TO if CATEGORY_MAP_FROM > 0 && id == CATEGORY_MAP_FROM
    id
  end
end

ImportScripts::AnswerHub.new.perform