# discourse/script/import_scripts/answerbase.rb

# frozen_string_literal: true

require "csv"
require "reverse_markdown"
require_relative "base"
require_relative "base/generic_database"

# Call it like this:
#   RAILS_ENV=production bundle exec ruby script/import_scripts/answerbase.rb DIRNAME
class ImportScripts::Answerbase < ImportScripts::Base
  # Base URL of the Answerbase site being imported. It is interpolated into
  # QUESTION_LINK_REGEX and used to recognise link texts that contain an old URL.
  OLD_DOMAIN = "http://answerbase.example.com" # without trailing slash
  # Base URL that rewritten question links are built from.
  NEW_DOMAIN = "https://discourse.example.com"
  # Sub-directories of the export directory that are joined with the file names
  # found in the CSV rows (avatars, attachments and images).
  AVATAR_DIRECTORY = "User Images"
  ANSWER_ATTACHMENT_DIRECTORY = "Answer Attachments"
  ANSWER_IMAGE_DIRECTORY = "Answer Images"
  QUESTION_ATTACHMENT_DIRECTORY = "Question Attachments"
  QUESTION_IMAGE_DIRECTORY = "Question Images"
  # Matches an <img> wrapped in a link whose href carries a "relativeUrl="
  # parameter; the named group "path" captures that parameter's value.
  EMBEDDED_IMAGE_REGEX =
    %r{<a[^>]*href="[^"]*relativeUrl=(?<path>[^"\&]*)[^"]*"[^>]*>\s*<img[^>]*>\s*</a>}i
  # Matches a link into OLD_DOMAIN whose href contains "q<digits>" or
  # "questionid=<digits>"; captures the digits as "id" and the link text as "text".
  QUESTION_LINK_REGEX =
    %r{<a[^>]*?href="#{Regexp.escape(OLD_DOMAIN)}/[^"]*?(?:q|questionid=)(?<id>\d+)[^"]*?"[^>]*>(?<text>.*?)</a>}i
  # Rule appended to SiteSetting.permalink_normalizations.
  # NOTE(review): presumably a "/pattern/replacement" rule that reduces old
  # question URLs to "q<id>", the permalink format created in import_topics — confirm.
  TOPIC_LINK_NORMALIZATION = '/.*?-(q\d+).*/\1'
  # Batch size handed to the intermediate database and to the batch loops.
  BATCH_SIZE = 1000

  # Prepares an importer for one Answerbase export.
  #
  # @param path [String] directory that contains the exported CSV files and
  #   the avatar/attachment/image sub-directories
  def initialize(path)
    super()

    @path = path

    # Intermediate store: the CSV rows are inserted here first and fetched
    # back in batches by the import_* steps.
    database_options = { batch_size: BATCH_SIZE, recreate: true, numeric_keys: true }
    @db = ImportScripts::GenericDatabase.new(@path, **database_options)
  end

  # Runs the complete import: the CSV export is staged first, then every
  # remaining step runs in the order listed below.
  #
  # @return the result of the last step (import_posts)
  def execute
    steps = %i[
      read_csv_files
      add_permalink_normalizations
      import_categories
      import_users
      import_topics
      import_posts
    ]

    steps.map { |step| send(step) }.last
  end

  # Stages the Answerbase CSV export in the intermediate database.
  #
  # Reads, in this order, the "categories", "users" and
  # "questions-answers-comments" CSV files. Rows of type "Question" are stored
  # as topics; every other row is stored as a post of the closest published
  # question above it in the file. Rows whose "published" column is "No" are
  # skipped, and so are answers/comments that have no published question to
  # belong to.
  #
  # @raise [StandardError] re-raised, after printing the offending row, when
  #   storing a topic or post fails
  def read_csv_files
    puts "", "reading CSV files..."

    # Categories get a 1-based position in file order
    category_position = 0
    csv_parse("categories") do |row|
      category_position += 1
      @db.insert_category(id: row[:id], name: row[:name], position: category_position)
    end

    csv_parse("users") do |row|
      @db.insert_user(
        id: row[:id],
        email: row[:email],
        username: row[:username],
        bio: row[:description],
        avatar_path: row[:profile_image],
        created_at: parse_date(row[:createtime]),
        active: true,
      )
    end

    # id of the question that the following answer/comment rows belong to
    last_topic_id = nil
    csv_parse("questions-answers-comments") do |row|
      is_question = row[:type] == "Question"

      if row[:published] == "No"
        # Forget the parent when a question is skipped; otherwise the rows that
        # follow it would be attached to the previous question.
        last_topic_id = nil if is_question
        next
      end

      if !is_question && last_topic_id.nil?
        STDERR.puts "Skipping #{row[:type]} #{row[:id]}: no published question precedes it"
        next
      end

      user_id = @db.get_user_id(row[:username])
      created_at = parse_datetime(row[:createtime])

      begin
        if is_question
          attachments =
            parse_filenames(row[:attachments], QUESTION_ATTACHMENT_DIRECTORY) +
              parse_filenames(row[:images], QUESTION_IMAGE_DIRECTORY)

          @db.insert_topic(
            id: row[:id],
            title: row[:title],
            raw: row[:text],
            category_id: row[:categorylist],
            user_id: user_id,
            created_at: created_at,
            attachments: attachments,
          )
          last_topic_id = row[:id]
        else
          attachments =
            parse_filenames(row[:attachments], ANSWER_ATTACHMENT_DIRECTORY) +
              parse_filenames(row[:images], ANSWER_IMAGE_DIRECTORY)

          @db.insert_post(
            id: row[:id],
            raw: row[:text],
            topic_id: last_topic_id,
            user_id: user_id,
            created_at: created_at,
            attachments: attachments,
          )
        end
      rescue StandardError
        # Show which row could not be stored before aborting
        p row
        raise
      end
    end
  end

  # Turns a ";"-separated list of file names into paths below the export directory.
  #
  # @param text [String, nil] raw cell value, e.g. "a.png; b.pdf"
  # @param directory [String] sub-directory of the export directory holding the files
  # @return [Array<String>] one path per non-empty file name; empty when +text+
  #   is nil or contains no file name
  def parse_filenames(text, directory)
    # Empty segments (from "a.png;;b.png", a leading ";" or "; ") are dropped:
    # joined as they are, they would yield the directory itself instead of a file.
    text
      .to_s
      .split(";")
      .map(&:strip)
      .reject(&:empty?)
      .map { |filename| File.join(@path, directory, filename) }
  end

  # Parses a date written as month/day/two-digit-year.
  #
  # @param text [String, nil]
  # @return [DateTime, nil] nil when +text+ is blank
  def parse_date(text)
    DateTime.strptime(text, "%m/%d/%y") unless text.blank?
  end

  # Parses a timestamp and returns it as a DateTime after calling #utc on it.
  #
  # @param text [String, nil]
  # @return [DateTime, nil] nil when +text+ is blank
  def parse_datetime(text)
    # The format is left to DateTime.parse; a strict alternative would be
    # DateTime.strptime(text, "%m/%d/%Y %H:%M")
    DateTime.parse(text).utc.to_datetime unless text.blank?
  end

  # Creates the categories that were staged in the intermediate database.
  # Only the id, name, description and position columns are handed on.
  def import_categories
    puts "", "creating categories"

    create_categories(@db.fetch_categories) do |category|
      %w[id name description position].each_with_object({}) do |column, attributes|
        attributes[column.to_sym] = category[column]
      end
    end
  end

  # Delegates to the inherited batching helper with BATCH_SIZE, so every batch
  # loop in this importer uses the same size.
  def batches(&block)
    super(BATCH_SIZE, &block)
  end

  # Creates the staged users batch by batch. Each user's avatar is uploaded
  # through a post_create_action callback.
  def import_users
    puts "", "creating users"
    user_count = @db.count_users
    cursor = 0

    batches do |offset|
      # fetch_users hands back the batch together with the cursor for the next one
      batch, cursor = @db.fetch_users(cursor)
      break if batch.empty?

      imported_ids = batch.map { |user| user["id"] }
      next if all_records_exist?(:users, imported_ids)

      create_users(batch, total: user_count, offset: offset) do |user|
        upload_avatar = proc { |created_user| create_avatar(created_user, user["avatar_path"]) }

        {
          id: user["id"],
          email: user["email"],
          username: user["username"],
          bio_raw: user["bio"],
          created_at: user["created_at"],
          active: user["active"] == 1,
          post_create_action: upload_avatar,
        }
      end
    end
  end

  # Uploads a user's avatar from the export's avatar directory.
  #
  # @param user the user record the avatar belongs to
  # @param avatar_path [String, nil] file name relative to AVATAR_DIRECTORY
  # @return the uploader's result, or nil when there is no avatar or the file
  #   is missing (a message is written to STDERR in the latter case)
  def create_avatar(user, avatar_path)
    return if avatar_path.blank?

    full_path = File.join(@path, AVATAR_DIRECTORY, avatar_path)
    return @uploader.create_avatar(user, full_path) if File.exist?(full_path)

    STDERR.puts "Could not find avatar: #{full_path}"
  end

  # Creates one topic per staged question, batch by batch. After a topic has
  # been created, a "q<id>" permalink pointing at it is added unless one exists.
  def import_topics
    puts "", "creating topics"
    topic_count = @db.count_topics
    cursor = 0

    batches do |offset|
      # fetch_topics hands back the batch together with the cursor for the next one
      batch, cursor = @db.fetch_topics(cursor)
      break if batch.empty?

      imported_ids = batch.map { |topic| topic["id"] }
      next if all_records_exist?(:posts, imported_ids)

      create_posts(batch, total: topic_count, offset: offset) do |topic|
        files = topic["upload_count"] > 0 ? @db.fetch_topic_attachments(topic["id"]) : nil
        # Questions whose author is unknown are attributed to the system user
        author_id = user_id_from_imported_user_id(topic["user_id"]) || Discourse.system_user.id
        # A question without body text falls back to its title
        body = raw_with_attachments(topic["raw"].presence || topic["title"], files, author_id)

        add_permalink =
          proc do |post|
            url = "q#{topic["id"]}"
            Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
          end

        {
          id: topic["id"],
          title: topic["title"],
          raw: body,
          category: category_id_from_imported_category_id(topic["category_id"]),
          user_id: author_id,
          created_at: topic["created_at"],
          closed: topic["closed"] == 1,
          post_create_action: add_permalink,
        }
      end
    end
  end

  # Creates one post per staged answer/comment, batch by batch, inside the
  # topic that was created for its question.
  #
  # A row whose question was not imported is reported on STDERR and skipped
  # instead of aborting the import with a NoMethodError on nil.
  def import_posts
    puts "", "creating posts"
    total_count = @db.count_posts
    last_row_id = 0

    batches do |offset|
      # fetch_posts hands back the batch together with the cursor for the next one
      rows, last_row_id = @db.fetch_posts(last_row_id)
      break if rows.empty?

      next if all_records_exist?(:posts, rows.map { |row| row["id"] })

      create_posts(rows, total: total_count, offset: offset) do |row|
        topic = topic_lookup_from_imported_post_id(row["topic_id"])

        unless topic
          STDERR.puts "Could not find topic #{row["topic_id"].inspect} for post #{row["id"]}"
          # NOTE(review): relies on create_posts treating a nil block result as
          # "skip this row" — confirm against ImportScripts::Base.
          next
        end

        attachments = @db.fetch_post_attachments(row["id"]) if row["upload_count"] > 0
        # Posts whose author is unknown are attributed to the system user
        user_id = user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id

        {
          id: row["id"],
          raw: raw_with_attachments(row["raw"], attachments, user_id),
          user_id: user_id,
          topic_id: topic[:topic_id],
          created_at: row["created_at"],
        }
      end
    end
  end

  # Builds the final post text: embedded images are replaced by uploads, links
  # to old questions are rewritten, the HTML is converted with ReverseMarkdown
  # and the remaining attachments are uploaded and appended, one per line.
  #
  # @param raw [String, nil] HTML of the question/answer; nil is treated as ""
  # @param attachments [Array<Hash>, nil] rows with a "path" entry
  # @param user_id [Integer] owner of the created uploads
  # @return [String]
  def raw_with_attachments(raw, attachments, user_id)
    # A row without any text may arrive as nil, while the helpers below call
    # String methods on it.
    raw, embedded_paths, upload_ids = replace_embedded_attachments(raw.to_s, user_id)
    raw = replace_question_links(raw)
    raw = ReverseMarkdown.convert(raw) || ""

    attachments&.each do |attachment|
      path = attachment["path"]
      # Already part of the text as an embedded image
      next if embedded_paths.include?(path)

      unless File.exist?(path)
        STDERR.puts "Could not find file: #{path}"
        next
      end

      filename = File.basename(path)
      upload = @uploader.create_upload(user_id, path, filename)

      # Skip failed uploads and uploads that are already embedded in the text
      next unless upload.present? && upload.persisted? && !upload_ids.include?(upload.id)

      raw = "#{raw}\n#{@uploader.html_for_upload(upload, filename)}"
    end

    raw
  end

  # Replaces every linked image that matches EMBEDDED_IMAGE_REGEX with the
  # markup of a freshly created upload. Images that cannot be found or
  # uploaded are removed from the text.
  #
  # @param raw [String] HTML of the question/answer
  # @param user_id [Integer] owner of the created uploads
  # @return [Array] the new text, the paths of the embedded files and the ids
  #   of their uploads
  def replace_embedded_attachments(raw, user_id)
    paths = []
    upload_ids = []

    raw =
      raw.gsub(EMBEDDED_IMAGE_REGEX) do
        # Kept in its own variable so that the error message can still name the
        # file after the lookup below came back empty.
        linked_path = File.join(@path, Regexp.last_match["path"])
        filename = File.basename(linked_path)
        image_path = find_image_path(filename)

        # Only a String is a usable path; any other lookup result means "not found".
        unless image_path.is_a?(String)
          STDERR.puts "Could not find file: #{linked_path}"
          next ""
        end

        upload = @uploader.create_upload(user_id, image_path, filename)
        next "" unless upload.present? && upload.persisted?

        paths << image_path
        upload_ids << upload.id
        @uploader.html_for_upload(upload, filename)
      end

    [raw, paths, upload_ids]
  end

  # Looks for an image in the question image directory first and in the answer
  # image directory second.
  #
  # @param filename [String] file name without directory
  # @return [String, nil] path of the first existing file, or nil when there is
  #   none (a plain `each` would hand back the list of directories instead)
  def find_image_path(filename)
    [QUESTION_IMAGE_DIRECTORY, ANSWER_IMAGE_DIRECTORY]
      .map { |directory| File.join(@path, directory, filename) }
      .find { |candidate| File.file?(candidate) }
  end

  # Rewrites links to questions on the old site so that they point at the
  # imported topics. Links to questions that were not imported stay untouched.
  #
  # @param raw [String] HTML of the question/answer
  # @return [String] the text with all known question links rewritten
  def replace_question_links(raw)
    raw.gsub(QUESTION_LINK_REGEX) do
      match = Regexp.last_match
      topic = topic_lookup_from_imported_post_id(match["id"])

      # `next`, not `return`: returning from inside the block would leave the
      # whole method and hand back just this link instead of the full text.
      next match.to_s unless topic

      url = File.join(NEW_DOMAIN, topic[:url])
      text = match["text"]
      # A link text that shows the old URL is replaced by the bare new URL
      text.include?(OLD_DOMAIN) ? url : "<a href='#{url}'>#{text}</a>"
    end
  end

  # Makes sure TOPIC_LINK_NORMALIZATION is part of the "|"-separated
  # permalink_normalizations site setting, keeping the rules already present.
  def add_permalink_normalizations
    existing = SiteSetting.permalink_normalizations

    rules = []
    rules = existing.split("|") unless existing.blank?

    add_normalization(rules, TOPIC_LINK_NORMALIZATION)

    SiteSetting.permalink_normalizations = rules.join("|")
  end

  # Appends a rule to the list unless the list already contains it.
  #
  # @param normalizations [Array<String>] list that is modified in place
  # @param normalization [String] rule to add
  # @return [Array<String>, nil] the list when the rule was appended, nil otherwise
  def add_normalization(normalizations, normalization)
    normalizations << normalization unless normalizations.include?(normalization)
  end

  # Tells whether a permalink with the given URL is already stored.
  #
  # @param url [String]
  # @return [Boolean] a real boolean; the matching record itself is not loaded
  def permalink_exists?(url)
    Permalink.exists?(url: url)
  end

  # Yields every row of "<table_name>.csv" from the export directory. The
  # first line is used as header with symbolised column names, blank lines are
  # skipped and a leading UTF-8 byte order mark is tolerated.
  #
  # @param table_name [String] file name without the ".csv" extension
  # @yieldparam row [CSV::Row]
  def csv_parse(table_name)
    csv_path = File.join(@path, "#{table_name}.csv")
    options = { headers: true, header_converters: :symbol, skip_blanks: true, encoding: "bom|utf-8" }

    CSV.foreach(csv_path, **options) { |row| yield row }
  end
end

# The only argument is the directory that contains the Answerbase export
import_directory = ARGV[0]

if import_directory.nil? || !Dir.exist?(import_directory)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/answerbase.rb DIRNAME", ""
  exit 1
end

ImportScripts::Answerbase.new(import_directory).perform