From e09ce998843729e60b771652abb6165972340670 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Tue, 4 Jul 2023 21:37:45 +0200 Subject: [PATCH] DEV: Slack import script (#22386) It's very simple import script and currently imports only the following content: * Users * Messages as Discourse topics/posts * Attachments Each channel can be mapped to a category and tags. It uses regular expressions to convert formatted messages ("rich text") into Markdown used by Discourse. In the future we could convert the `blocks` attribute from each message into Markdown instead of applying regular expressions on the `text` attribute. --- .../import_scripts/base/generic_database.rb | 83 ++++- script/import_scripts/slack.rb | 299 ++++++++++++++++++ 2 files changed, 375 insertions(+), 7 deletions(-) create mode 100644 script/import_scripts/slack.rb diff --git a/script/import_scripts/base/generic_database.rb b/script/import_scripts/base/generic_database.rb index f4062695e7a..2a6f2e2e841 100644 --- a/script/import_scripts/base/generic_database.rb +++ b/script/import_scripts/base/generic_database.rb @@ -4,7 +4,7 @@ require "sqlite3" module ImportScripts class GenericDatabase - def initialize(directory, batch_size:, recreate: false, numeric_keys: false) + def initialize(directory, batch_size: 1000, recreate: false, numeric_keys: false) filename = "#{directory}/index.db" File.delete(filename) if recreate && File.exist?(filename) @@ -42,8 +42,8 @@ module ImportScripts def insert_user(user) @db.execute(<<-SQL, prepare(user)) INSERT OR REPLACE - INTO user (id, email, username, name, bio, avatar_path, created_at, last_seen_at, active) - VALUES (:id, :email, :username, :name, :bio, :avatar_path, :created_at, :last_seen_at, :active) + INTO user (id, email, username, name, bio, avatar_path, created_at, last_seen_at, active, staged, admin) + VALUES (:id, :email, :username, :name, :bio, :avatar_path, :created_at, :last_seen_at, :active, :staged, :admin) SQL end @@ -61,8 +61,8 @@ module ImportScripts @db.transaction do @db.execute(<<-SQL, prepare(topic)) - INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url, upload_count) - VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url, :upload_count) + INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url, upload_count, tags) + VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url, :upload_count, :tags) SQL attachments&.each do |attachment| @@ -150,6 +150,28 @@ module ImportScripts SQL end + def calculate_user_last_seen_at_dates + @db.execute <<~SQL + UPDATE user + SET last_seen_at = ( + SELECT MAX(created_at) + FROM post + WHERE post.user_id = user.id + ) + SQL + end + + def calculate_user_created_at_dates + @db.execute <<~SQL + UPDATE user + SET created_at = ( + SELECT MIN(created_at) + FROM post + WHERE post.user_id = user.id + ) + SQL + end + def fetch_categories @db.execute(<<-SQL) SELECT * @@ -325,6 +347,48 @@ module ImportScripts @db.get_first_value(sql) end + def create_missing_topics + posts = @db.execute(<<~SQL) + WITH missing_topics AS (SELECT *, RANK() OVER ( PARTITION BY topic_id ORDER BY created_at ) AS post_number + FROM post p + WHERE NOT EXISTS (SELECT 1 FROM topic t WHERE t.id = p.topic_id)) + SELECT * + FROM missing_topics + WHERE post_number = 1 + SQL + + posts.each do |post| + @db.execute("DELETE FROM post WHERE id = ?", post["id"]) + + topic = post.except("post_number", "reply_to_post_id") + topic["id"] = topic.delete("topic_id") + topic = yield(topic) + + @db.execute(<<-SQL, prepare(topic)) + INSERT OR REPLACE INTO topic (id, title, raw, category_id, closed, user_id, created_at, url, upload_count, tags) + VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url, :upload_count, :tags) + SQL + + @db.execute("DELETE FROM post WHERE id = ?", post["id"]) + + @db.execute(<<-SQL, topic_id: topic["id"], post_id: post["id"]) + INSERT OR REPLACE INTO topic_upload (topic_id, path) + SELECT :topic_id, path + FROM post_upload + WHERE post_id = :post_id + SQL + + @db.execute("DELETE FROM post_upload WHERE post_id = ?", post["id"]) + + @db.execute(<<-SQL, topic_id: topic["id"], post_id: post["id"]) + UPDATE like + SET topic_id = :topic_id, + post_id = NULL + WHERE post_id = :post_id + SQL + end + end + private def configure_database @@ -382,7 +446,9 @@ module ImportScripts avatar_path TEXT, created_at DATETIME, last_seen_at DATETIME, - active BOOLEAN NOT NULL DEFAULT true + active BOOLEAN NOT NULL DEFAULT true, + staged BOOLEAN NOT NULL DEFAULT false, + admin BOOLEAN NOT NULL DEFAULT false ) SQL @@ -400,7 +466,8 @@ module ImportScripts user_id #{key_data_type} NOT NULL, created_at DATETIME, url TEXT, - upload_count INTEGER DEFAULT 0 + upload_count INTEGER DEFAULT 0, + tags JSON ) SQL @@ -486,6 +553,8 @@ module ImportScripts hash[key] = value ? 1 : 0 elsif value.is_a?(Date) hash[key] = value.to_s + elsif value.is_a?(Time) + hash[key] = value.utc.iso8601 end end end diff --git a/script/import_scripts/slack.rb b/script/import_scripts/slack.rb new file mode 100644 index 00000000000..dc6989f2b2b --- /dev/null +++ b/script/import_scripts/slack.rb @@ -0,0 +1,299 @@ +# frozen_string_literal: true + +require "colored2" +require_relative "base" +require_relative "base/generic_database" + +# Minimal Slack importer +# It imports users and messages into Discourse topics/posts + +# Call it like this: +# IMPORT=1 bundle install +# IMPORT=1 bundle exec ruby script/import_scripts/slack.rb "PATH_TO_SLACK_EXPORT" +# +# You will need to create a channels-mapping.json file in order to map Slack channels to Discourse categories and tags. +# The import script will create a template for you if you don't have one. + +class ImportScripts::Slack < ImportScripts::Base + TITLE_TIMEZONE = "America/Los_Angeles" + + def initialize(base_path) + super() + @base_path = base_path + @db = ImportScripts::GenericDatabase.new(@base_path, recreate: true) + end + + def execute + @channel_mapping = load_channel_mapping + read_json_files + + import_users + import_categories + import_topics + import_posts + end + + private + + def read_json_files + puts "", "Reading JSON files..." + + json_from_file("users.json").each do |user| + next if user[:deleted] + + @db.insert_user( + id: user[:id], + email: user[:profile][:email], + name: user[:real_name], + staged: !user[:is_email_confirmed], + admin: user[:is_admin], + avatar_path: user[:profile][:image_original], + ) + end + + @channel_mapping.each do |mapping| + @db.insert_category(id: mapping[:slack_channel], name: mapping[:discourse_category]) + + json_from_directory(mapping[:slack_channel]).each do |message| + raise "Unknown type: #{message[:type]}" if message[:type] != "message" + + topic_id = message[:thread_ts] || message[:ts] + created_at = Time.at(message[:ts].to_f).in_time_zone(TITLE_TIMEZONE) + attachments = message[:files]&.map { |file| file[:url_private_download] } + + if message[:ts] == topic_id + @db.insert_topic( + id: topic_id, + title: "Thread starting at #{created_at.iso8601}", + raw: message[:text].presence || "No text", + category_id: mapping[:slack_channel], + created_at: message[:ts], + user_id: message[:user], + tags: Oj.dump(mapping[:discourse_tags]), + attachments: attachments, + ) + else + @db.insert_post( + id: message[:client_msg_id], + raw: message[:text].presence || "No text", + topic_id: topic_id, + created_at: message[:ts], + user_id: message[:user], + attachments: attachments, + ) + end + end + + @db.create_missing_topics do |topic| + created_at = Time.at(topic["created_at"].to_f).in_time_zone(TITLE_TIMEZONE) + topic[:title] = "Thread starting at #{created_at.iso8601}" + topic[:category_id] = mapping[:slack_channel] + topic[:tags] = Oj.dump(mapping[:discourse_tags]) + topic + end + end + + @db.calculate_user_created_at_dates + @db.calculate_user_last_seen_at_dates + @db.sort_posts_by_created_at + end + + def import_categories + puts "", "Creating categories..." + rows = @db.fetch_categories + + create_categories(rows) { |row| { id: row["id"], name: row["name"] } } + end + + def import_users + puts "", "Creating users..." + total_count = @db.count_users + last_id = "" + + batches do |offset| + rows, last_id = @db.fetch_users(last_id) + break if rows.empty? + + next if all_records_exist?(:users, rows.map { |row| row["id"] }) + + create_users(rows, total: total_count, offset: offset) do |row| + { + id: row["id"], + email: row["email"].presence || fake_email, + name: row["name"], + created_at: row["created_at"], + last_seen_at: row["last_seen_at"], + active: row["active"] == 1, + staged: row["staged"] == 1, + admin: row["admin"] == 1, + merge: true, + post_create_action: + proc do |user| + if row["avatar_path"].present? + begin + UserAvatar.import_url_for_user(row["avatar_path"], user) + rescue StandardError + nil + end + end + end, + } + end + end + end + + def import_topics + puts "", "Creating topics..." + total_count = @db.count_topics + last_id = "" + + batches do |offset| + rows, last_id = @db.fetch_topics(last_id) + break if rows.empty? + + next if all_records_exist?(:posts, rows.map { |row| row["id"] }) + + create_posts(rows, total: total_count, offset: offset) do |row| + user_id = user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id + attachments = @db.fetch_topic_attachments(row["id"]) if row["upload_count"] > 0 + + { + id: row["id"], + title: row["title"].present? ? row["title"].strip[0...255] : "Topic title missing", + raw: to_markdown(row["raw"], attachments, user_id), + category: category_id_from_imported_category_id(row["category_id"]), + user_id: user_id, + created_at: Time.at(row["created_at"].to_f), + tags: Oj.load(row["tags"]), + } + end + end + end + + def import_posts + puts "", "creating posts" + total_count = @db.count_posts + last_row_id = 0 + + batches do |offset| + rows, last_row_id = @db.fetch_sorted_posts(last_row_id) + break if rows.empty? + + next if all_records_exist?(:posts, rows.map { |row| row["id"] }) + + create_posts(rows, total: total_count, offset: offset) do |row| + topic = topic_lookup_from_imported_post_id(row["topic_id"]) + user_id = user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id + attachments = @db.fetch_post_attachments(row["id"]) if row["upload_count"] > 0 + + { + id: row["id"], + raw: to_markdown(row["raw"], attachments, user_id), + user_id: user_id, + topic_id: topic[:topic_id], + created_at: Time.at(row["created_at"].to_f), + } + end + end + end + + def json_from_file(relative_path) + absolute_path = File.join(@base_path, relative_path) + load_json(absolute_path) + end + + def json_from_directory(directory) + base_path = File.join(@base_path, directory) + raise "Directory #{base_path} does not exist" unless File.directory?(base_path) + + Enumerator.new do |y| + # Don't use Dir[] because it allocates an array with the path of every file it finds + # which can use a huge amount of memory! + IO.popen(["find", base_path, "-name", "*.json"]) do |io| + io.each_line do |path| + path.chomp! + load_json(path).each { |item| y.yield(item) } + end + end + end + end + + def load_json(path) + raise "File #{path} does not exist" unless File.exist?(path) + Oj.load(File.read(path), { mode: :strict, symbol_keys: true }) + end + + def load_channel_mapping + path = File.join(@base_path, "channel-mapping.json") + + if !File.exist?(path) + create_channel_mapping_file(path) + puts "", "ERROR: channel-mapping.json is missing".red + puts "An example file has been created at #{path}".red, "Please edit it and try again.".red + exit 1 + end + + load_json(path) + end + + def create_channel_mapping_file(mapping_file_path) + mapping = + Dir[File.join(@base_path, "/*/")].map do |path| + channel = File.basename(path) + { slack_channel: channel, discourse_category: channel, discourse_tags: [] } + end + + File.write(mapping_file_path, Oj.dump(mapping, indent: 4)) + end + + def to_markdown(text, attachments, user_id) + # Emoji skin tones + text.gsub!(/::skin-tone-(\d):/, ':t\1:') + + # Mentions + text.gsub!(/<@(\w+)>/) do + user_id = $1 + username = @lookup.find_username_by_import_id(user_id) + username ? "@#{username}" : "`@#{user_id}`" + end + + # Links + text.gsub!(%r{<(https?://[^|]+?)\|([^>]+?)>}, '[\2](\1)') + text.gsub!(%r{<(https?://[^>]+?)>}, '\1') + + # Code blocks + text.gsub!(/```(.+?)```/m, "```\n\\1\n```") + + # Images and files + if attachments + attachments.each do |attachment| + upload_markdown = download_file(attachment["path"], user_id) + text << "\n#{upload_markdown}" + end + end + + text + end + + def download_file(url, user_id) + uri = URI.parse(url) + filename = File.basename(uri.path) + + tempfile = + FileHelper.download( + url, + max_file_size: SiteSetting.max_image_size_kb.kilobytes, + tmp_file_name: "sso-avatar", + follow_redirect: true, + ) + + return unless tempfile + + upload = UploadCreator.new(tempfile, filename, origin: url).create_for(user_id) + html_for_upload(upload, filename) + ensure + tempfile.close! if tempfile && tempfile.respond_to?(:close!) + end +end + +ImportScripts::Slack.new(ARGV[0]).perform