mirror of
https://github.com/discourse/discourse.git
synced 2024-11-27 18:23:37 +08:00
c83914e2e5
Various migration scripts define a normalize_raw method to do custom processing of post contents before storing them in Post.raw and other fields. These methods normally do not handle nil inputs, but nil is a relatively common occurrence in data dumps. Since this method is used from various points in the migration script, as it stands, the experience of using a migration script is that it will fail multiple times at different points, forcing you to fix the data or apply logic hacks every time and then restart. This PR generalizes handling of nil input by returning a <missing> string. Pros: no more messy repeated crashes and restarts; consistency. Cons: it might hide data issues. OTOH, we can't print a warning in that method because it would flood the console, since it's called from inside loops. * FIX: zendesk import script: support nil inputs in normalize_raw * FIX: return '<missing>' instead of empty string; do it for all methods
508 lines
15 KiB
Ruby
508 lines
15 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# Yammer importer
|
|
# https://docs.microsoft.com/en-us/yammer/manage-security-and-compliance/export-yammer-enterprise-data#export-yammer-network-data-by-date-range-and-network
|
|
#
|
|
# You will need a bunch of CSV files:
|
|
#
|
|
# - Users.csv Groups.csv Files.csv Messages.csv
|
|
# (Others included in Yammer export are ignored)
|
|
|
|
require "csv"
|
|
require_relative "base"
|
|
require_relative "base/generic_database"
|
|
|
|
# Call it like this:
|
|
# RAILS_ENV=production bundle exec ruby script/import_scripts/yammer.rb DIRNAME
|
|
|
|
class ImportScripts::Yammer < ImportScripts::Base
|
|
# Batch size handed to the intermediate database and to the batching loop.
BATCH_SIZE = 1000
# Number of leading words of a message body used as the topic title when the
# message has no title of its own. The default must be applied BEFORE the
# integer conversion: `nil.to_i` is 0, and 0 is truthy in Ruby, so
# `ENV[...].to_i || 20` could never produce 20.
NUM_WORDS_IN_TITLE = ENV.fetch("NUM_WORDS_IN_TITLE", 20).to_i
SKIP_EMPTY_EMAIL = true
# When true, users whose state is not "active" are not loaded from Users.csv.
SKIP_INACTIVE_USERS = false
# Name of the category that is looked up (and created if missing) for the import.
PARENT_CATEGORY_NAME = ENV["PARENT_CATEGORY_NAME"] || "Yammer Import"
# When true, Yammer groups become tags instead of categories.
IMPORT_GROUPS_AS_TAGS = true
MERGE_USERS = true
|
|
# Groups are imported as tags rather than as categories, so tagging has to be
# switched on for the site in that mode.
SiteSetting.tagging_enabled = true if IMPORT_GROUPS_AS_TAGS
# Tag attached to every imported private-message topic; override with ENV["PM_TAG"].
PM_TAG = ENV.fetch("PM_TAG", "eht")
|
|
|
|
# Sets up the importer.
#
# path - directory holding the Yammer CSV export; it is also handed to the
#        intermediate GenericDatabase, which is recreated on every run.
def initialize(path)
  super()

  @path = path
  db_options = { batch_size: BATCH_SIZE, recreate: true }
  @db = ImportScripts::GenericDatabase.new(@path, **db_options)
end
|
|
|
|
# Runs the import steps in order.
def execute
  # Preparation: developer accounts, then load the CSV export into @db.
  create_developer_users
  read_csv_files

  # Records that topics and posts refer to.
  import_categories
  import_users

  # Thread starters first, so that replies can find their topic.
  import_topics
  import_pm_topics

  # Replies.
  import_posts
  import_pm_posts
end
|
|
|
|
# Creates an active user for every address in the comma-separated
# GlobalSetting.developer_emails list, using the part before "@" as username.
#
# An unset (nil) setting, whitespace around an address and empty entries
# (e.g. a trailing comma) are tolerated instead of raising or producing
# users with a blank/padded email.
def create_developer_users
  GlobalSetting
    .developer_emails
    .to_s
    .split(",")
    .map(&:strip)
    .reject(&:empty?)
    .each { |email| User.create(email: email, active: true, username: email.split("@")[0]) }
end
|
|
|
|
# Loads the Yammer CSV export into the intermediate database (@db):
#
#   Users.csv    -> users
#   Groups.csv   -> categories (position follows file order, starting at 1)
#   Files.csv    -> uploads
#   Messages.csv -> topics, PM topics, posts and PM posts
#
# Messages.csv is read once per target table. A message starts a thread when
# its thread_id equals its own id; every other message is a reply. Messages
# with a non-blank deleted_at are skipped. in_private_conversation == "true"
# routes a message to the PM tables.
def read_csv_files
  puts "", "reading CSV files"

  # consider csv_parse Tags.csv
  # consider Admins.csv that has admins

  csv_parse("Users") do |row|
    next if SKIP_INACTIVE_USERS && row[:state] != "active"

    # Columns present in the export but not stored: job_title, location,
    # department, deleted_at, suspended_at, guid, state (only its "active"
    # flag is kept) and last_seen_at.
    @db.insert_user(
      id: row[:id],
      email: row[:email],
      name: row[:name],
      username: row[:name],
      bio: "",
      created_at: parse_datetime(row[:joined_at]),
      avatar_path: row[:user_cover_image_url],
      active: row[:state] == "active" ? 1 : 0,
    )
  end

  category_position = 0
  csv_parse("Groups") do |row|
    @db.insert_category(
      id: row[:id],
      name: row[:name],
      description: row[:description],
      position: category_position += 1,
    )
  end

  csv_parse("Files") do |row|
    @db.insert_upload(
      id: row[:file_id],
      user_id: row[:uploader_id],
      original_filename: row[:name],
      filename: row[:path],
      description: row[:description],
    )
  end

  # get topics from messages
  csv_parse("Messages") do |row|
    next unless row[:thread_id] == row[:id]
    next if row[:in_private_conversation] == "true"
    next unless row[:deleted_at].blank?
    # next if row[:message_type] == 'system'

    title, raw = thread_title_and_raw(row)
    @db.insert_topic(
      id: row[:id],
      title: title,
      raw: raw,
      category_id: row[:group_id],
      closed: row[:closed] == "TRUE" ? 1 : 0,
      user_id: row[:sender_id],
      created_at: parse_datetime(row[:created_at]),
    )
  end

  # get pm topics
  csv_parse("Messages") do |row|
    next unless row[:thread_id] == row[:id]
    next unless row[:in_private_conversation] == "true"
    next unless row[:deleted_at].blank?
    # next if row[:message_type] == 'system'

    title, raw = thread_title_and_raw(row)
    @db.insert_pm_topic(
      id: row[:id],
      title: title,
      raw: raw,
      category_id: row[:group_id],
      closed: row[:closed] == "TRUE" ? 1 : 0,
      # participants may be missing; store an empty list rather than crash
      target_users: row[:participants].to_s.gsub("user:", ""),
      user_id: row[:sender_id],
      created_at: parse_datetime(row[:created_at]),
    )
  end

  # get posts from messages
  csv_parse("Messages") do |row|
    next if row[:thread_id] == row[:id]
    next unless row[:deleted_at].blank?
    next if row[:in_private_conversation] == "true"

    @db.insert_post(
      id: row[:id],
      # interpolation keeps the "body\nattachments" layout but accepts nil fields
      raw: "#{row[:body]}\n#{row[:attachments]}",
      topic_id: row[:thread_id],
      reply_to_post_id: row[:replied_to_id],
      user_id: row[:sender_id],
      created_at: parse_datetime(row[:created_at]),
    )
  end

  # get pm posts from messages
  csv_parse("Messages") do |row|
    next if row[:thread_id] == row[:id]
    next unless row[:deleted_at].blank?
    # Only replies inside private conversations belong here. Selecting "false"
    # would pick the public replies a second time and drop every PM reply.
    next unless row[:in_private_conversation] == "true"

    @db.insert_pm_post(
      id: row[:id],
      raw: "#{row[:body]}\n#{row[:attachments]}",
      topic_id: row[:thread_id],
      reply_to_post_id: row[:replied_to_id],
      user_id: row[:sender_id],
      created_at: parse_datetime(row[:created_at]),
    )
  end

  #@db.delete_unused_users
  @db.sort_posts_by_created_at
end

# Builds the title and raw body for a thread-starting Messages.csv row.
#
# Starts from the message body (empty string when missing) and inspects the
# attachments column:
# - an "opengraphobject:[...]" entry supplies the title and, unless the body
#   already contains the URL, appends the description and URL to the body;
# - a trailing "uploadedfile:<id>" entry is looked up in @db and, when the
#   file exists under @path, uploaded and appended to the body.
#
# Returns [title, raw].
def thread_title_and_raw(row)
  title = ""
  raw = row[:body].to_s
  attachments = row[:attachments]
  return title, raw unless attachments

  opengraph = /opengraphobject:\[(\d*?) : (.*?) : title="(.*?)" : description="(.*?)"\]/
  attachments.match(opengraph) do |m|
    url = m[2]
    title = m[3] if m[3]
    description = m[4]
    raw += "\n***\n#{description}\n#{url}\n" unless raw.include?(url)
  end

  attachments.match(/uploadedfile:(\d*)$/) do |m|
    up = @db.fetch_upload(m[1].to_i).first
    # the referenced file may not be listed in Files.csv
    next unless up && up["filename"]

    path = File.join(@path, up["filename"])
    filename = up["original_filename"]
    # Rows are keyed by symbol and the author column is sender_id. The lookup
    # only finds users already known to the importer; otherwise the upload is
    # attributed to the system user.
    user_id = user_id_from_imported_user_id(row[:sender_id]) || Discourse.system_user.id
    if File.exist?(path)
      upload = create_upload(user_id, path, filename)
      raw += html_for_upload(upload, filename) if upload&.persisted?
    end
  end

  [title, raw]
end
|
|
|
|
# Converts a timestamp string from the export into a DateTime.
# Returns nil for blank input and for the literal string "null".
def parse_datetime(text)
  no_value = text.blank? || text == "null"
  no_value ? nil : DateTime.parse(text)
end
|
|
|
|
# Prepares categories (or tags) for the imported groups.
#
# Unless PARENT_CATEGORY_NAME is blank, the category with that name is looked
# up and created (owned by the system user) when it does not exist yet.
#
# With IMPORT_GROUPS_AS_TAGS the groups are only recorded in @tag_map
# (group id => group name) for later tagging; otherwise each group becomes a
# category below the parent category.
def import_categories
  puts "", "creating categories"

  parent_category = nil
  unless PARENT_CATEGORY_NAME.blank?
    parent_category =
      Category.find_by(name: PARENT_CATEGORY_NAME) ||
        Category.create(name: PARENT_CATEGORY_NAME, user_id: Discourse.system_user.id)
  end

  rows = @db.fetch_categories

  if IMPORT_GROUPS_AS_TAGS
    @tag_map = {}
    rows.each { |row| @tag_map[row["id"]] = row["name"] }
  else
    create_categories(rows) do |row|
      {
        id: row["id"],
        name: row["name"],
        description: row["description"],
        position: row["position"],
        # pass the id (or nil), not the Category record itself
        parent_category_id: parent_category&.id,
      }
    end
  end
end
|
|
|
|
# Delegates to the parent class' batching loop with this importer's
# BATCH_SIZE, forwarding the caller's block.
def batches(&block)
  super(BATCH_SIZE, &block)
end
|
|
|
|
# Creates users from the intermediate database, batch by batch.
#
# A row whose email matches an existing user is not created again: the
# existing user is tagged with the import id, registered with the importer
# via add_user, and the row is skipped. Rows without an email are skipped
# when SKIP_EMPTY_EMAIL is set, because they can neither be matched nor
# lower-cased for the lookup.
def import_users
  puts "", "creating users"
  total_count = @db.count_users
  puts "", "Got #{total_count} users!"
  last_id = ""

  batches do |offset|
    rows, last_id = @db.fetch_users(last_id)
    break if rows.empty?

    next if all_records_exist?(:users, rows.map { |row| row["id"] })

    create_users(rows, total: total_count, offset: offset) do |row|
      email = row["email"]
      # returning nil from the block leaves the row out
      next if SKIP_EMPTY_EMAIL && email.blank?

      user = User.find_by_email(email.to_s.downcase)
      if user
        user.custom_fields["import_id"] = row["id"]
        user.custom_fields["matched_existing"] = "yes"
        user.save
        add_user(row["id"].to_s, user)
        next
      end

      {
        id: row["id"],
        email: email,
        name: row["name"],
        created_at: row["created_at"],
        last_seen_at: row["last_seen_at"],
        active: row["active"] == 1,
      }
    end
  end
end
|
|
|
|
# Creates the first post (and thereby the topic) of every regular thread
# stored in the intermediate database.
#
# - Title: the stored title, or the first NUM_WORDS_IN_TITLE words of the body.
# - Category: the parent category when groups are imported as tags, otherwise
#   the category mapped from the group.
# - With IMPORT_GROUPS_AS_TAGS the new topic is tagged with its group's name.
def import_topics
  puts "", "creating topics"
  staff_guardian = Guardian.new(Discourse.system_user)
  # the lookup does not depend on the batch, so do it once
  base_category = Category.find_by(name: PARENT_CATEGORY_NAME)

  total_count = @db.count_topics
  last_id = ""

  batches do |offset|
    rows, last_id = @db.fetch_topics(last_id)
    break if rows.empty?

    next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row["id"]) })

    create_posts(rows, total: total_count, offset: offset) do |row|
      title =
        if row["title"].present?
          row["title"]
        else
          # raw can be missing in the export; normalize_raw copes with that,
          # so the title fallback must not blow up on nil either
          row["raw"].to_s.split(/\W/)[0..(NUM_WORDS_IN_TITLE - 1)].join(" ")
        end

      category_id =
        if IMPORT_GROUPS_AS_TAGS
          base_category&.id
        else
          category_id_from_imported_category_id(row["category_id"])
        end

      {
        id: import_topic_id(row["id"]),
        title: title,
        raw: normalize_raw(row["raw"]),
        category: category_id,
        user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
        created_at: row["created_at"],
        closed: row["closed"] == 1,
        post_create_action:
          proc do |post|
            if IMPORT_GROUPS_AS_TAGS
              # a thread outside any group has no tag; never pass [nil]
              tag_names = [@tag_map[row["category_id"]]].compact
              unless tag_names.empty?
                topic = Topic.find(post.topic_id)
                DiscourseTagging.tag_topic_by_names(topic, staff_guardian, tag_names)
              end
            end
          end,
      }
    end
  end
end
|
|
|
|
# Creates the first post (and thereby the topic) of every private
# conversation stored in the intermediate database.
#
# Recipients come from the comma-separated imported user ids in
# "target_users"; ids that do not map to an imported user are ignored.
# Title and category follow the same rules as for regular topics, and each
# created topic is tagged with PM_TAG.
def import_pm_topics
  puts "", "creating pm topics"
  staff_guardian = Guardian.new(Discourse.system_user)
  # the lookup does not depend on the batch, so do it once
  base_category = Category.find_by(name: PARENT_CATEGORY_NAME)

  total_count = @db.count_pm_topics
  last_id = ""

  batches do |offset|
    rows, last_id = @db.fetch_pm_topics(last_id)
    break if rows.empty?

    next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row["id"]) })

    create_posts(rows, total: total_count, offset: offset) do |row|
      # target_users may be missing; treat that as "no recipients"
      recipients =
        row["target_users"].to_s.split(",").map do |imported_id|
          user_id = user_id_from_imported_user_id(imported_id)
          User.find(user_id).username if user_id
        end
      target_usernames = recipients.compact.join(",")

      title =
        if row["title"].present?
          row["title"]
        else
          # raw can be missing in the export; do not crash on nil
          row["raw"].to_s.split(/\W/)[0..(NUM_WORDS_IN_TITLE - 1)].join(" ")
        end

      category_id =
        if IMPORT_GROUPS_AS_TAGS
          base_category&.id
        else
          category_id_from_imported_category_id(row["category_id"])
        end

      {
        id: import_topic_id(row["id"]),
        title: title,
        raw: normalize_raw(row["raw"]),
        category: category_id,
        user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
        created_at: row["created_at"],
        closed: row["closed"] == 1,
        archetype: Archetype.private_message,
        target_usernames: target_usernames,
        post_create_action:
          proc do |post|
            if PM_TAG
              topic = Topic.find(post.topic_id)
              tag_names = [PM_TAG]
              DiscourseTagging.tag_topic_by_names(topic, staff_guardian, tag_names)
            end
          end,
      }
    end
  end
end
|
|
|
|
# Returns the import id used for a thread's first post: the Yammer thread id
# prefixed with "T".
def import_topic_id(topic_id)
  ["T", topic_id].join
end
|
|
|
|
# Creates the replies of regular threads from the intermediate database.
# A reply whose thread cannot be found among the imported topics is printed
# and left out.
def import_posts
  puts "", "creating posts"
  post_total = @db.count_posts
  cursor = 0

  batches do |offset|
    batch, cursor = @db.fetch_sorted_posts(cursor)
    break if batch.empty?

    imported_ids = batch.map { |post_row| post_row["id"] }
    next if all_records_exist?(:posts, imported_ids)

    create_posts(batch, total: post_total, offset: offset) do |post_row|
      parent = topic_lookup_from_imported_post_id(import_topic_id(post_row["topic_id"]))

      unless parent
        p "MISSING TOPIC #{post_row["topic_id"]}"
        p post_row
        next
      end

      {
        id: post_row["id"],
        raw: normalize_raw(post_row["raw"]),
        user_id: user_id_from_imported_user_id(post_row["user_id"]) || Discourse.system_user.id,
        topic_id: parent[:topic_id],
        created_at: post_row["created_at"],
      }
    end
  end
end
|
|
|
|
# Creates the replies of private conversations from the intermediate
# database. A reply whose thread cannot be found among the imported topics is
# printed and left out.
def import_pm_posts
  puts "", "creating pm posts"
  post_total = @db.count_pm_posts
  cursor = 0

  batches do |offset|
    batch, cursor = @db.fetch_pm_posts(cursor)
    break if batch.empty?

    imported_ids = batch.map { |post_row| post_row["id"] }
    next if all_records_exist?(:posts, imported_ids)

    create_posts(batch, total: post_total, offset: offset) do |post_row|
      parent = topic_lookup_from_imported_post_id(import_topic_id(post_row["topic_id"]))

      unless parent
        p "MISSING TOPIC #{post_row["topic_id"]}"
        p post_row
        next
      end

      {
        id: post_row["id"],
        raw: normalize_raw(post_row["raw"]),
        user_id: user_id_from_imported_user_id(post_row["user_id"]) || Discourse.system_user.id,
        topic_id: parent[:topic_id],
        created_at: post_row["created_at"],
      }
    end
  end
end
|
|
|
|
# Cleans up a message body before it is stored as post content.
#
# - Blank input (nil, empty) yields the placeholder "<missing>".
# - Every literal backslash-n sequence (two characters, as matched by the
#   single-quoted '\n') is removed; real newlines are left alone.
# - Each "[[user:ID]]" reference becomes an "@username" mention. IDs that do
#   not map to an imported user fall back to the system user, so the lookup
#   always has a user id to work with.
#
# Returns a new string; the argument is not modified.
def normalize_raw(raw)
  return "<missing>" if raw.blank?

  text = raw.gsub('\n', "")
  text.gsub(/\[\[user:(\d+)\]\]/) do
    user_id = user_id_from_imported_user_id(Regexp.last_match(1)) || Discourse.system_user.id
    "@#{User.find(user_id).username}"
  end
end
|
|
|
|
# Tells whether a Permalink with the given url is already stored.
# Returns true or false, as a predicate should, without loading the record.
def permalink_exists?(url)
  Permalink.exists?(url: url)
end
|
|
|
|
# Reads "<table_name>.csv" from the export directory (@path) and yields each
# data row to the caller's block. The first line supplies the headers, which
# are converted to symbols; blank lines are skipped and a leading UTF-8 BOM
# is accepted.
def csv_parse(table_name)
  csv_file = File.join(@path, "#{table_name}.csv")
  options = { headers: true, header_converters: :symbol, skip_blanks: true, encoding: "bom|utf-8" }

  CSV.foreach(csv_file, **options) { |row| yield row }
end
|
|
end
|
|
|
|
# Script entry point: the first argument must be an existing directory that
# holds the CSV export. Otherwise print the usage text and exit with status 1.
import_dir = ARGV[0]

if import_dir.nil? || !Dir.exist?(import_dir)
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/yammer.rb DIRNAME", ""
  exit 1
end

ImportScripts::Yammer.new(import_dir).perform
|