discourse/script/import_scripts/sfn.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

324 lines
12 KiB
Ruby
Raw Permalink Normal View History

# frozen_string_literal: true
2015-04-15 00:16:42 +08:00
# custom importer for www.sfn.org, feel free to borrow ideas
2015-04-17 22:32:01 +08:00
require "csv"
require "mysql2"
2015-04-15 00:16:42 +08:00
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
class ImportScripts::Sfn < ImportScripts::Base
2015-04-17 22:32:01 +08:00
BATCH_SIZE = 100_000
MIN_CREATED_AT = "2003-11-01"
2015-04-15 00:16:42 +08:00
def execute
2015-04-17 22:32:01 +08:00
load_external_users
2015-04-15 00:16:42 +08:00
import_users
# import_categories
2015-04-15 00:16:42 +08:00
import_topics
import_posts
end
2015-04-17 22:32:01 +08:00
def load_external_users
puts "", "loading external users..."
@personify_id_to_contact_key = {}
contacts = mysql_query <<-SQL
SELECT ContactKey AS "contact_key",
PersonifyID AS "personify_id"
FROM Contact
SQL
contacts.each do |contact|
personify_id = contact["personify_id"].split(",").first
@personify_id_to_contact_key[personify_id] = contact["contact_key"]
end
@external_users = {}
CSV.foreach("/Users/zogstrip/Desktop/sfn.csv", col_sep: ";") do |row|
next if @personify_id_to_contact_key.exclude?(row[0])
2015-04-17 22:32:01 +08:00
id = @personify_id_to_contact_key[row[0]]
full_name = [row[1].strip, row[2].strip, row[3].strip].join(" ").strip
@external_users[id] = { email: row[4], full_name: full_name }
end
end
2015-04-15 00:16:42 +08:00
def import_users
puts "", "importing users..."
user_count = mysql_query <<-SQL
2015-04-17 22:32:01 +08:00
SELECT COUNT(ContactKey) AS "count" FROM Contact
2015-04-15 00:16:42 +08:00
SQL
user_count = user_count.first["count"]
batches(BATCH_SIZE) do |offset|
users = mysql_query <<-SQL
2015-04-17 22:32:01 +08:00
SELECT c.ContactKey AS "id",
2015-04-15 00:16:42 +08:00
c.Bio AS "bio",
2015-04-17 22:32:01 +08:00
c.ProfileImage AS "avatar",
es.EmailAddr_ AS "email",
es.FullName_ AS "full_name",
GREATEST('#{MIN_CREATED_AT}', COALESCE(cm.InvitedOn, '#{MIN_CREATED_AT}')) AS "created_at"
FROM Contact c
LEFT JOIN EgroupSubscription es ON es.ContactKey = c.ContactKey
LEFT JOIN CommunityMember cm ON cm.ContactKey = c.ContactKey
GROUP BY c.ContactKey
ORDER BY cm.InvitedOn
2015-04-15 00:16:42 +08:00
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if users.size < 1
next if all_records_exist? :users, users.map { |u| u["id"].to_i }
2015-04-15 00:16:42 +08:00
create_users(users, total: user_count, offset: offset) do |user|
2015-04-17 22:32:01 +08:00
external_user = @external_users[user["id"]]
email = user["email"].presence || external_user.try(:[], :email)
full_name = user["full_name"].presence || external_user.try(:[], :full_name)
bio = (user["bio"] || "")[0..250]
next if email.blank?
2015-04-15 00:16:42 +08:00
{
id: user["id"],
2015-04-17 22:32:01 +08:00
email: email,
name: full_name,
username: email.split("@")[0],
2015-04-17 22:32:01 +08:00
bio_raw: bio,
2015-04-15 00:16:42 +08:00
created_at: user["created_at"],
post_create_action:
proc do |newuser|
next if user["avatar"].blank?
2015-04-15 00:16:42 +08:00
avatar = Tempfile.new("sfn-avatar")
avatar.write(user["avatar"].encode("ASCII-8BIT").force_encoding("UTF-8"))
avatar.rewind
upload = UploadCreator.new(avatar, "avatar.jpg").create_for(newuser.id)
2015-04-15 00:16:42 +08:00
if upload.persisted?
newuser.create_user_avatar
newuser.user_avatar.update(custom_upload_id: upload.id)
newuser.update(uploaded_avatar_id: upload.id)
end
begin
2015-04-15 00:16:42 +08:00
avatar.try(:close!)
rescue StandardError
nil
end
end,
2015-04-15 00:16:42 +08:00
}
end
end
end
# NEW_CATEGORIES = [
# "Abstract Topic Matching Forum",
# "Animals in Research",
# "Brain Awareness & Teaching",
# "Career Advice",
# "Career Paths",
# "Diversity",
# "Early Career Policy Advocates",
# "LATP Associates",
# "LATP Fellows",
# "Mid & Advanced Career",
# "Neurobiology of Disease Workshop",
# "Neuronline Champions",
# "Neuroscience 2015",
# "Neuroscience Scholars Program",
# "NSP Associates",
# "NSP Fellows",
# "Outreach",
# "Postdocs & Early Career",
# "Program Committee",
# "Program Development",
# "Roommate Matching Forum",
# "Scientific Research",
# "Students",
# ]
2015-04-15 00:16:42 +08:00
# EgroupKey => New Category Name
CATEGORY_MAPPING = {
"{DE10E4F4-621A-48BF-9B45-05D9F774A590}" => 52, # "Abstract Topic Matching Forum",
"{3FFC1217-1576-4D38-BB81-D6CADC7FB793}" => 66, # "Animals in Research",
"{9362BB21-BF6C-4E55-A3E0-18CD5D9F3323}" => 67, # "Brain Awareness & Teaching",
"{3AC01B09-A21F-4166-95DA-0E585E271075}" => 67, # "Brain Awareness & Teaching",
"{C249728D-8C9E-4138-AA49-D02467C28EAD}" => 42, # "Career Advice",
"{01570B85-0124-478F-A8B9-B028BD1B1F2F}" => 43, # "Career Paths",
"{2A430528-278A-46CD-BE1A-07CFA1122919}" => 44, # "Diversity",
"{2F211345-3C19-43C9-90B5-27BA9FCD4DB0}" => 44, # "Diversity",
"{8092297D-8DF4-404A-8BEB-4D5D0DC6A191}" => 56, # "Early Career Policy Advocates",
"{8CB58762-D562-448C-9AF1-8DAE6C482C9B}" => 61, # "LATP Associates",
"{CDF80A92-925A-46DD-A867-8558FA72D016}" => 60, # "LATP Fellows",
"{E71E237B-7C23-4596-AECA-655BD8ED50DB}" => 51, # "Mid & Advanced Career",
"{1D674C38-17CB-4C48-826A-D465AC3F8948}" => 55, # "Neurobiology of Disease Workshop",
"{80C5835E-974E-4D44-BA01-C2C4F8BA91D7}" => 65, # "Neuronline Champions",
"{3D4F885B-0037-403B-83DD-62FAA8E81DF1}" => 54, # "Neuroscience 2015",
"{9ACC3B40-E4A3-4FFD-AADC-C8403EB6231D}" => 54, # "Neuroscience 2015",
"{9FC30FFB-E450-4361-8844-0266C3D96868}" => 57, # "Neuroscience Scholars Program",
"{3E78123E-87CE-435E-B4B7-7DAB1A21C541}" => 59, # "NSP Associates",
"{12D889D3-5CFD-49D5-93E4-32AAB2CFFCDA}" => 58, # "NSP Fellows",
"{FA86D79E-170E-4F53-8F1C-942CB3FFB19E}" => 45, # "Outreach",
"{D7041C64-3D32-4010-B3D8-71858323CB4A}" => 45, # "Outreach",
"{69B76913-4E23-4C80-A11E-9CDB4130722E}" => 45, # "Outreach",
"{774878EA-96AD-49F5-9D29-105AEA488007}" => 45, # "Outreach",
"{E6349704-FD01-41B1-9C59-68E928DD4318}" => 50, # "Postdocs & Early Career",
"{31CF5944-2567-4E79-9730-18EEC23E5B52}" => 50, # "Postdocs & Early Career",
"{5625C403-AFAE-4323-A470-33FC32B12B53}" => 62, # "Program Committee",
"{8415D871-54F5-4128-B099-E5A376A6B41B}" => 47, # "Program Development",
"{B4DF2044-47AB-4329-8BF7-0D832CAB402C}" => 53, # "Roommate Matching Forum",
"{6A3A12B9-5C72-472F-97AC-F34983674960}" => 48, # "Scientific Research",
"{2CF635E9-4866-451C-A4F2-E2A8A80FED54}" => 48, # "Scientific Research",
"{CF2DDCCE-737F-499D-AFE4-E5C36F195C8B}" => 48, # "Scientific Research",
"{282B48D7-AC1D-453E-9806-3C6CE6830EF9}" => 48, # "Scientific Research",
"{6D750CAF-E96F-4AD1-A45B-7B74FDFF0B40}" => 48, # "Scientific Research",
"{10AF5D45-BEB3-4F07-BE77-0BAB6910DE10}" => 48, # "Scientific Research",
"{18D7F624-26D1-44B9-BF33-AB5C5A2AB2BF}" => 48, # "Scientific Research",
"{6016FF4F-D834-4888-BA03-F9FE8CB1D4CC}" => 48, # "Scientific Research",
"{B0290A37-EA39-4CB8-B6CB-3E0B7EF6D036}" => 48, # "Scientific Research",
"{97CC60D0-B93A-43FF-BB48-366FAAEE2BAC}" => 48, # "Scientific Research",
"{8FC9B57B-2755-4FC5-90E8-CCDB56CF2F66}" => 48, # "Scientific Research",
"{57C8BF37-357E-4FE6-952D-906248642792}" => 48, # "Scientific Research",
"{7B2A3B63-BC2C-4219-830C-BA1DECB33337}" => 48, # "Scientific Research",
"{0ED1D205-0E48-48D2-B82B-3CE80C6C553F}" => 48, # "Scientific Research",
"{10355962-D172-4294-AA8E-1BC381B67971}" => 48, # "Scientific Research",
"{C84B0222-5232-4B94-9FB8-DDF802241171}" => 48, # "Scientific Research",
"{9143F984-0D67-46CB-AAAF-7FE3B6335E07}" => 48, # "Scientific Research",
"{1392DC10-37A0-46A6-9979-4568D0224C5F}" => 48, # "Scientific Research",
"{E4891409-0F4F-4151-B550-ECE53655E231}" => 48, # "Scientific Research",
"{9613BAC2-229B-4563-9E1C-35C31CDDCE2F}" => 49, # "Students",
2015-04-15 00:16:42 +08:00
}
def import_categories
puts "", "importing categories..."
2015-04-15 00:16:42 +08:00
create_categories(NEW_CATEGORIES) { |category| { id: category, name: category } }
end
2015-04-15 00:16:42 +08:00
def import_topics
puts "", "importing topics..."
topic_count = mysql_query <<-SQL
SELECT COUNT(MessageID_) AS "count"
FROM EgroupMessages
WHERE ParentId_ = 0
AND ApprovedRejectedPendingInd = "Approved"
2015-04-17 22:32:01 +08:00
AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
2015-04-15 00:16:42 +08:00
SQL
topic_count = topic_count.first["count"]
batches(BATCH_SIZE) do |offset|
topics = mysql_query <<-SQL
SELECT MessageID_ AS "id",
EgroupKey AS "category_id",
ContactKey AS "user_id",
HdrSubject_ AS "title",
Body_ AS "raw",
CreatStamp_ AS "created_at"
FROM EgroupMessages
WHERE ParentId_ = 0
AND ApprovedRejectedPendingInd = "Approved"
2015-04-17 22:32:01 +08:00
AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
ORDER BY CreatStamp_
2015-04-15 00:16:42 +08:00
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if topics.size < 1
next if all_records_exist? :posts, topics.map { |t| t["id"].to_i }
2015-04-15 00:16:42 +08:00
create_posts(topics, total: topic_count, offset: offset) do |topic|
next unless category_id = CATEGORY_MAPPING[topic["category_id"]]
2015-04-17 22:32:01 +08:00
title = topic["title"][0..250]
raw = cleanup_raw(topic["raw"])
next if raw.blank?
2015-04-15 00:16:42 +08:00
{
id: topic["id"],
category: category_id,
2015-04-15 00:16:42 +08:00
user_id: user_id_from_imported_user_id(topic["user_id"]) || Discourse::SYSTEM_USER_ID,
2015-04-17 22:32:01 +08:00
title: title,
raw: raw,
2015-04-15 00:16:42 +08:00
created_at: topic["created_at"],
}
end
end
end
def import_posts
puts "", "importing posts..."
posts_count = mysql_query <<-SQL
SELECT COUNT(MessageID_) AS "count"
FROM EgroupMessages
WHERE ParentId_ > 0
AND ApprovedRejectedPendingInd = "Approved"
2015-04-17 22:32:01 +08:00
AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
2015-04-15 00:16:42 +08:00
SQL
posts_count = posts_count.first["count"]
batches(BATCH_SIZE) do |offset|
posts = mysql_query <<-SQL
SELECT MessageID_ AS "id",
ContactKey AS "user_id",
ParentID_ AS "topic_id",
Body_ AS "raw",
CreatStamp_ AS "created_at"
FROM EgroupMessages
WHERE ParentId_ > 0
AND ApprovedRejectedPendingInd = "Approved"
2015-04-17 22:32:01 +08:00
AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
ORDER BY CreatStamp_
2015-04-15 00:16:42 +08:00
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL
break if posts.size < 1
next if all_records_exist? :posts, posts.map { |p| p["id"].to_i }
2015-04-15 00:16:42 +08:00
create_posts(posts, total: posts_count, offset: offset) do |post|
next unless parent = topic_lookup_from_imported_post_id(post["topic_id"])
2015-04-17 22:32:01 +08:00
raw = cleanup_raw(post["raw"])
next if raw.blank?
2015-04-15 00:16:42 +08:00
{
id: post["id"],
topic_id: parent[:topic_id],
user_id: user_id_from_imported_user_id(post["user_id"]) || Discourse::SYSTEM_USER_ID,
raw: cleanup_raw(post["raw"]),
created_at: post["created_at"],
}
end
end
end
def cleanup_raw(raw)
# fix some html
raw.gsub!(%r{<br\s*/?>}i, "\n")
# remove "This message has been cross posted to the following eGroups: ..."
raw.gsub!(/^This message has been cross posted to the following eGroups: .+\n-{3,}/i, "")
# remove signatures
raw.gsub!(/-{3,}.+/m, "")
# strip leading/trailing whitespaces
raw.strip
end
def mysql_query(sql)
@client ||= Mysql2::Client.new(username: "root", database: "sfn")
@client.query(sql)
end
end
ImportScripts::Sfn.new.perform