DEV: Improvements to Drupal script (#10016)

Refactors the script to follow the conventions of other importers and adds some features, including importing likes, processing of raw post text, and, if needed, importing SSO records.
This commit is contained in:
Justin DiRose 2020-06-10 10:59:17 -05:00 committed by GitHub
parent 3a7ca97c36
commit be28fc73a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,16 +1,21 @@
# frozen_string_literal: true # frozen_string_literal: true
require "mysql2" require "mysql2"
require "htmlentities"
require File.expand_path(File.dirname(__FILE__) + "/base.rb") require File.expand_path(File.dirname(__FILE__) + "/base.rb")
class ImportScripts::Drupal < ImportScripts::Base class ImportScripts::Drupal < ImportScripts::Base
DRUPAL_DB = ENV['DRUPAL_DB'] || "newsite3" DRUPAL_DB = ENV['DRUPAL_DB'] || "drupal"
VID = ENV['DRUPAL_VID'] || 1 VID = ENV['DRUPAL_VID'] || 1
BATCH_SIZE = 1000
ATTACHMENT_DIR = "/root/files/upload"
def initialize def initialize
super super
@htmlentities = HTMLEntities.new
@client = Mysql2::Client.new( @client = Mysql2::Client.new(
host: "localhost", host: "localhost",
username: "root", username: "root",
@ -19,142 +24,210 @@ class ImportScripts::Drupal < ImportScripts::Base
) )
end end
def categories_query
@client.query("SELECT tid, name, description FROM taxonomy_term_data WHERE vid = #{VID}")
end
def execute def execute
create_users(@client.query("SELECT uid id, name, mail email, created FROM users;")) do |row|
{ id: row['id'], username: row['name'], email: row['email'], created_at: Time.zone.at(row['created']) }
end
# You'll need to edit the following query for your Drupal install: import_users
# import_categories
# * Drupal allows duplicate category names, so you may need to exclude some categories or rename them here.
# * Table name may be term_data.
# * May need to select a vid other than 1.
create_categories(categories_query) do |c|
{ id: c['tid'], name: c['name'], description: c['description'] }
end
# "Nodes" in Drupal are divided into types. Here we import two types, # "Nodes" in Drupal are divided into types. Here we import two types,
# and will later import all the comments/replies for each node. # and will later import all the comments/replies for each node.
# You will need to figure out what the type names are on your install and edit the queries to match. # You will need to figure out what the type names are on your install and edit the queries to match.
if ENV['DRUPAL_IMPORT_BLOG'] if ENV['DRUPAL_IMPORT_BLOG']
create_blog_topics import_blog_topics
end end
create_forum_topics import_forum_topics
create_replies import_replies
import_likes
mark_topics_as_solved
import_sso_records
import_attachments
postprocess_posts
create_permalinks
import_gravatars
end
begin def import_users
create_admin(email: 'neil.lalonde@discourse.org', username: UserNameSuggester.suggest('neil')) puts "", "importing users"
rescue => e
puts '', "Failed to create admin user" user_count = mysql_query("SELECT count(uid) count FROM users").first["count"]
puts e.message
last_user_id = -1
batches(BATCH_SIZE) do |offset|
users = mysql_query(<<-SQL
SELECT uid,
name username,
mail email,
created
FROM users
WHERE uid > #{last_user_id}
ORDER BY uid
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if users.empty?
last_user_id = users[-1]["uid"]
users.reject! { |u| @lookup.user_already_imported?(u["uid"]) }
create_users(users, total: user_count, offset: offset) do |user|
email = user["email"].presence || fake_email
email = fake_email unless email[EmailValidator.email_regex]
username = @htmlentities.decode(user["username"]).strip
{
id: user["uid"],
name: username,
email: email,
created_at: Time.zone.at(user["created"])
}
end
end end
end end
def create_blog_topics def import_categories
puts '', "creating blog topics" # You'll need to edit the following query for your Drupal install:
#
# * Drupal allows duplicate category names, so you may need to exclude some categories or rename them here.
# * Table name may be term_data.
# * May need to select a vid other than 1
create_category({ puts "", "importing categories"
name: 'Blog',
user_id: -1,
description: "Articles from the blog"
}, nil) unless Category.find_by_name('Blog')
results = @client.query(" categories = mysql_query(<<-SQL
SELECT n.nid nid, n.title title, n.uid uid, n.created created, n.sticky sticky, SELECT tid,
f.body_value body name,
FROM node n, description
field_data_body f FROM taxonomy_term_data
WHERE n.type = 'blog' WHERE vid = #{VID}
AND n.nid = f.entity_id SQL
AND n.status = 1 ).to_a
", cache_rows: false)
create_posts(results) do |row| create_categories(categories) do |category|
{ {
id: "nid:#{row['nid']}", id: category['tid'],
user_id: user_id_from_imported_user_id(row['uid']) || -1, name: @htmlentities.decode(category['name']).strip,
category: 'Blog', description: @htmlentities.decode(category['description']).strip
raw: row['body'],
created_at: Time.zone.at(row['created']),
pinned_at: row['sticky'].to_i == 1 ? Time.zone.at(row['created']) : nil,
title: row['title'].try(:strip),
custom_fields: { import_id: "nid:#{row['nid']}" }
} }
end end
end end
def create_forum_topics def import_blog_topics
puts '', "creating forum topics" puts '', "importing blog topics"
total_count = @client.query(" create_category(
{
name: 'Blog',
description: "Articles from the blog"
},
nil) unless Category.find_by_name('Blog')
blogs = mysql_query(<<-SQL
SELECT n.nid nid, n.title title, n.uid uid, n.created created, n.sticky sticky,
f.body_value body
FROM node n,
field_data_body f
WHERE n.type = 'article'
AND n.nid = f.entity_id
AND n.status = 1
SQL
).to_a
category_id = Category.find_by_name('Blog').id
create_posts(blogs) do |topic|
{
id: "nid:#{topic['nid']}",
user_id: user_id_from_imported_user_id(topic['uid']) || -1,
category: category_id,
raw: topic['body'],
created_at: Time.zone.at(topic['created']),
pinned_at: topic['sticky'].to_i == 1 ? Time.zone.at(topic['created']) : nil,
title: topic['title'].try(:strip),
custom_fields: { import_id: "nid:#{topic['nid']}" }
}
end
end
def import_forum_topics
puts '', "importing forum topics"
total_count = mysql_query(<<-SQL
SELECT COUNT(*) count SELECT COUNT(*) count
FROM forum_index fi, node n FROM forum_index fi, node n
WHERE n.type = 'forum' WHERE n.type = 'forum'
AND fi.nid = n.nid AND fi.nid = n.nid
AND n.status = 1;").first['count'] AND n.status = 1
SQL
).first['count']
batch_size = 1000 batches(BATCH_SIZE) do |offset|
results = mysql_query(<<-SQL
batches(batch_size) do |offset|
results = @client.query("
SELECT fi.nid nid, SELECT fi.nid nid,
fi.title title, fi.title title,
fi.tid tid, fi.tid tid,
n.uid uid, n.uid uid,
fi.created created, fi.created created,
fi.sticky sticky, fi.sticky sticky,
f.body_value body f.body_value body,
FROM forum_index fi, nc.totalcount views,
node n, fl.timestamp solved
field_data_body f FROM forum_index fi
LEFT JOIN node n ON fi.nid = n.nid
LEFT JOIN field_data_body f ON f.entity_id = n.nid
LEFT JOIN flagging fl ON fl.entity_id = n.nid
AND fl.fid = 7
LEFT JOIN node_counter nc ON nc.nid = n.nid
WHERE n.type = 'forum' WHERE n.type = 'forum'
AND fi.nid = n.nid
AND n.nid = f.entity_id
AND n.status = 1 AND n.status = 1
LIMIT #{batch_size} LIMIT #{BATCH_SIZE}
OFFSET #{offset}; OFFSET #{offset};
", cache_rows: false) SQL
).to_a
break if results.size < 1 break if results.size < 1
next if all_records_exist? :posts, results.map { |p| "nid:#{p['nid']}" } next if all_records_exist? :posts, results.map { |p| "nid:#{p['nid']}" }
create_posts(results, total: total_count, offset: offset) do |row| create_posts(results, total: total_count, offset: offset) do |row|
{ raw = preprocess_raw(row['body'])
topic = {
id: "nid:#{row['nid']}", id: "nid:#{row['nid']}",
user_id: user_id_from_imported_user_id(row['uid']) || -1, user_id: user_id_from_imported_user_id(row['uid']) || -1,
category: category_id_from_imported_category_id(row['tid']), category: category_id_from_imported_category_id(row['tid']),
raw: row['body'], raw: raw,
created_at: Time.zone.at(row['created']), created_at: Time.zone.at(row['created']),
pinned_at: row['sticky'].to_i == 1 ? Time.zone.at(row['created']) : nil, pinned_at: row['sticky'].to_i == 1 ? Time.zone.at(row['created']) : nil,
title: row['title'].try(:strip) title: row['title'].try(:strip),
views: row['views']
} }
topic[:custom_fields] = { import_solved: true } if row['solved'].present?
topic
end end
end end
end end
def create_replies def import_replies
puts '', "creating replies in topics" puts '', "creating replies in topics"
total_count = @client.query(" total_count = mysql_query(<<-SQL
SELECT COUNT(*) count SELECT COUNT(*) count
FROM comment c, FROM comment c,
node n node n
WHERE n.nid = c.nid WHERE n.nid = c.nid
AND c.status = 1 AND c.status = 1
AND n.type IN ('blog', 'forum') AND n.type IN ('article', 'forum')
AND n.status = 1;").first['count'] AND n.status = 1
SQL
).first['count']
batch_size = 1000 batches(BATCH_SIZE) do |offset|
results = mysql_query(<<-SQL
batches(batch_size) do |offset|
results = @client.query("
SELECT c.cid, c.pid, c.nid, c.uid, c.created, SELECT c.cid, c.pid, c.nid, c.uid, c.created,
f.comment_body_value body f.comment_body_value body
FROM comment c, FROM comment c,
@ -165,9 +238,10 @@ class ImportScripts::Drupal < ImportScripts::Base
AND c.status = 1 AND c.status = 1
AND n.type IN ('blog', 'forum') AND n.type IN ('blog', 'forum')
AND n.status = 1 AND n.status = 1
LIMIT #{batch_size} LIMIT #{BATCH_SIZE}
OFFSET #{offset}; OFFSET #{offset}
", cache_rows: false) SQL
).to_a
break if results.size < 1 break if results.size < 1
@ -176,11 +250,12 @@ class ImportScripts::Drupal < ImportScripts::Base
create_posts(results, total: total_count, offset: offset) do |row| create_posts(results, total: total_count, offset: offset) do |row|
topic_mapping = topic_lookup_from_imported_post_id("nid:#{row['nid']}") topic_mapping = topic_lookup_from_imported_post_id("nid:#{row['nid']}")
if topic_mapping && topic_id = topic_mapping[:topic_id] if topic_mapping && topic_id = topic_mapping[:topic_id]
raw = preprocess_raw(row['body'])
h = { h = {
id: "cid:#{row['cid']}", id: "cid:#{row['cid']}",
topic_id: topic_id, topic_id: topic_id,
user_id: user_id_from_imported_user_id(row['uid']) || -1, user_id: user_id_from_imported_user_id(row['uid']) || -1,
raw: row['body'], raw: raw,
created_at: Time.zone.at(row['created']), created_at: Time.zone.at(row['created']),
} }
if row['pid'] if row['pid']
@ -196,6 +271,265 @@ class ImportScripts::Drupal < ImportScripts::Base
end end
end end
# Imports rows of the Drupal `flagging` table with fid 5 or 6 as Discourse likes.
#
# Rows with fid 5 are resolved through an "nid:<entity_id>" import id, all
# other rows through "cid:<entity_id>". Rows whose user or post cannot be
# resolved to an imported record are skipped.
#
# NOTE(review): the flag ids 5 and 6 are hard-coded; presumably they are the
# "like node" / "like comment" flags of one particular install — confirm
# against the `flag` table of the site being imported.
def import_likes
  puts "", "importing post likes"

  batches(BATCH_SIZE) do |offset|
    # ORDER BY makes LIMIT/OFFSET paging deterministic; without it MySQL may
    # return rows in a different order per query, repeating or skipping likes.
    likes = mysql_query(<<-SQL
      SELECT flagging_id,
             fid,
             entity_id,
             uid
        FROM flagging
       WHERE fid = 5
          OR fid = 6
    ORDER BY flagging_id
       LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL
    ).to_a

    break if likes.empty?

    likes.each do |like|
      identifier = like['fid'] == 5 ? 'nid' : 'cid'
      next unless user_id = user_id_from_imported_user_id(like['uid'])
      next unless post_id = post_id_from_imported_post_id("#{identifier}:#{like['entity_id']}")
      next unless user = User.find_by(id: user_id)
      next unless post = Post.find_by(id: post_id)

      begin
        PostActionCreator.like(user, post)
      rescue => e
        # Still best-effort (one bad like must not stop the import), but say so.
        puts "", "Failed to like post #{post.id} as user #{user.id}: #{e.message}"
      end
    end
  end
end
# Flags every topic carrying the "import_solved" custom field as solved by
# writing the "is_accepted_answer" post custom field and the
# "accepted_answer_post_id" topic custom field.
#
# NOTE(review): the accepted answer is always `topic.posts.last`; the Drupal
# flag only records that the topic is solved, not which reply solved it.
def mark_topics_as_solved
  puts "", "marking topics as solved"

  solved_topics = TopicCustomField.where(name: "import_solved").where(value: true).pluck(:topic_id)

  solved_topics.each do |topic_id|
    # find_by returns nil for a missing topic so the guard can actually skip
    # it; Topic.find would raise ActiveRecord::RecordNotFound instead.
    next unless topic = Topic.find_by(id: topic_id)
    next unless post = topic.posts.last

    post_id = post.id
    PostCustomField.create!(post_id: post_id, name: "is_accepted_answer", value: true)
    TopicCustomField.create!(topic_id: topic_id, name: "accepted_answer_post_id", value: post_id)
  end
end
# Creates a SingleSignOnRecord for every user that has an "import_id" user
# custom field, using the stored import id as the external id and the user's
# current email as the external email.
#
# Creation is best-effort: a record that cannot be created is reported and
# skipped so the remaining users are still processed.
def import_sso_records
  puts "", "importing sso records"

  start_time = Time.now
  current_count = 0

  import_ids = UserCustomField.where(name: "import_id")
  total_count = import_ids.count

  return if total_count.zero?

  # find_each walks the table in batches instead of loading every row at once.
  import_ids.find_each do |field|
    # find_by returns nil for a missing user; User.find would raise and abort.
    next unless user = User.find_by(id: field.user_id)

    current_count += 1
    print_status(current_count, total_count, start_time)

    begin
      SingleSignOnRecord.create!(user_id: user.id, external_id: field.value, external_email: user.email, last_payload: '')
    rescue => e
      puts "", "Skipped SSO record for user #{user.id}: #{e.message}"
    end
  end
end
# Appends the files listed in `field_data_field_post_attachment` (joined with
# `file_managed`) to the imported posts they belong to.
#
# Rows with entity_type "comment" are resolved through a "cid:" import id, all
# others through "nid:". Rows whose user or post was not imported are skipped.
# A post is only revised when its raw text does not already contain the upload
# markup, which keeps re-runs from appending the same attachment twice.
def import_attachments
  puts "", "importing attachments"

  current_count = 0
  success_count = 0
  fail_count = 0

  total_count = mysql_query(<<-SQL
    SELECT count(field_post_attachment_fid) count
      FROM field_data_field_post_attachment
  SQL
  ).first["count"]

  batches(BATCH_SIZE) do |offset|
    attachments = mysql_query(<<-SQL
      SELECT *
        FROM field_data_field_post_attachment fp
   LEFT JOIN file_managed fm
          ON fp.field_post_attachment_fid = fm.fid
       LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL
    ).to_a

    break if attachments.empty?

    attachments.each do |attachment|
      current_count += 1
      print_status current_count, total_count

      identifier = attachment['entity_type'] == "comment" ? "cid" : "nid"
      next unless user_id = user_id_from_imported_user_id(attachment['uid'])
      next unless post_id = post_id_from_imported_post_id("#{identifier}:#{attachment['entity_id']}")
      # Existence checks that return false/nil instead of raising, so a stale
      # mapping skips one attachment rather than aborting the whole import.
      next unless User.exists?(id: user_id)
      next unless post = Post.find_by(id: post_id)

      begin
        upload, filename = find_upload(post, attachment)
        unless upload
          fail_count += 1
          next
        end

        upload_html = html_for_upload(upload, filename)

        if post.raw.include?(upload_html)
          puts '', 'Skipped upload: already imported'
        else
          new_raw = "#{post.raw}\n\n#{upload_html}"
          PostRevisor.new(post).revise!(post.user, { raw: new_raw }, bypass_bump: true, edit_reason: "Import attachment from Drupal")
        end

        success_count += 1
      rescue => e
        fail_count += 1
        puts e
      end
    end
  end

  # The counters were previously tracked but never shown.
  puts "", "Attachments imported: #{success_count}, failed: #{fail_count}"
end
# Creates a "/topic/<node id>" permalink for every listable topic whose
# "import_id" custom field contains an "nid:<digits>" reference.
#
# Failures are reported per topic and do not stop the loop.
def create_permalinks
  puts '', 'creating permalinks...'

  Topic.listable_topics.find_each do |topic|
    begin
      tcf = topic.custom_fields
      import_id = tcf && tcf['import_id']
      next unless import_id

      node_id = import_id[/nid:(\d+)/, 1]
      # Without a node id the url would be the bare "/topic/" for every such
      # topic, so skip instead of creating a meaningless permalink.
      next unless node_id

      Permalink.create(url: "/topic/#{node_id}", topic_id: topic.id)
    rescue => e
      puts e.message
      puts "Permalink creation failed for id #{topic.id}"
    end
  end
end
# Locates the file behind a Drupal attachment row under ATTACHMENT_DIR and
# turns it into an upload owned by the post's author (or user -1).
#
# post       - the post the attachment belongs to; only its user is read.
# attachment - row hash; reads 'uri' (expected "public://upload/<path>") and
#              'filename' (used for log output only).
#
# Returns [upload, real_filename] on success and nil when the uri has an
# unexpected form, the file is missing on disk, or the upload is invalid.
# Names of missing files are appended to /tmp/attachments_failed.txt.
def find_upload(post, attachment)
  uri = attachment['uri'].to_s[/public:\/\/upload\/(.+)/, 1]

  # A uri outside public://upload/ used to crash on nil further down.
  unless uri
    puts "Attachment #{attachment['filename']} has an unsupported uri: #{attachment['uri'].inspect}"
    return
  end

  real_filename = CGI.unescapeHTML(uri)
  file = File.join(ATTACHMENT_DIR, real_filename)

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  unless File.exist?(file)
    puts "Attachment file #{attachment['filename']} doesn't exist"

    File.open(File.join('/tmp/', "attachments_failed.txt"), 'a') do |f|
      f.puts attachment['filename']
    end

    # Nothing to upload; previously execution fell through to create_upload.
    return
  end

  # Safe navigation so the -1 fallback also applies when the post has no user.
  upload = create_upload(post.user&.id || -1, file, real_filename)

  if upload.nil? || upload.errors.any?
    puts "Upload not valid"
    puts upload.errors.inspect if upload
    return
  end

  [upload, real_filename]
end
# Converts Drupal BBCode quotes in a post body to Markdown / Discourse quotes.
#
# * [quote]text[/quote] becomes a Markdown blockquote: the content is put on
#   its own lines and every non-empty line is prefixed with "> ".
# * [quote=name]text[/quote] becomes a [quote="name"] block on its own lines.
#
# raw - the body text, or nil.
#
# Returns a new, stripped String, or nil when raw is blank. The argument is
# never modified, so frozen strings are accepted.
def preprocess_raw(raw)
  return if raw.blank?

  # Plain quotes. The non-bang gsub always returns a String; the former inner
  # gsub! returned nil when no line needed a prefix, which erased the quote.
  processed = raw.gsub(/\[quote\](.+?)\[\/quote\]/im) do
    quoted = "\n#{Regexp.last_match(1)}\n"
    quoted.gsub(/\n(.+?)/) { "\n> #{Regexp.last_match(1)}" }
  end

  # [QUOTE=<username>]...[/QUOTE]
  processed = processed.gsub(/\[quote=([^;\]]+)\](.+?)\[\/quote\]/im) do
    username, quote = Regexp.last_match(1), Regexp.last_match(2)
    "\n[quote=\"#{username}\"]\n#{quote}\n[/quote]\n"
  end

  processed.strip
end
# Rewrites links to the old Drupal site inside every post so that they point
# at the imported topics and replies.
#
# * https://site.com/forum/topic/<nid>       -> https://community.site.com/t/-/<topic id>
# * https://site.com/comment/<cid>#comment-N -> https://community.site.com/t/-/<topic id>/<post number>
#
# Both host names are placeholders that have to be edited for the site being
# imported. A link whose target was not imported is left untouched. Posts are
# only saved when their raw text changed; a failure is reported per post.
def postprocess_posts
  puts '', 'postprocessing posts'

  current = 0
  max = Post.count

  Post.find_each do |post|
    begin
      raw = post.raw
      new_raw = raw.dup

      # replace old topic to new topic links
      new_raw.gsub!(/https:\/\/site\.com\/forum\/topic\/(\d+)/im) do |link|
        node_id = Regexp.last_match(1)
        post_id = post_id_from_imported_post_id("nid:#{node_id}")
        target = post_id && Post.find_by(id: post_id)
        # Returning the matched text keeps the link; a bare `next` would make
        # gsub! substitute an empty string and delete it from the post.
        next link unless target
        "https://community.site.com/t/-/#{target.topic_id}"
      end

      # replace old comment to reply links
      new_raw.gsub!(/https:\/\/site\.com\/comment\/(\d+)#comment-\d+/im) do |link|
        comment_id = Regexp.last_match(1)
        post_id = post_id_from_imported_post_id("cid:#{comment_id}")
        target = post_id && Post.find_by(id: post_id)
        next link unless target
        "https://community.site.com/t/-/#{target.topic_id}/#{target.post_number}"
      end

      if raw != new_raw
        post.raw = new_raw
        post.save
      end
    rescue => e
      puts '', "Failed rewrite on post: #{post.id} (#{e.message})"
    ensure
      print_status(current += 1, max)
    end
  end
end
# Makes sure every user has a user_avatar record and refreshes its gravatar.
#
# Failures are reported per user and do not stop the loop; progress is
# printed after every user whether or not the update succeeded.
def import_gravatars
  puts '', 'importing gravatars'

  current = 0
  max = User.count

  User.find_each do |user|
    begin
      user.create_user_avatar(user_id: user.id) unless user.user_avatar
      user.user_avatar.update_gravatar!
    rescue
      # Double quotes are required for interpolation; the single-quoted
      # original printed the literal text "#{user.id}".
      puts '', "Failed avatar update on user #{user.id}"
    ensure
      print_status(current += 1, max)
    end
  end
end
# Converts a Unix timestamp (seconds since the epoch) to a DateTime.
#
# time - the timestamp as a String of digits or as an Integer.
#
# Returns a DateTime. DateTime.strptime only accepts strings, so the value is
# converted with to_s first; an Integer used to raise TypeError.
def parse_datetime(time)
  DateTime.strptime(time.to_s, '%s')
end
# Runs a SQL statement on the importer's MySQL connection (@client) with the
# cache_rows option set to true and returns whatever the client returns.
#
# sql - the SQL statement as a String.
def mysql_query(sql)
  query_options = { cache_rows: true }
  @client.query(sql, **query_options)
end
end end
if __FILE__ == $0 if __FILE__ == $0