mirror of
https://github.com/discourse/discourse.git
synced 2025-01-20 03:32:44 +08:00
d1e844841d
This bug is actually a Drupal issue where some edited posts have their `created` and `changed` timestamps set to the same value. But even when that happens in Drupal it still maintains the correct post order in an affected thread. This PR makes the Discourse importer also maintain the original Drupal comment order by sorting comments in the source DB by their `cid`, which is sequential and never changes. More details from this post onward: https://meta.discourse.org/t/large-drupal-forum-migration-importer-errors-and-limitations/246939/24?u=rahim123
532 lines
14 KiB
Ruby
532 lines
14 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "mysql2"
|
|
require "htmlentities"
|
|
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
|
|
|
class ImportScripts::Drupal < ImportScripts::Base
|
|
# Name of the source Drupal MySQL database; override with the DRUPAL_DB env var.
DRUPAL_DB = ENV.fetch("DRUPAL_DB", "drupal")
# Vocabulary id used to select the taxonomy terms imported as categories;
# override with the DRUPAL_VID env var.
VID = ENV.fetch("DRUPAL_VID", 1)
# Number of rows requested per batched query.
BATCH_SIZE = 1_000
# Local directory the attachment files are read from.
ATTACHMENT_DIR = "/root/files/upload"
|
|
|
|
# Runs the base-class initializer, then creates the HTML entity decoder and
# the MySQL connection to the Drupal database named by DRUPAL_DB.
def initialize
  super

  connection_settings = {
    host: "localhost",
    username: "root",
    #password: "password",
    database: DRUPAL_DB,
  }

  @htmlentities = HTMLEntities.new
  @client = Mysql2::Client.new(connection_settings)
end
|
|
|
|
# Runs every import step, in order.
#
# "Nodes" in Drupal are divided into types. Two types are imported here
# (blog articles, only when DRUPAL_IMPORT_BLOG is set, and forum topics);
# the comments/replies for each node are imported afterwards.
# You will need to figure out what the type names are on your install and
# edit the queries to match.
def execute
  steps = %i[import_users import_categories]
  steps << :import_blog_topics if ENV["DRUPAL_IMPORT_BLOG"]
  steps.concat(
    %i[
      import_forum_topics
      import_replies
      import_likes
      mark_topics_as_solved
      import_sso_records
      import_attachments
      postprocess_posts
      create_permalinks
      import_gravatars
    ],
  )

  steps.each { |step| send(step) }
end
|
|
|
|
# Imports the rows of the Drupal `users` table as Discourse users.
# Each batch selects the next BATCH_SIZE rows whose uid is greater than the
# last uid seen; rows the lookup reports as already imported are skipped.
def import_users
  puts "", "importing users"

  total = mysql_query("SELECT count(uid) count FROM users").first["count"]
  previous_uid = -1

  batches(BATCH_SIZE) do |offset|
    rows = mysql_query(<<~SQL).to_a
      SELECT uid,
      name username,
      mail email,
      created
      FROM users
      WHERE uid > #{previous_uid}
      ORDER BY uid
      LIMIT #{BATCH_SIZE}
    SQL

    break if rows.empty?

    # Remember where this batch ended before filtering rows out of it.
    previous_uid = rows.last["uid"]
    pending = rows.reject { |row| @lookup.user_already_imported?(row["uid"]) }

    create_users(pending, total: total, offset: offset) do |row|
      # Fall back to a generated address when the email is missing or invalid.
      address = row["email"].presence || fake_email
      address = fake_email unless EmailAddressValidator.valid_value?(address)

      {
        id: row["uid"],
        name: @htmlentities.decode(row["username"]).strip,
        email: address,
        created_at: Time.zone.at(row["created"]),
      }
    end
  end
end
|
|
|
|
# Imports the taxonomy terms of vocabulary VID as Discourse categories.
#
# You'll need to edit the query below for your Drupal install:
#
# * Drupal allows duplicate category names, so you may need to exclude some categories or rename them here.
# * Table name may be term_data.
# * May need to select a vid other than 1
def import_categories
  puts "", "importing categories"

  terms = mysql_query(<<~SQL).to_a
    SELECT tid,
    name,
    description
    FROM taxonomy_term_data
    WHERE vid = #{VID}
  SQL

  # Decodes HTML entities and trims surrounding whitespace.
  clean = ->(text) { @htmlentities.decode(text).strip }

  create_categories(terms) do |term|
    { id: term["tid"], name: clean.call(term["name"]), description: clean.call(term["description"]) }
  end
end
|
|
|
|
# Imports published nodes of type 'article' as topics in a "Blog" category,
# creating that category first when it does not exist yet.
def import_blog_topics
  puts "", "importing blog topics"

  if Category.find_by_name("Blog").nil?
    create_category({ name: "Blog", description: "Articles from the blog" }, nil)
  end

  articles = mysql_query(<<~SQL).to_a
    SELECT n.nid nid, n.title title, n.uid uid, n.created created, n.sticky sticky,
    f.body_value body
    FROM node n,
    field_data_body f
    WHERE n.type = 'article'
    AND n.nid = f.entity_id
    AND n.status = 1
  SQL

  blog_category_id = Category.find_by_name("Blog").id

  create_posts(articles) do |article|
    import_key = "nid:#{article["nid"]}"
    created_at = Time.zone.at(article["created"])
    sticky = article["sticky"].to_i == 1

    {
      id: import_key,
      # -1 is used when the author was not imported.
      user_id: user_id_from_imported_user_id(article["uid"]) || -1,
      category: blog_category_id,
      raw: article["body"],
      created_at: created_at,
      # Sticky nodes are pinned as of their creation time.
      pinned_at: sticky ? created_at : nil,
      title: article["title"].try(:strip),
      custom_fields: {
        import_id: import_key,
      },
    }
  end
end
|
|
|
|
# Imports published nodes of type 'forum' as Discourse topics, in batches of
# BATCH_SIZE rows.
#
# The paged query is ordered by node id: LIMIT/OFFSET paging without an
# ORDER BY gives no guarantee that consecutive batches neither skip nor
# repeat rows.
#
# Topics whose row carries a flag timestamp (`flagging.fid = 7`) get the
# `import_solved` custom field, which mark_topics_as_solved reads later.
def import_forum_topics
  puts "", "importing forum topics"

  total_count = mysql_query(<<~SQL).first["count"]
    SELECT COUNT(*) count
    FROM forum_index fi, node n
    WHERE n.type = 'forum'
    AND fi.nid = n.nid
    AND n.status = 1
  SQL

  batches(BATCH_SIZE) do |offset|
    results = mysql_query(<<~SQL).to_a
      SELECT fi.nid nid,
      fi.title title,
      fi.tid tid,
      n.uid uid,
      fi.created created,
      fi.sticky sticky,
      f.body_value body,
      nc.totalcount views,
      fl.timestamp solved
      FROM forum_index fi
      LEFT JOIN node n ON fi.nid = n.nid
      LEFT JOIN field_data_body f ON f.entity_id = n.nid
      LEFT JOIN flagging fl ON fl.entity_id = n.nid
      AND fl.fid = 7
      LEFT JOIN node_counter nc ON nc.nid = n.nid
      WHERE n.type = 'forum'
      AND n.status = 1
      ORDER BY fi.nid ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};
    SQL

    break if results.empty?

    # Skip the whole batch when every topic in it was imported before.
    next if all_records_exist?(:posts, results.map { |p| "nid:#{p["nid"]}" })

    create_posts(results, total: total_count, offset: offset) do |row|
      created_at = Time.zone.at(row["created"])

      topic = {
        id: "nid:#{row["nid"]}",
        # -1 is used when the author was not imported.
        user_id: user_id_from_imported_user_id(row["uid"]) || -1,
        category: category_id_from_imported_category_id(row["tid"]),
        raw: preprocess_raw(row["body"]),
        created_at: created_at,
        # Sticky nodes are pinned as of their creation time.
        pinned_at: row["sticky"].to_i == 1 ? created_at : nil,
        title: row["title"].try(:strip),
        views: row["views"],
      }
      topic[:custom_fields] = { import_solved: true } if row["solved"].present?
      topic
    end
  end
end
|
|
|
|
# Imports published comments on published 'article' and 'forum' nodes as
# replies in the topics created for those nodes.
#
# Both the count query and the paged query filter on the same node types.
# 'article' is the type import_blog_topics imports, so comments on blog
# topics are included. If you change the type names for your install, change
# them in both queries.
#
# Comments are read in `cid` order so replies keep their original order.
def import_replies
  puts "", "creating replies in topics"

  total_count = mysql_query(<<~SQL).first["count"]
    SELECT COUNT(*) count
    FROM comment c,
    node n
    WHERE n.nid = c.nid
    AND c.status = 1
    AND n.type IN ('article', 'forum')
    AND n.status = 1
  SQL

  batches(BATCH_SIZE) do |offset|
    results = mysql_query(<<~SQL).to_a
      SELECT c.cid, c.pid, c.nid, c.uid, c.created,
      f.comment_body_value body
      FROM comment c,
      field_data_comment_body f,
      node n
      WHERE c.cid = f.entity_id
      AND n.nid = c.nid
      AND c.status = 1
      AND n.type IN ('article', 'forum')
      AND n.status = 1
      ORDER BY c.cid ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    break if results.empty?

    # Skip the whole batch when every comment in it was imported before.
    next if all_records_exist?(:posts, results.map { |p| "cid:#{p["cid"]}" })

    create_posts(results, total: total_count, offset: offset) do |row|
      topic_mapping = topic_lookup_from_imported_post_id("nid:#{row["nid"]}")
      topic_id = topic_mapping && topic_mapping[:topic_id]

      if topic_id
        reply = {
          id: "cid:#{row["cid"]}",
          topic_id: topic_id,
          # -1 is used when the author was not imported.
          user_id: user_id_from_imported_user_id(row["uid"]) || -1,
          raw: preprocess_raw(row["body"]),
          created_at: Time.zone.at(row["created"]),
        }

        if row["pid"]
          parent = topic_lookup_from_imported_post_id("cid:#{row["pid"]}")
          # Only link to a parent that is itself a reply (post number above 1).
          reply[:reply_to_post_number] = parent[:post_number] if parent && parent[:post_number] > 1
        end

        reply
      else
        puts "No topic found for comment #{row["cid"]}"
        nil
      end
    end
  end
end
|
|
|
|
# Imports Drupal flaggings with fid 5 or 6 as likes on the imported posts.
# fid 5 rows are resolved against node ids ("nid:"), all others against
# comment ids ("cid:").
#
# The paged query is ordered by flagging_id: LIMIT/OFFSET paging without an
# ORDER BY gives no guarantee that batches neither skip nor repeat rows.
#
# A like that cannot be created is reported and skipped; it does not stop
# the import.
def import_likes
  puts "", "importing post likes"

  batches(BATCH_SIZE) do |offset|
    likes = mysql_query(<<~SQL).to_a
      SELECT flagging_id,
      fid,
      entity_id,
      uid
      FROM flagging
      WHERE fid = 5
      OR fid = 6
      ORDER BY flagging_id
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    break if likes.empty?

    likes.each do |like|
      identifier = like["fid"] == 5 ? "nid" : "cid"

      user_id = user_id_from_imported_user_id(like["uid"])
      next unless user_id
      post_id = post_id_from_imported_post_id("#{identifier}:#{like["entity_id"]}")
      next unless post_id
      user = User.find_by(id: user_id)
      next unless user
      post = Post.find_by(id: post_id)
      next unless post

      begin
        PostActionCreator.like(user, post)
      rescue StandardError => e
        puts "", "Failed to import like #{like["flagging_id"]}: #{e.message}"
      end
    end
  end
end
|
|
|
|
# Marks every topic carrying the `import_solved` custom field as solved by
# flagging the topic's last post as the accepted answer.
#
# Topics are looked up with find_by, which returns nil for a missing record,
# so a stale custom field is skipped instead of aborting the run with
# RecordNotFound (which is what `Topic.find` raises).
def mark_topics_as_solved
  puts "", "marking topics as solved"

  solved_topic_ids =
    TopicCustomField.where(name: "import_solved").where(value: true).pluck(:topic_id)

  solved_topic_ids.each do |topic_id|
    topic = Topic.find_by(id: topic_id)
    next unless topic

    post = topic.posts.last
    next unless post

    PostCustomField.create!(post_id: post.id, name: "is_accepted_answer", value: true)
    TopicCustomField.create!(topic_id: topic_id, name: "accepted_answer_post_id", value: post.id)
  end
end
|
|
|
|
# Creates a SingleSignOnRecord for every user that has an `import_id` custom
# field, using that value as the external id.
#
# Users are looked up with find_by so a custom field pointing at a missing
# user is skipped instead of raising RecordNotFound. A record that cannot be
# created is reported and skipped.
def import_sso_records
  puts "", "importing sso records"

  start_time = Time.now
  current_count = 0

  users = UserCustomField.where(name: "import_id")
  total_count = users.count

  return if users.empty?

  users.each do |ids|
    # Count every row, including skipped ones, so progress can reach the total.
    current_count += 1
    print_status(current_count, total_count, start_time)

    user = User.find_by(id: ids.user_id)
    next unless user

    begin
      SingleSignOnRecord.create!(
        user_id: user.id,
        external_id: ids.value,
        external_email: user.email,
        last_payload: "",
      )
    rescue StandardError => e
      puts "", "Failed to create SSO record for user #{user.id}: #{e.message}"
    end
  end
end
|
|
|
|
# Appends the upload markup for every Drupal post attachment to the raw text
# of the matching imported post (via a PostRevisor edit).
#
# * The paged query is ordered: LIMIT/OFFSET paging without an ORDER BY
#   gives no guarantee that batches neither skip nor repeat rows.
# * Users and posts are looked up with find_by, so a missing record skips
#   the attachment instead of raising RecordNotFound.
# * The success/failure totals are printed when the import finishes.
def import_attachments
  puts "", "importing attachments"

  current_count = 0
  success_count = 0
  fail_count = 0

  total_count = mysql_query(<<~SQL).first["count"]
    SELECT count(field_post_attachment_fid) count
    FROM field_data_field_post_attachment
  SQL

  batches(BATCH_SIZE) do |offset|
    attachments = mysql_query(<<~SQL).to_a
      SELECT *
      FROM field_data_field_post_attachment fp
      LEFT JOIN file_managed fm
      ON fp.field_post_attachment_fid = fm.fid
      ORDER BY fp.entity_type, fp.entity_id, fp.field_post_attachment_fid
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset}
    SQL

    break if attachments.empty?

    attachments.each do |attachment|
      current_count += 1
      print_status current_count, total_count

      # Attachments on comments map to "cid:" import ids, all others to "nid:".
      identifier = attachment["entity_type"] == "comment" ? "cid" : "nid"

      user_id = user_id_from_imported_user_id(attachment["uid"])
      next unless user_id
      post_id = post_id_from_imported_post_id("#{identifier}:#{attachment["entity_id"]}")
      next unless post_id
      next unless User.find_by(id: user_id)
      post = Post.find_by(id: post_id)
      next unless post

      begin
        upload, filename = find_upload(post, attachment)

        unless upload
          fail_count += 1
          next
        end

        upload_html = html_for_upload(upload, filename)

        if post.raw.include?(upload_html)
          puts "", "Skipped upload: already imported"
        else
          PostRevisor.new(post).revise!(
            post.user,
            { raw: "#{post.raw}\n\n#{upload_html}" },
            bypass_bump: true,
            edit_reason: "Import attachment from Drupal",
          )
        end

        success_count += 1
      rescue => e
        fail_count += 1
        puts e
      end
    end
  end

  puts "", "Attachments imported: #{success_count}, failed: #{fail_count}"
end
|
|
|
|
# Creates a "/node/<nid>" permalink for every listable topic whose
# `import_id` custom field has the form "nid:<digits>".
#
# Topics whose import id does not contain a node id are skipped, so no
# permalink with the bare url "/node/" is ever created.
def create_permalinks
  puts "", "creating permalinks..."

  Topic.listable_topics.find_each do |topic|
    begin
      tcf = topic.custom_fields
      import_id = tcf && tcf["import_id"]
      next unless import_id

      node_id = import_id[/nid:(\d+)/, 1]
      next unless node_id

      Permalink.create(url: "/node/#{node_id}", topic_id: topic.id)
    rescue => e
      puts e.message
      puts "Permalink creation failed for id #{topic.id}"
    end
  end
end
|
|
|
|
# Creates a Discourse upload for a Drupal attachment row.
#
# post       - the post the attachment belongs to; its user (or -1 when the
#              post has no user) is passed to create_upload
# attachment - row hash; reads "uri" (expected to look like
#              "public://upload/<path>") and "filename"
#
# Returns [upload, real_filename] on success. Returns nil when the uri is
# missing or has a different shape, when the file is not present under
# ATTACHMENT_DIR (the filename is also appended to
# /tmp/attachments_failed.txt), or when the upload is invalid.
def find_upload(post, attachment)
  # to_s guards against a NULL uri (rows come from a LEFT JOIN on file_managed).
  uri = attachment["uri"].to_s[%r{public://upload/(.+)}, 1]

  if uri.nil?
    puts "Attachment #{attachment["filename"]} has no usable file uri"
    return
  end

  real_filename = CGI.unescapeHTML(uri)
  file = File.join(ATTACHMENT_DIR, real_filename)

  unless File.exist?(file)
    puts "Attachment file #{attachment["filename"]} doesn't exist"

    failed_log = File.join("/tmp/", "attachments_failed.txt")
    File.open(failed_log, "a") { |f| f.puts attachment["filename"] }
    # There is nothing to upload, so stop here rather than hand a missing
    # path to create_upload.
    return
  end

  upload = create_upload(post.user&.id || -1, file, real_filename)

  if upload.nil? || upload.errors.any?
    puts "Upload not valid"
    puts upload.errors.inspect if upload
    return
  end

  [upload, real_filename]
end
|
|
|
|
# Rewrites BBCode quotes in +raw+ (modified in place) and returns it stripped.
#
# * [quote]...[/quote] becomes Markdown block-quote lines ("> ...")
# * [quote=<username>]...[/quote] becomes a [quote="<username>"] block on
#   its own lines
#
# Returns nil when +raw+ is blank.
def preprocess_raw(raw)
  return if raw.blank?

  # quotes on new lines
  raw.gsub!(%r{\[quote\](.+?)\[/quote\]}im) do
    quoted = "\n#{$1}\n"
    # Non-bang gsub on purpose: gsub! returns nil when nothing matched, and a
    # nil block value would erase the whole quote from the text.
    quoted.gsub(/\n(.+?)/) { "\n> #{$1}" }
  end

  # [QUOTE=<username>]...[/QUOTE]
  raw.gsub!(%r{\[quote=([^;\]]+)\](.+?)\[/quote\]}im) do
    username, quote = $1, $2
    "\n[quote=\"#{username}\"]\n#{quote}\n[/quote]\n"
  end

  raw.strip!
  raw
end
|
|
|
|
# Rewrites links to the old site inside every post so they point at the
# imported topic or reply. The old and new host names are hard-coded in the
# patterns below.
#
# A link whose target was not imported is left untouched. The gsub! blocks
# return the matched URL in that case; a nil block value would erase the
# link from the post.
def postprocess_posts
  puts "", "postprocessing posts"

  current = 0
  max = Post.count

  Post.find_each do |post|
    begin
      raw = post.raw
      new_raw = raw.dup

      # replace old topic to new topic links
      new_raw.gsub!(%r{https://site.com/forum/topic/(\d+)}im) do |old_url|
        post_id = post_id_from_imported_post_id("nid:#{$1}")
        next old_url unless post_id
        topic = Post.find(post_id).topic
        "https://community.site.com/t/-/#{topic.id}"
      end

      # replace old comment to reply links
      new_raw.gsub!(%r{https://site.com/comment/(\d+)#comment-\d+}im) do |old_url|
        post_id = post_id_from_imported_post_id("cid:#{$1}")
        next old_url unless post_id
        post_ref = Post.find(post_id)
        "https://community.site.com/t/-/#{post_ref.topic_id}/#{post_ref.post_number}"
      end

      if raw != new_raw
        post.raw = new_raw
        post.save
      end
    rescue StandardError
      puts "", "Failed rewrite on post: #{post.id}"
    ensure
      print_status(current += 1, max)
    end
  end
end
|
|
|
|
# Ensures every user has a user_avatar record and refreshes its gravatar.
# A failure for one user is reported (with the user's id) and does not stop
# the loop.
def import_gravatars
  puts "", "importing gravatars"

  current = 0
  max = User.count

  User.find_each do |user|
    begin
      user.create_user_avatar(user_id: user.id) unless user.user_avatar
      user.user_avatar.update_gravatar!
    rescue StandardError
      # Double quotes so the user id is interpolated into the message.
      puts "", "Failed avatar update on user #{user.id}"
    ensure
      print_status(current += 1, max)
    end
  end
end
|
|
|
|
# Converts a Unix timestamp (seconds since the epoch) into a DateTime.
#
# time - the timestamp as an Integer or a String of digits. It is converted
#        with to_s first because DateTime.strptime only accepts a String.
#
# Returns a DateTime.
def parse_datetime(time)
  DateTime.strptime(time.to_s, "%s")
end
|
|
|
|
# Runs +sql+ on the Drupal database connection with the `cache_rows` option
# enabled and returns whatever the client's query call returns.
def mysql_query(sql)
  query_options = { cache_rows: true }
  @client.query(sql, query_options)
end
|
|
end
|
|
|
|
# Run the importer only when this file is executed directly.
ImportScripts::Drupal.new.perform if $PROGRAM_NAME == __FILE__
|