discourse/script/import_scripts/discuz_x.rb

957 lines
36 KiB
Ruby
Raw Normal View History

2015-01-29 15:06:30 +08:00
# encoding: utf-8
#
# Author: Erick Guan <fantasticfears@gmail.com>
#
# This script imports data from the latest Discuz! X.
# It should work with Discuz! X3.x.
# It has only been tested on Simplified Chinese Discuz! X instances.
# If you want to import data in a language other than Simplified Chinese, email me.
2016-04-01 05:33:25 +08:00
require 'php_serialize'
require 'miro'
2015-01-29 15:06:30 +08:00
require 'mysql2'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
2015-01-29 15:06:30 +08:00
class ImportScripts::DiscuzX < ImportScripts::Base
# Name of the MySQL database that holds the Discuz! X data.
DISCUZX_DB = "ultrax"
# Prefix prepended to every Discuz! X table name (see table_name).
DB_TABLE_PREFIX = 'pre_'
# Number of rows requested per paginated query (used as the SQL LIMIT).
BATCH_SIZE = 1000
ORIGINAL_SITE_PREFIX = "oldsite.example.com/forums" # without http(s)://
NEW_SITE_PREFIX = "http://discourse.example.com" # with http:// or https://
# Set DISCUZX_BASE_DIR to the base directory of your discuz installation.
DISCUZX_BASE_DIR = '/var/www/discuz/upload'
# Avatar directory; joined onto DISCUZX_BASE_DIR when avatar paths are built.
AVATAR_DIR = '/uc_server/data/avatar'
# Forum attachment directory; joined onto DISCUZX_BASE_DIR when attachment files are located.
ATTACHMENT_DIR = '/data/attachment/forum'
# Extensions written to SiteSetting.authorized_extensions before attachments are imported.
AUTHORIZED_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'zip', 'rar', 'pdf']
# Opens the MySQL connection to the Discuz! X database and precompiles the
# patterns used to recognise links that point back at the old forum.
def initialize
  super

  @client = Mysql2::Client.new(
    host: "localhost",
    username: "root",
    #password: "password",
    database: DISCUZX_DB
  )
  # Maps "pm_<thread id>" to the import id of the first message of that thread.
  @first_post_id_by_topic_id = {}

  # Escape the site prefix once so that "." and friends only match literally.
  site = Regexp.escape(ORIGINAL_SITE_PREFIX)
  @internal_url_regexps = [
    /http(?:s)?:\/\/#{site}\/forum\.php\?mod=viewthread(?:&|&amp;)tid=(?<tid>\d+)(?:[^\[\]\s]*)(?:pid=?(?<pid>\d+))?(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/viewthread\.php\?tid=(?<tid>\d+)(?:[^\[\]\s]*)(?:pid=?(?<pid>\d+))?(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/forum\.php\?mod=redirect(?:&|&amp;)goto=findpost(?:&|&amp;)pid=(?<pid>\d+)(?:&|&amp;)ptid=(?<tid>\d+)(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/redirect\.php\?goto=findpost(?:&|&amp;)pid=(?<pid>\d+)(?:&|&amp;)ptid=(?<tid>\d+)(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/forumdisplay\.php\?fid=(?<fid>\d+)(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/forum\.php\?mod=forumdisplay(?:&|&amp;)fid=(?<fid>\d+)(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/(?<action>index)\.php(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/(?<action>stats)\.php(?:[^\[\]\s]*)/,
    /http(?:s)?:\/\/#{site}\/misc\.php\?mod=(?<mod>stat|ranklist)(?:[^\[\]\s]*)/
  ]
end
# Runs the import stages in a fixed order: duplicated-email discovery, users,
# categories, topics/posts, private messages and finally attachments.
# import_bookmarks is defined in this class but is not part of this sequence.
def execute
  get_knowledge_about_duplicated_email

  import_users
  import_categories
  import_posts
  import_private_messages
  import_attachments
end
# Adds the configured table prefix to a bare Discuz! X table name.
#
# @param name [String, nil] table name without its prefix; nil yields just the prefix
# @return [String] the prefixed table name
def table_name(name = nil)
  # Interpolation tolerates the nil default, where String#+ raised TypeError.
  "#{DB_TABLE_PREFIX}#{name}"
end
# Finds which Discuz! X user groups should be granted staff roles.
#
# Fills @admin_group_id and @moderator_group_id with the group ids whose
# radminid marks them as administrators (1) or moderators (2, 3).
def get_knowledge_about_group
  group_table = table_name 'common_usergroup'
  result = mysql_query(
    "SELECT groupid group_id, radminid role_id
     FROM #{group_table};")

  @moderator_group_id = []
  @admin_group_id = []
  # @banned_group_id = [4, 5] # Banned users and their posts are not imported; empty this array if you want to import them.

  result.each do |group|
    case group['role_id']
    when 1 # administrators
      @admin_group_id << group['group_id']
    when 2, 3 # super moderators and moderators; remove 3 if ordinary moderators should not become Discourse moderators
      @moderator_group_id << group['group_id']
    end
  end
end
# Loads the forum slugs stored in the Discuz! X `forumkeys` setting.
#
# Resets @category_slug and fills it with every non-blank slug found in the
# PHP-serialized setting value, keyed by the key used in that serialized data.
def get_knowledge_about_category_slug
  @category_slug = {}

  query = [
    "SELECT svalue value",
    "FROM #{table_name 'common_setting'}",
    "WHERE skey = 'forumkeys'"
  ].join("\n")
  rows = mysql_query(query)
  return if rows.size < 1

  serialized = rows.first['value']
  return if serialized.blank?

  PHP.unserialize(serialized).each do |forum_id, forum_slug|
    @category_slug[forum_id] = forum_slug unless forum_slug.blank?
  end
end
# Records members that share an email address with another member.
#
# For every email used by more than one member, the lowest uid is treated as
# the canonical account. @duplicated_email maps each other uid to that
# canonical uid. When the canonical account already has an entry in the
# lookup's user map, the duplicate uid is pointed at the same entry.
def get_knowledge_about_duplicated_email
  @duplicated_email = {}
  member_table = table_name 'common_member'

  # MIN(uid) makes the canonical uid deterministic; selecting a bare uid next
  # to GROUP BY email is rejected or arbitrary depending on the MySQL sql_mode.
  results = mysql_query(
    "SELECT a.uid uid, b.uid import_id
     FROM #{member_table} a
     JOIN (SELECT MIN(uid) uid, email
           FROM #{member_table}
           GROUP BY email
           HAVING COUNT(email) > 1) b USING(email)
     WHERE a.uid != b.uid")

  users = @lookup.instance_variable_get :@users
  results.each do |row|
    @duplicated_email[row['uid']] = row['import_id']
    user_id = users[row['import_id']]
    users[row['uid']] = user_id if user_id
  end
end
# Imports every Discuz! X member as a Discourse user, in batches of BATCH_SIZE.
#
# Joins the member, UCenter, profile, status, forum-field and home-field
# tables, maps each row onto the attribute hash expected by create_users, and
# after each user is created imports the avatar and profile/card backgrounds
# and adjusts the digest and name settings.
def import_users
  puts '', "creating users"

  get_knowledge_about_group

  sensitive_user_table = table_name 'ucenter_members'
  user_table = table_name 'common_member'
  profile_table = table_name 'common_member_profile'
  status_table = table_name 'common_member_status'
  forum_table = table_name 'common_member_field_forum'
  home_table = table_name 'common_member_field_home'
  total_count = mysql_query("SELECT count(*) count FROM #{user_table};").first['count']

  batches(BATCH_SIZE) do |offset|
    # birthprovince is read from p.birthprovince; it used to alias
    # p.resideprovince, which made the birth-place fallback unreachable.
    results = mysql_query(
      "SELECT u.uid id, u.username username, u.email email, u.groupid group_id,
              su.regdate regdate, su.password password_hash, su.salt salt,
              s.regip regip, s.lastip last_visit_ip, s.lastvisit last_visit_time, s.lastpost last_posted_at, s.lastsendmail last_emailed_at,
              u.emailstatus email_confirmed, u.avatarstatus avatar_exists,
              p.site website, p.address address, p.bio bio, p.realname realname, p.qq qq,
              p.resideprovince resideprovince, p.residecity residecity, p.residedist residedist, p.residecommunity residecommunity,
              p.birthprovince birthprovince, p.birthcity birthcity, p.birthdist birthdist, p.birthcommunity birthcommunity,
              h.spacecss spacecss, h.spacenote spacenote,
              f.customstatus customstatus, f.sightml sightml
       FROM #{user_table} u
       LEFT JOIN #{sensitive_user_table} su USING(uid)
       LEFT JOIN #{profile_table} p USING(uid)
       LEFT JOIN #{status_table} s USING(uid)
       LEFT JOIN #{forum_table} f USING(uid)
       LEFT JOIN #{home_table} h USING(uid)
       ORDER BY u.uid ASC
       LIMIT #{BATCH_SIZE}
       OFFSET #{offset};")

    break if results.size < 1

    # TODO: breaks the script, reported by some users
    # next if all_records_exist? :users, users.map {|u| u["id"].to_i}

    create_users(results, total: total_count, offset: offset) do |user|
      { id: user['id'],
        email: user['email'],
        username: user['username'],
        name: first_exists(user['realname'], user['customstatus'], user['username']),
        import_pass: user['password_hash'],
        active: true,
        salt: user['salt'],
        # TODO: title: user['customstatus'], # move custom title to name since discourse can't let user custom title https://meta.discourse.org/t/let-users-custom-their-title/37626
        created_at: user['regdate'] ? Time.zone.at(user['regdate']) : nil,
        registration_ip_address: user['regip'],
        ip_address: user['last_visit_ip'],
        last_seen_at: user['last_visit_time'],
        last_emailed_at: user['last_emailed_at'],
        last_posted_at: user['last_posted_at'],
        moderator: @moderator_group_id.include?(user['group_id']),
        admin: @admin_group_id.include?(user['group_id']),
        website: discuzx_user_website(user),
        bio_raw: first_exists(user['bio'] && CGI.unescapeHTML(user['bio']), user['sightml'], user['spacenote']).strip[0, 3000],
        location: discuzx_user_location(user),
        post_create_action: lambda do |newmember|
          import_discuzx_avatar(user, newmember)
          import_discuzx_profile_backgrounds(user, newmember)

          # we don't send email to the unconfirmed user
          newmember.update(email_digests: user['email_confirmed'] == 1) if newmember.email_digests
          # A name identical to the username carries no information; blank it.
          newmember.update(name: '') if !newmember.name.blank? && newmember.name == newmember.username
        end
      }
    end
  end
end

# Chooses the website for an imported user: the profile site when it contains
# a dot, otherwise the Qzone page of a purely numeric QQ id above 10000.
def discuzx_user_website(user)
  website = user['website']
  return website.strip if website && website.include?('.')

  qq = user['qq'].to_s.strip
  # Compare string with string; String == Integer is always false in Ruby.
  return "http://user.qzone.qq.com/#{qq}" if qq == qq.to_i.to_s && qq.to_i > 10000

  nil
end

# Builds the location: the free-form address when present, otherwise the
# residence fields (or the birth-place fields when no residence province is set).
def discuzx_user_location(user)
  prefix = user['resideprovince'].blank? ? 'birth' : 'reside'
  parts = %w[province city dist community].map { |part| user["#{prefix}#{part}"] }
  first_exists(user['address'], parts.reject { |part| part.blank? }.join(' '))
end

# Uploads the Discuz! X avatar file for a freshly created user that has none yet.
def import_discuzx_avatar(user, newmember)
  return unless user['avatar_exists'] == 1 && newmember.uploaded_avatar_id.blank?

  path, filename = discuzx_avatar_fullpath(user['id'])
  return unless path

  upload = create_upload(newmember.id, path, filename)
  if !upload.nil? && upload.persisted?
    # create_user_avatar is called with import mode switched off, as before.
    newmember.import_mode = false
    newmember.create_user_avatar
    newmember.import_mode = true
    newmember.user_avatar.update(custom_upload_id: upload.id)
    newmember.update(uploaded_avatar_id: upload.id)
  else
    puts "Error: Upload did not persist!"
  end
rescue SystemCallError => err
  puts "Could not import avatar: #{err.message}"
end

# Extracts the background image path that the user's space CSS declares for
# the given selector, relative to the old site prefix. Returns nil when the
# selector has no url(...) declaration.
def discuzx_css_background(css, selector)
  matched = css.match(/#{Regexp.escape(selector)}\s*\{[^}]*url\('?(.+?)'?\)/i)
  matched && matched[1].split(ORIGINAL_SITE_PREFIX, 2).last
end

# Uploads profile and card backgrounds derived from the user's space CSS,
# unless the profile already has a background.
def import_discuzx_profile_backgrounds(user, newmember)
  css = user['spacecss']
  return if css.blank? || !newmember.user_profile.profile_background.blank?

  body_background = discuzx_css_background(css, 'body')
  header_background = discuzx_css_background(css, '#hd')
  blocktitle_background = discuzx_css_background(css, '.blocktitle')
  content_background = discuzx_css_background(css, '#ct')
  return unless body_background || header_background || blocktitle_background || content_background

  profile_background = first_exists(header_background, body_background, content_background, blocktitle_background)
  card_background = first_exists(content_background, body_background, header_background, blocktitle_background)

  upload = create_upload(newmember.id, File.join(DISCUZX_BASE_DIR, profile_background), File.basename(profile_background))
  if upload
    newmember.user_profile.upload_profile_background upload
  else
    puts "WARNING: #{user['username']} (UID: #{user['id']}) profile_background file did not persist!"
  end

  upload = create_upload(newmember.id, File.join(DISCUZX_BASE_DIR, card_background), File.basename(card_background))
  if upload
    newmember.user_profile.upload_card_background upload
  else
    puts "WARNING: #{user['username']} (UID: #{user['id']}) card_background file did not persist!"
  end
end
# Imports Discuz! X forums as Discourse categories.
#
# Forum rows of type 'group' or with status 2 are skipped. Imported
# categories are positioned after the existing ones. After each category is
# created, its slug, description post, logo and colour are filled in from the
# Discuz! X data.
def import_categories
  puts '', "creating categories"

  get_knowledge_about_category_slug

  forums_table = table_name 'forum_forum'
  forums_data_table = table_name 'forum_forumfield'

  results = mysql_query("
    SELECT f.fid id, f.fup parent_id, f.name, f.type type, f.status status, f.displayorder position,
           d.description description, d.rules rules, d.icon, d.extra extra
    FROM #{forums_table} f
    LEFT JOIN #{forums_data_table} d USING(fid)
    ORDER BY parent_id ASC, id ASC
  ")

  # Highest existing position, computed once (0 when there is no category yet).
  max_position = Category.all.max_by(&:position).try(:position).to_i

  create_categories(results) do |row|
    # To skip status-3 forums as well (the original note says: uncomment if
    # you do not want to import groups), add: || row['status'].to_i == 3
    next if row['type'] == 'group' || row['status'] == 2

    extra = PHP.unserialize(row['extra']) unless row['extra'].blank?
    # namecolor looks like "#rrggbb"; keep the six characters after the first.
    color = extra["namecolor"][1, 6] if extra && !extra["namecolor"].blank?

    h = {
      id: row['id'],
      name: row['name'],
      description: row['description'],
      position: row['position'].to_i + max_position,
      color: color,
      suppress_from_homepage: (row['status'] == 0 || row['status'] == 3),
      post_create_action: lambda do |category|
        if slug = @category_slug[row['id']]
          category.update(slug: slug)
        end

        # rules comes from a LEFT JOIN and may be NULL; treat that as empty text.
        raw = process_discuzx_post(row['rules'].to_s, nil)
        if @bbcode_to_md
          raw = raw.bbcode_to_md(false) rescue raw
        end
        category.topic.posts.first.update_attribute(:raw, raw)

        # icon may be NULL for the same reason, so use blank? rather than empty?.
        unless row['icon'].blank?
          upload = create_upload(Discourse::SYSTEM_USER_ID, File.join(DISCUZX_BASE_DIR, ATTACHMENT_DIR, '../common', row['icon']), File.basename(row['icon']))
          if upload
            category.logo_url = upload.url
            # FIXME: I don't know how to get '/shared' by script. May change to Rails.root
            category.color = Miro::DominantColors.new(File.join('/shared', category.logo_url)).to_hex.first[1, 6] if !color
            category.save!
          end
        end

        category
      end
    }

    if row['parent_id'].to_i > 0
      h[:parent_category_id] = category_id_from_imported_category_id(row['parent_id'])
    end
    h
  end
end
# Imports forum threads and replies as Discourse topics and posts, in batches.
#
# The first post of a thread becomes the topic (with category, title and,
# for poll threads, a [poll] block). Other posts are attached to the already
# imported topic and linked to the post they quote when that post is in the
# same topic. Posts whose `invisible` value has bit 0 set, or whose processed
# text is blank, are deleted right after creation; posts with bit 1 set get a
# PostAction of type 6 from the system user (the original comment calls this
# "waiting for approve").
def import_posts
  puts "", "creating topics and posts"

  posts_table = table_name 'forum_post'
  topics_table = table_name 'forum_thread'

  total_count = mysql_query("SELECT count(*) count FROM #{posts_table}").first['count']

  batches(BATCH_SIZE) do |offset|
    # The leftover "where t.tid < 10000" filter was removed: it silently
    # dropped every thread with a higher id while total_count counted them all.
    results = mysql_query("
      SELECT p.pid id,
             p.tid topic_id,
             t.fid category_id,
             t.subject title,
             p.authorid user_id,
             p.message raw,
             p.dateline post_time,
             p2.pid first_id,
             p.invisible status,
             t.special special
      FROM #{posts_table} p
      JOIN #{posts_table} p2 ON p2.first AND p2.tid = p.tid
      JOIN #{topics_table} t ON t.tid = p.tid
      ORDER BY id ASC, topic_id ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};
    ")
    # Optional filter hint from the original author:
    #   u.status != -1 AND u.groupid != 4 AND u.groupid != 5
    # i.e. the user is not locked, banned from visiting or muted. On real
    # Discuz forums banned users are usually spam bots or expelled users,
    # which do not need to be imported.

    break if results.size < 1

    next if all_records_exist? :posts, results.map { |p| p["id"].to_i }

    create_posts(results, total: total_count, offset: offset) do |m|
      skip = false
      mapped = {}
      mapped[:id] = m['id']
      mapped[:user_id] = user_id_from_imported_user_id(m['user_id']) || -1
      mapped[:raw] = process_discuzx_post(m['raw'], m['id'])
      mapped[:created_at] = Time.zone.at(m['post_time'])

      if m['id'] == m['first_id']
        mapped[:category] = category_id_from_imported_category_id(m['category_id'])
        mapped[:title] = CGI.unescapeHTML(m['title'])

        if m['special'] == 1
          poll_markup = discuzx_poll_markup(m['topic_id'])
          mapped[:raw].prepend(poll_markup) if poll_markup
        end
      else
        parent = topic_lookup_from_imported_post_id(m['first_id'])

        if parent
          mapped[:topic_id] = parent[:topic_id]
          reply_post_import_id = find_post_id_by_quote_number(m['raw'])
          if reply_post_import_id
            post_id = post_id_from_imported_post_id(reply_post_import_id.to_i)
            if (post = Post.find_by(id: post_id))
              if post.topic_id == mapped[:topic_id]
                mapped[:reply_to_post_number] = post.post_number
              else
                puts "post #{m['id']} reply to another topic, skip reply"
              end
            else
              puts "post #{m['id']} reply to not exists post #{reply_post_import_id}, skip reply"
            end
          end
        else
          puts "Parent topic #{m['topic_id']} doesn't exist. Skipping #{m['id']}: #{m['title'][0..40]}"
          skip = true
        end
      end

      if m['status'] & 1 == 1 || mapped[:raw].blank?
        mapped[:post_create_action] = lambda do |post|
          PostDestroyer.new(Discourse.system_user, post).perform_delete
        end
      elsif (m['status'] & 2) >> 1 == 1 # waiting for approve
        mapped[:post_create_action] = lambda do |post|
          PostAction.act(Discourse.system_user, post, 6, {take_action: false})
        end
      end

      skip ? nil : mapped
    end
  end
end

# Builds the [poll] block for a poll thread, or returns nil (after printing a
# warning) when the thread has no poll options.
def discuzx_poll_markup(topic_id)
  poll = mysql_query("
    SELECT multiple, maxchoices
    FROM #{table_name 'forum_poll'}
    WHERE tid = #{topic_id}").first || {}

  options = mysql_query("
    SELECT polloption
    FROM #{table_name 'forum_polloption'}
    WHERE tid = #{topic_id}
    ORDER BY displayorder").map { |option| option['polloption'] }

  if options.empty?
    puts "WARNING: can't find poll options for topic #{topic_id}, skip poll"
    return nil
  end

  # `multiple` arrives as 0/1 (or a boolean if the client casts it); 0 is
  # truthy in Ruby, so it has to be compared explicitly.
  type = [1, true].include?(poll['multiple']) ? ' type=multiple' : ''
  # maxchoices is missing when no poll row was found; to_i turns that into 0.
  max = poll['maxchoices'].to_i > 0 ? " max=#{poll['maxchoices']}" : ''

  "[poll#{type}#{max}]\n#{options.map { |option| '- ' + option }.join("\n")}\n[/poll]\n"
end
2016-04-01 05:33:25 +08:00
# Imports thread favourites (idtype 'tid') as bookmarks on the first post of
# each favourited thread, in batches of BATCH_SIZE.
#
# Note: execute does not call this method.
def import_bookmarks
  puts '', 'creating bookmarks'

  favorites_table = table_name 'home_favorite'
  posts_table = table_name 'forum_post'

  total_count = mysql_query("SELECT count(*) count FROM #{favorites_table} WHERE idtype = 'tid'").first['count']

  batches(BATCH_SIZE) do |offset|
    # LIMIT/OFFSET paging needs a stable ORDER BY; without one the database may
    # return rows in a different order per query and batches can overlap or skip rows.
    results = mysql_query("
      SELECT p.pid post_id, f.uid user_id
      FROM #{favorites_table} f
      JOIN #{posts_table} p ON f.id = p.tid
      WHERE f.idtype = 'tid' AND p.first
      ORDER BY f.uid ASC, p.pid ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};")

    break if results.size < 1

    # next if all_records_exist?

    create_bookmarks(results, total: total_count, offset: offset) do |row|
      {
        user_id: row['user_id'],
        post_id: row['post_id']
      }
    end
  end
end
2015-01-29 15:06:30 +08:00
# Imports UCenter private messages as Discourse private-message topics.
#
# The first message of each thread creates the topic (title taken from the
# thread list table, recipients from the thread member table). Later messages
# are attached to that topic through @first_post_id_by_topic_id.
def import_private_messages
  puts '', 'creating private messages'

  pm_indexes = table_name 'ucenter_pm_indexes'
  pm_messages = table_name 'ucenter_pm_messages'

  total_count = mysql_query("SELECT count(*) count FROM #{pm_indexes}").first['count']

  # The messages are read from numbered tables combined with UNION.
  # NOTE(review): only suffixes _1 to _9 are queried, exactly as before —
  # confirm whether your UCenter schema also has a _0 table.
  message_union = (1..9).map do |suffix|
    "SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at
      FROM #{pm_messages}_#{suffix}"
  end.join("\n      UNION ")

  batches(BATCH_SIZE) do |offset|
    results = mysql_query("
      #{message_union}
      ORDER BY thread_id ASC, id ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};")

    break if results.size < 1

    # next if all_records_exist? :posts, results.map {|m| "pm:#{m['id']}"}

    create_posts(results, total: total_count, offset: offset) do |m|
      skip = false
      mapped = {}
      mapped[:id] = "pm:#{m['id']}"
      mapped[:user_id] = user_id_from_imported_user_id(m['user_id']) || -1
      mapped[:raw] = process_discuzx_post(m['message'], m['id'])
      mapped[:created_at] = Time.zone.at(m['created_at'])
      thread_id = "pm_#{m['thread_id']}"

      if is_first_pm(m['id'], m['thread_id'])
        # find the title from list table
        pm_thread = mysql_query("
          SELECT plid thread_id, subject
          FROM #{table_name 'ucenter_pm_lists'}
          WHERE plid = #{m['thread_id']};").first

        if pm_thread.nil?
          # Without a list row there is no subject to use as the topic title.
          puts "Skipping pm:#{m['id']} due to missing thread record"
          next nil
        end

        mapped[:title] = pm_thread['subject']
        mapped[:archetype] = Archetype.private_message

        # Find the users who are part of this private message.
        import_user_ids = mysql_query("
          SELECT plid thread_id, uid user_id
          FROM #{table_name 'ucenter_pm_members'}
          WHERE plid = #{m['thread_id']};
        ").map { |r| r['user_id'] }.uniq

        # Everyone in the thread except the author becomes a recipient.
        mapped[:target_usernames] = import_user_ids.map do |import_user_id|
          next if import_user_id.to_s == m['user_id'].to_s
          User.find_by(id: user_id_from_imported_user_id(import_user_id)).try(:username)
        end.compact

        if mapped[:target_usernames].empty? # pm with yourself?
          skip = true
          puts "Skipping pm:#{m['id']} due to no target"
        else
          @first_post_id_by_topic_id[thread_id] = mapped[:id]
        end
      else
        parent = topic_lookup_from_imported_post_id(@first_post_id_by_topic_id[thread_id])
        if parent
          mapped[:topic_id] = parent[:topic_id]
        else
          puts "Parent post pm thread:#{thread_id} doesn't exist. Skipping #{m["id"]}: #{m["message"][0..40]}"
          skip = true
        end
      end

      skip ? nil : mapped
    end
  end
end
# Tells whether a private message is the first one of its thread.
#
# @param pm_id [Integer, String] id of the message being imported
# @param thread_id [Integer, String] id (plid) of the thread it belongs to
# @return [Boolean] true when pm_id is the lowest pmid indexed for the thread;
#   false otherwise, including when the thread has no index rows at all
def is_first_pm(pm_id, thread_id)
  # Only the lowest id matters, so let the database return a single row.
  result = mysql_query("
    SELECT pmid id
    FROM #{table_name 'ucenter_pm_indexes'}
    WHERE plid = #{thread_id}
    ORDER BY id
    LIMIT 1")

  first_row = result.first
  !first_row.nil? && first_row['id'].to_s == pm_id.to_s
end
2016-04-01 05:33:25 +08:00
# Replaces every [img]…[/img] tag with the HTML of an upload created from the
# tag's content.
#
# @param raw [String, nil] post text
# @return [String] a new string with each image tag replaced by the upload
#   HTML, or removed when the upload failed; always a String, even when the
#   text contains no image tag
def process_and_upload_inline_images(raw)
  inline_image_regex = /\[img\]([\s\S]*?)\[\/img\]/

  # Non-destructive gsub: gsub! returns nil when nothing matched, which used
  # to leak out as this method's return value.
  raw.to_s.gsub(inline_image_regex) do
    upload, filename = upload_inline_image(Regexp.last_match(1))
    upload ? html_for_upload(upload, filename) : ''
  end
end
# Converts the raw Discuz! X BBCode of a post into text Discourse can render.
#
# Strips reply headers and unsupported formatting tags, rewrites quotes,
# images, lists and code blocks, marks local images as [x-attach] for the
# attachment stage, and replaces links to the old forum with their new URLs.
#
# @param raw [String, nil] post text as stored by Discuz! X
# @param import_id [Object] id of the post, only used in warning messages
# @return [String] the converted, stripped text ("" for nil input)
def process_discuzx_post(raw, import_id)
  # raw = process_and_upload_inline_images(raw)
  s = raw.to_s.dup
  # Escaped once so "." in the host name cannot match arbitrary characters.
  site = Regexp.escape(ORIGINAL_SITE_PREFIX)

  # Strip the quote
  # [quote] quotation includes the topic which is the same as reply to in Discourse
  # We get the pid to find the post number the post reply to. So it can be stripped
  s = s.gsub(/\[b\]回复 \[url=forum.php\?mod=redirect&goto=findpost&pid=\d+&ptid=\d+\].* 的帖子\[\/url\]\[\/b\]/i, '').strip
  s = s.gsub(/\[b\]回复 \[url=https?:\/\/#{site}\/redirect.php\?goto=findpost&pid=\d+&ptid=\d+\].*?\[\/url\].*?\[\/b\]/i, '').strip

  # Turn quotes of already imported posts into Discourse quotes. The capture is
  # lazy so that two quotes in one post are handled one at a time.
  s.gsub!(/\[quote\](.*?)\[\/quote\]/im) do |matched|
    content = Regexp.last_match(1)
    post_import_id = find_post_id_by_quote_number(content)
    post = post_import_id && Post.find_by(id: post_id_from_imported_post_id(post_import_id.to_i))
    if post
      "[quote=\"#{post.user.username}\", post: #{post.post_number}, topic: #{post.topic_id}]\n#{content}\n[/quote]"
    else
      puts "post #{import_id} quote to not exists post #{post_import_id}, skip reply" if post_import_id
      # Keep the whole original quote (matched is the matched String itself).
      matched
    end
  end

  s.gsub!(/\[size=2\]\[color=#999999\].*? 发表于 [\d\-\: ]*\[\/color\] \[url=forum.php\?mod=redirect&goto=findpost&pid=\d+&ptid=\d+\].*?\[\/url\]\[\/size\]/i, '')
  s.gsub!(/\[size=2\]\[color=#999999\].*? 发表于 [\d\-\: ]*\[\/color\] \[url=https?:\/\/#{site}\/redirect.php\?goto=findpost&pid=\d+&ptid=\d+\].*?\[\/url\]\[\/size\]/i, '')

  # convert quote
  s.gsub!(/\[quote\](.*?)\[\/quote\]/m) { "\n" + ($1.strip).gsub(/^/, '> ') + "\n" }

  # truncate line space, preventing line starting with many blanks to be parsed as code blocks
  s.gsub!(/^ {4,}/, ' ')

  # TODO: Much better to use bbcode-to-md gem
  # Convert image bbcode with width and height
  # Images hosted on the old site are attachments: mark them for import_attachments.
  s.gsub!(/\[img[^\]]*\]https?:\/\/#{site}\/(.*?)\[\/img\]/i, '[x-attach]\1[/x-attach]') # dont convert attachment
  s.gsub!(/<img[^>]*src="https?:\/\/#{site}\/(.*?)".*?>/i, '[x-attach]\1[/x-attach]') # dont convert attachment
  # Site-specific rules kept from the original author's own forum.
  s.gsub!(/\[img[^\]]*\]https?:\/\/www\.touhou\.cc\/blog\/(.*?)\[\/img\]/i, '[x-attach]../blog/\1[/x-attach]')
  s.gsub!(/\[img[^\]]*\]https?:\/\/www\.touhou\.cc\/ucenter\/avatar.php\?uid=(\d+)[^\]]*\[\/img\]/i) { "[x-attach]#{discuzx_avatar_fullpath($1,false)[0]}[/x-attach]" }
  s.gsub!(/\[img=(\d+),(\d+)\]([^\]]*)\[\/img\]/i, '<img width="\1" height="\2" src="\3">')
  s.gsub!(/\[img\]([^\]]*)\[\/img\]/i, '<img src="\1">')

  s.gsub!(/\[qq\]([^\]]*)\[\/qq\]/i, '<a href="http://wpa.qq.com/msgrd?V=3&Uin=\1&Site=[Discuz!]&from=discuz&Menu=yes" target="_blank"><!--<img src="static/image/common/qq_big.gif" border="0">-->QQ 交谈</a>')

  s.gsub!(/\[email\]([^\]]*)\[\/email\]/i, '[url=mailto:\1]\1[/url]') # bbcode-to-md can convert it
  s.gsub!(/\[s\]([^\]]*)\[\/s\]/i, '<s>\1</s>')
  s.gsub!(/\[sup\]([^\]]*)\[\/sup\]/i, '<sup>\1</sup>')
  s.gsub!(/\[sub\]([^\]]*)\[\/sub\]/i, '<sub>\1</sub>')
  s.gsub!(/\[hr\]/i, "\n---\n")

  # remove the media tag
  s.gsub!(/\[\/?media[^\]]*\]/i, "\n")
  s.gsub!(/\[\/?flash[^\]]*\]/i, "\n")
  s.gsub!(/\[\/?audio[^\]]*\]/i, "\n")
  s.gsub!(/\[\/?video[^\]]*\]/i, "\n")

  # Remove the font, p and backcolor tag
  # Discourse doesn't support the font tag
  s.gsub!(/\[font=[^\]]*?\]/i, '')
  s.gsub!(/\[\/font\]/i, '')
  s.gsub!(/\[p=[^\]]*?\]/i, '')
  s.gsub!(/\[\/p\]/i, '')
  s.gsub!(/\[backcolor=[^\]]*?\]/i, '')
  s.gsub!(/\[\/backcolor\]/i, '')

  # Remove the size tag
  s.gsub!(/\[size=[^\]]*?\]/i, '')
  s.gsub!(/\[\/size\]/i, '')

  # Remove the color tag
  s.gsub!(/\[color=[^\]]*?\]/i, '')
  s.gsub!(/\[\/color\]/i, '')

  # Remove the hide tag
  s.gsub!(/\[\/?hide\]/i, '')
  s.gsub!(/\[\/?free[^\]]*\]/i, "\n")

  # Remove the align and float tags, leaving a line break in their place
  s.gsub!(/\[align=[^\]]*?\]/i, "\n")
  s.gsub!(/\[\/align\]/i, "\n")
  s.gsub!(/\[float=[^\]]*?\]/i, "\n")
  s.gsub!(/\[\/float\]/i, "\n")

  # Convert code
  s.gsub!(/\[\/?code\]/i, "\n```\n")

  # The edit notice should be removed
  # example: 本帖最后由 Helloworld 于 2015-1-28 22:05 编辑
  s.gsub!(/\[i=s\] 本帖最后由[\s\S]*?编辑 \[\/i\]/, '')

  # Convert the custom smileys to emojis
  # `{:cry:}` to `:cry:`
  s.gsub!(/\{(\:\S*?\:)\}/, '\1')

  # convert list tags to ul and list=1 tags to ol
  # (basically, we're only missing list=a here...)
  s.gsub!(/\[list\](.*?)\[\/list:u\]/m, '[ul]\1[/ul]')
  s.gsub!(/\[list=1\](.*?)\[\/list:o\]/m, '[ol]\1[/ol]')
  # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
  s.gsub!(/\[\*\](.*?)\[\/\*:m\]/, '[li]\1[/li]')

  # Discuz can create PM out of a post, which will generates like
  # [url=http://example.com/forum.php?mod=redirect&goto=findpost&pid=111&ptid=11][b]关于您在“主题名称”的帖子[/b][/url]
  s.gsub!(pm_url_regexp) do |discuzx_link|
    # Only the topic id is captured; pass nil explicitly for everything else.
    replace_internal_link(discuzx_link, Regexp.last_match(1).to_i, nil, nil, nil)
  end

  # [url][b]text[/b][/url] to **[url]text[/url]**
  s.gsub!(/(\[url=[^\[\]]*?\])\[b\](\S*)\[\/b\](\[\/url\])/, '**\1\2\3**')

  # Replace links to the old forum with the matching Discourse URLs.
  @internal_url_regexps.each do |internal_url_regexp|
    s.gsub!(internal_url_regexp) do |discuzx_link|
      match = Regexp.last_match
      # A pattern only defines some of the named groups; missing or unmatched
      # groups yield nil instead of raising.
      captured = lambda { |name| match.names.include?(name) ? match[name] : nil }
      tid, pid, fid = %w[tid pid fid].map { |name| captured.call(name) && captured.call(name).to_i }
      # misc.php links capture their target as `mod`; forward it as the action.
      replace_internal_link(discuzx_link, tid, pid, fid, captured.call('action') || captured.call('mod'))
    end
  end

  # @someone without the url
  s.gsub!(/@\[url=[^\[\]]*?\](\S*)\[\/url\]/i, '@\1')

  # Report any link to the old site that could not be rewritten.
  s.scan(/http(?:s)?:\/\/#{site}\/[^\[\]\s]*/) { |link| puts "WARNING: post #{import_id} can't replace internal url #{link}" }

  s.strip
end
2016-04-01 05:33:25 +08:00
# Maps a link to the old forum onto the matching Discourse URL.
#
# The most specific piece of information wins: an imported post, then the
# first post of an imported topic, then an imported category, then a named
# page. The original link is returned when nothing can be resolved.
#
# @param discuzx_link [String] the link as found in the post
# @param import_topic_id [Integer, nil] Discuz! X thread id (tid)
# @param import_post_id [Integer, nil] Discuz! X post id (pid)
# @param import_category_id [Integer, nil] Discuz! X forum id (fid)
# @param action [String, nil] page name such as 'index', 'stats' or 'ranklist'
# @return [String] the replacement URL, or discuzx_link unchanged
def replace_internal_link(discuzx_link, import_topic_id = nil, import_post_id = nil, import_category_id = nil, action = nil)
  if import_post_id
    post_id = post_id_from_imported_post_id import_post_id
    # find_by returns nil for a missing record, where find would raise.
    post = post_id && Post.find_by(id: post_id)
    return post.full_url if post
  end

  if import_topic_id
    # to_i keeps anything but a number out of the SQL text.
    results = mysql_query("SELECT pid
                           FROM #{table_name 'forum_post'}
                           WHERE tid = #{import_topic_id.to_i} AND first
                           LIMIT 1")

    return discuzx_link unless results.size > 0

    lookup = topic_lookup_from_imported_post_id(results.first['pid'])
    return lookup ? "#{NEW_SITE_PREFIX}#{lookup[:url]}" : discuzx_link
  end

  if import_category_id
    category_id = category_id_from_imported_category_id import_category_id
    category = category_id && Category.find_by(id: category_id)
    return category.url if category
  end

  case action
  when 'index'
    "#{NEW_SITE_PREFIX}/"
  when 'stat', 'stats', 'ranklist'
    "#{NEW_SITE_PREFIX}/users"
  else
    discuzx_link
  end
end
# Pattern for "find post" redirect links on the old site; the only capture
# group is the thread id (ptid). Built on first use and cached afterwards.
def pm_url_regexp
  return @pm_url_regexp if @pm_url_regexp

  escaped_site = ORIGINAL_SITE_PREFIX.gsub('.', '\.')
  pattern = 'http(?:s)?://' + escaped_site + '/forum\.php\?mod=redirect&goto=findpost&pid=\d+&ptid=(\d+)'
  @pm_url_regexp = Regexp.new(pattern)
end
# Imports attachments for every imported forum post.
#
# This step is done separately because it can take multiple attempts to get right (because of
# missing files, wrong paths, authorized extensions, etc.).
#
# For each post with a numeric import id it replaces [attach]ID[/attach] and
# [x-attach]path[/x-attach] tags with the HTML of the uploaded file, appends
# attachments that are not referenced inline, and revises the post when its
# text changed. A tag whose upload fails is removed from the text and counted
# as a failure.
def import_attachments
  setting = AUTHORIZED_EXTENSIONS.join('|')
  SiteSetting.authorized_extensions = setting if setting != SiteSetting.authorized_extensions

  attachment_regex = /\[attach\](\d+)\[\/attach\]/
  # Lazy capture: several tags on one line must not be merged into one match.
  attachment_link_regex = /\[x-attach\](.+?)\[\/x-attach\]/

  current_count = 0
  total_count = mysql_query("SELECT count(*) count FROM #{table_name 'forum_post'};").first['count']

  success_count = 0
  fail_count = 0

  puts '', "Importing attachments...", ''

  Post.find_each do |post|
    # Only posts whose import id is a plain number are handled; ids such as
    # "pm:12" (private messages) and posts without an import id are skipped.
    import_id = post.custom_fields['import_id']
    next unless import_id == import_id.to_i.to_s

    user = post.user

    current_count += 1
    print_status current_count, total_count

    new_raw = post.raw.dup

    inline_attachments = []
    new_raw.gsub!(attachment_regex) do
      attachment_id = Regexp.last_match(1).to_i
      inline_attachments.push attachment_id

      upload, filename = find_upload(user, post, attachment_id)
      unless upload
        fail_count += 1
        next ''
      end

      html_for_upload(upload, filename)
    end

    new_raw.gsub!(attachment_link_regex) do
      attachment_file = Regexp.last_match(1)
      filename = File.basename(attachment_file)

      upload = create_upload(user.id, File.join(DISCUZX_BASE_DIR, attachment_file), filename)
      unless upload
        fail_count += 1
        next ''
      end

      html_for_upload(upload, filename)
    end

    # Attachments of this post that were not referenced inline are appended.
    sql = "SELECT aid
           FROM #{table_name 'forum_attachment'}
           WHERE pid = #{import_id}"
    sql += " AND aid NOT IN (#{inline_attachments.join(',')})" unless inline_attachments.empty?

    mysql_query(sql).each do |attachment|
      upload, filename = find_upload(user, post, attachment['aid'])
      unless upload
        fail_count += 1
        next
      end

      html = html_for_upload(upload, filename)
      unless new_raw.include? html
        new_raw << "\n"
        new_raw << html
      end
    end

    if new_raw != post.raw
      PostRevisor.new(post).revise!(post.user, { raw: new_raw }, { bypass_bump: true, edit_reason: '从 Discuz 中导入附件' })
    end

    success_count += 1
  end

  puts '', ''
  puts "succeeded: #{success_count}"
  puts " failed: #{fail_count}" if fail_count > 0
  puts ''
end
# Builds the path of the "big" avatar file of a Discuz! X user.
#
# The user id is left-padded with zeros to nine digits and split into
# directory levels of 3, 2 and 2 digits; the last two digits start the file name.
#
# @param user_id [Integer, String] Discuz! X uid
# @param absolute [Boolean] when true the path starts at DISCUZX_BASE_DIR,
#   otherwise at AVATAR_DIR
# @return [Array(String, String)] the path and the bare file name
def discuzx_avatar_fullpath(user_id, absolute=true)
  padded_id = user_id.to_s.rjust(9, '0')
  file_name = "#{padded_id[-2..-1]}_avatar_big.jpg"

  segments = [AVATAR_DIR, padded_id[0..2], padded_id[3..4], padded_id[5..6], file_name]
  segments.unshift(DISCUZX_BASE_DIR) if absolute

  [File.join(*segments), file_name]
end
# Extracts the id (pid) of the post that a piece of text replies to or quotes.
#
# Three forms are tried in order: the standard relative "findpost" link, the
# old absolute redirect.php link (labelled "old discuz 7 format" by the
# original author), and any pid=… inside a [quote] block.
#
# @param raw [String, nil] post text or quote content
# @return [String, nil] the captured post id, or nil when none is found
def find_post_id_by_quote_number(raw)
  # Escaped so that "." in the host name only matches a literal dot.
  site = Regexp.escape(ORIGINAL_SITE_PREFIX)
  patterns = [
    /\[url=forum\.php\?mod=redirect&goto=findpost&pid=(\d+)&ptid=\d+\]/, # standard
    /\[url=https?:\/\/#{site}\/redirect\.php\?goto=findpost&pid=(\d+)&ptid=\d+\]/, # old discuz 7 format
    /\[quote\][\S\s]*pid=(\d+)[\S\s]*\[\/quote\]/ # quote
  ]

  text = raw.to_s
  patterns.each do |pattern|
    match = pattern.match(text)
    return match[1] if match
  end

  nil
end
# Creates an upload from a base64 PNG that Discuz! X inlined into a post.
# (Per the original author: the copy of such images stored on disk is broken.)
#
# Everything after the "data:image/png;base64," prefix length is decoded,
# written to a temporary file and handed to create_upload as the system user.
#
# @param data [String, nil] the inlined image data
# @return [Array, nil] [upload, generated file name], or nil when there is no
#   data, nothing to decode, or the upload is missing or invalid
def upload_inline_image(data)
  return unless data

  puts 'Creating inline image'

  prefix_length = 'data:image/png;base64,'.length
  encoded_photo = data[prefix_length..-1]
  unless encoded_photo
    puts 'Error parsed inline photo', data[0..20]
    return
  end
  decoded = Base64.decode64(encoded_photo)

  real_filename = "#{SecureRandom.hex}.png"
  tempfile = Tempfile.new(['inline', '.png'])

  begin
    tempfile.binmode
    tempfile.write(decoded)
    tempfile.rewind

    upload = create_upload(Discourse::SYSTEM_USER_ID, tempfile, real_filename)
  ensure
    # Best-effort cleanup of the temporary file.
    tempfile.close rescue nil
    tempfile.unlink rescue nil
  end

  if upload.nil? || !upload.valid?
    puts "Upload not valid :("
    puts upload.errors.inspect if upload
    return nil
  end

  [upload, real_filename]
end
# Finds an attachment file and its real name in the database and uploads it.
#
# The attachment index row gives the number of the table that holds the full
# attachment record; that record gives the stored path and the original name.
#
# @param user [#id] owner of the new upload
# @param post [#id, #custom_fields] the imported post the attachment belongs to
# @param upload_id [Integer] Discuz! X attachment id (aid)
# @return [Array, nil] [upload, real file name], or nil when a record or the
#   file is missing, the upload is invalid, or a MySQL error occurs
def find_upload(user, post, upload_id)
  attachment_table = table_name 'forum_attachment'
  import_id = post.custom_fields['import_id']

  # search for table id
  sql = "SELECT a.pid post_id,
                a.aid upload_id,
                a.tableid table_id
         FROM #{attachment_table} a
         WHERE a.pid = #{import_id}
         AND a.aid = #{upload_id};"
  meta_data = mysql_query(sql).first
  unless meta_data
    puts "Couldn't find forum_attachment record meta data for post.id = #{post.id}, import_id = #{import_id}"
    return nil
  end

  # search for uploaded file meta data
  sql = "SELECT a.pid post_id,
                a.aid upload_id,
                a.tid topic_id,
                a.uid user_id,
                a.dateline uploaded_time,
                a.filename real_filename,
                a.attachment attachment_path,
                a.remote is_remote,
                a.description description,
                a.isimage is_image,
                a.thumb is_thumb
         FROM #{attachment_table}_#{meta_data['table_id']} a
         WHERE a.aid = #{upload_id};"
  row = mysql_query(sql).first
  unless row
    puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{import_id}"
    return nil
  end

  filename = File.join(DISCUZX_BASE_DIR, ATTACHMENT_DIR, row['attachment_path'])
  # File.exist? replaces File.exists?, which newer Ruby versions no longer provide.
  unless File.exist?(filename)
    puts "Attachment file doesn't exist: #{filename}"
    return nil
  end

  real_filename = row['real_filename']
  # A name that starts with a dot gets a random prefix (new string; the row is left untouched).
  real_filename = "#{SecureRandom.hex}#{real_filename}" if real_filename[0] == '.'

  upload = create_upload(user.id, filename, real_filename)
  if upload.nil? || !upload.valid?
    puts "Upload not valid :("
    puts upload.errors.inspect if upload
    return nil
  end

  return upload, real_filename
rescue Mysql2::Error => e
  puts "SQL Error"
  puts e.message
  puts sql
  return nil
end
2016-04-01 05:33:25 +08:00
# Returns the first argument that is not blank, or an empty string when every
# argument is blank (or none is given).
def first_exists(*items)
  candidates = items.reject { |candidate| candidate.blank? }
  candidates.first || ''
end
2015-01-29 15:06:30 +08:00
# Runs a SQL statement on the Discuz! X connection with the client's
# cache_rows option turned off, and returns whatever the client returns.
def mysql_query(sql)
  query_options = { cache_rows: false }
  @client.query(sql, **query_options)
end
end
# Script entry point: build the importer and call its inherited `perform`
# (defined outside this file; presumably it drives `execute` — confirm in base.rb).
ImportScripts::DiscuzX.new.perform