discourse/script/import_scripts/smf1.rb
Sam Saffron 30990006a9 DEV: enable frozen string literal on all files
This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging
2019-05-13 09:31:32 +08:00

658 lines
20 KiB
Ruby

# frozen_string_literal: true
require "mysql2"
require "htmlentities"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
class ImportScripts::Smf1 < ImportScripts::Base
BATCH_SIZE ||= 5000
UPLOADS_DIR ||= ENV["UPLOADS_DIR"].presence
FORUM_URL ||= ENV["FORUM_URL"].presence
def initialize
fail "UPLOADS_DIR env variable is required (example: '/path/to/attachments')" unless UPLOADS_DIR
fail "FORUM_URL env variable is required (example: 'https://domain.com/forum')" unless FORUM_URL
@client = Mysql2::Client.new(
host: ENV["DB_HOST"] || "localhost",
username: ENV["DB_USER"] || "root",
password: ENV["DB_PW"],
database: ENV["DB_NAME"],
)
check_version!
super
@htmlentities = HTMLEntities.new
puts "Loading existing usernames..."
@old_to_new_usernames = UserCustomField.joins(:user).where(name: "import_username").pluck("value", "users.username").to_h
puts "Loading pm mapping..."
@pm_mapping = {}
Topic
.joins(:topic_allowed_users)
.where(archetype: Archetype.private_message)
.where("title NOT ILIKE 'Re: %'")
.group(:id)
.order(:id)
.pluck("string_agg(topic_allowed_users.user_id::text, ',' ORDER BY topic_allowed_users.user_id), title, topics.id")
.each do |users, title, topic_id|
@pm_mapping[users] ||= {}
@pm_mapping[users][title] ||= []
@pm_mapping[users][title] << topic_id
end
end
def execute
SiteSetting.permalink_normalizations = "/(.+)\\?.*/\\1"
import_groups
import_users
import_categories
import_posts
import_personal_posts
import_attachments
import_likes
import_feedbacks
import_banned_domains
import_banned_emails
import_banned_ips
end
def check_version!
version = mysql_query("SELECT value FROM smf_settings WHERE variable = 'smfVersion' LIMIT 1").first["value"]
fail "Incompatible version (#{version})" unless version&.start_with?("1.")
end
def import_groups
puts "", "Importing groups..."
# skip administrators/moderators
groups = mysql_query("SELECT id_group, groupName FROM smf_membergroups WHERE id_group > 3").to_a
create_groups(groups) do |g|
next if g["groupName"].blank?
{
id: g["id_group"],
full_name: g["groupName"],
}
end
end
def import_users
puts "", "Importing users..."
last_user_id = -1
total = mysql_query("SELECT COUNT(*) count FROM smf_members").first["count"]
batches(BATCH_SIZE) do |offset|
users = mysql_query(<<~SQL
SELECT m.id_member
, memberName
, dateRegistered
, id_group
, lastLogin
, realName
, emailAddress
, personalText
, CASE WHEN birthdate > '1900-01-01' THEN birthdate ELSE NULL END birthdate
, websiteUrl
, location
, usertitle
, memberIP
, memberIP2
, is_activated
, additionalGroups
, avatar
, id_attach
, attachmentType
, filename
FROM smf_members m
LEFT JOIN smf_attachments a ON a.id_member = m.id_member
WHERE m.id_member > #{last_user_id}
ORDER BY m.id_member
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if users.empty?
last_user_id = users[-1]["id_member"]
user_ids = users.map { |u| u["id_member"] }
next if all_records_exist?(:users, user_ids)
create_users(users, total: total, offset: offset) do |u|
created_at = Time.zone.at(u["dateRegistered"])
group_ids = [u["id_group"], *u["additionalGroups"].split(",").map(&:to_i)].uniq
{
id: u["id_member"],
username: u["memberName"],
created_at: created_at,
first_seen_at: created_at,
primary_group_id: group_id_from_imported_group_id(u["id_group"]),
admin: group_ids.include?(1),
moderator: group_ids.include?(2) || group_ids.include?(3),
last_seen_at: Time.zone.at(u["lastLogin"]),
name: u["realName"].presence,
email: u["emailAddress"],
bio_raw: pre_process_raw(u["personalText"].presence),
date_of_birth: u["birthdate"],
website: u["website"].presence,
location: u["location"].presence,
title: u["usertitle"].presence,
registration_ip_address: u["memberIP"],
ip_address: u["memberIP2"],
active: u["is_activated"] == 1,
approved: u["is_activated"] == 1,
post_create_action: proc do |user|
# usernames
@old_to_new_usernames[u["memberName"]] = user.username
# groups
GroupUser.transaction do
group_ids.each do |gid|
(group_id = group_id_from_imported_group_id(gid)) && GroupUser.find_or_create_by(user: user, group_id: group_id)
end
end
# avatar
avatar_url = nil
if u["avatar"].present?
if u["avatar"].start_with?("http")
avatar_url = u["avatar"]
elsif u["avatar"].start_with?("avatar_")
avatar_url = "#{FORUM_URL}/avatar-members/#{u["avatar"]}"
end
end
avatar_url ||= if u["attachmentType"] == 0 && u["id_attach"].present?
"#{FORUM_URL}/index.php?action=dlattach;attach=#{u["id_attach"]};type=avatar"
elsif u["attachmentType"] == 1 && u["filename"].present?
"#{FORUM_URL}/avatar-members/#{u["filename"]}"
end
if avatar_url.present?
UserAvatar.import_url_for_user(avatar_url, user) rescue nil
end
end
}
end
end
end
def import_categories
puts "", "Importing categories..."
categories = mysql_query(<<~SQL
SELECT id_board
, id_parent
, boardOrder
, name
, description
FROM smf_boards
ORDER BY id_parent, id_board
SQL
).to_a
parent_categories = categories.select { |c| c["id_parent"] == 0 }
children_categories = categories.select { |c| c["id_parent"] != 0 }
create_categories(parent_categories) do |c|
{
id: c["id_board"],
name: c["name"],
description: pre_process_raw(c["description"].presence),
position: c["boardOrder"],
post_create_action: proc do |category|
Permalink.find_or_create_by(url: "forums/index.php/board,#{c["id_board"]}.0.html", category_id: category.id)
end,
}
end
create_categories(children_categories) do |c|
{
id: c["id_board"],
parent_category_id: category_id_from_imported_category_id(c["id_parent"]),
name: c["name"],
description: pre_process_raw(c["description"].presence),
position: c["boardOrder"],
post_create_action: proc do |category|
Permalink.find_or_create_by(url: "forums/index.php/board,#{c["id_board"]}.0.html", category_id: category.id)
end,
}
end
end
def import_posts
puts "", "Importing posts..."
last_post_id = -1
total = mysql_query("SELECT COUNT(*) count FROM smf_messages").first["count"]
batches(BATCH_SIZE) do |offset|
posts = mysql_query(<<~SQL
SELECT m.id_msg
, m.id_topic
, m.id_board
, m.posterTime
, m.id_member
, m.subject
, m.body
, t.isSticky
, t.id_first_msg
, t.numViews
FROM smf_messages m
JOIN smf_topics t ON t.id_topic = m.id_topic
WHERE m.id_msg > #{last_post_id}
ORDER BY m.id_msg
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if posts.empty?
last_post_id = posts[-1]["id_msg"]
post_ids = posts.map { |p| p["id_msg"] }
next if all_records_exist?(:post, post_ids)
create_posts(posts, total: total, offset: offset) do |p|
created_at = Time.at(p["posterTime"])
post = {
id: p["id_msg"],
created_at: created_at,
user_id: user_id_from_imported_user_id(p["id_member"]) || -1,
raw: pre_process_raw(p["body"]),
}
if p["id_msg"] == p["id_first_msg"]
post[:category] = category_id_from_imported_category_id(p["id_board"])
post[:title] = @htmlentities.decode(p["subject"])
post[:views] = p["numViews"]
post[:pinned_at] = created_at if p["isSticky"] == 1
post[:post_create_action] = proc do |pp|
Permalink.find_or_create_by(url: "forums/index.php/topic,#{p["id_topic"]}.0.html", topic_id: pp.topic_id)
end
elsif parent = topic_lookup_from_imported_post_id(p["id_first_msg"])
post[:topic_id] = parent[:topic_id]
post[:post_create_action] = proc do |pp|
Permalink.find_or_create_by(url: "forums/index.php/topic,#{p["id_topic"]}.msg#{p["id_msg"]}.html", post_id: pp.id)
end
else
next
end
post
end
end
end
def import_personal_posts
puts "", "Importing personal posts..."
last_post_id = -1
total = mysql_query("SELECT COUNT(*) count FROM smf_personal_messages WHERE deletedBySender = 0").first["count"]
batches(BATCH_SIZE) do |offset|
posts = mysql_query(<<~SQL
SELECT id_pm
, id_member_from
, msgtime
, subject
, body
, (SELECT GROUP_CONCAT(id_member) FROM smf_pm_recipients r WHERE r.id_pm = pm.id_pm) recipients
FROM smf_personal_messages pm
WHERE deletedBySender = 0
AND id_pm > #{last_post_id}
ORDER BY id_pm
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if posts.empty?
last_post_id = posts[-1]["id_pm"]
post_ids = posts.map { |p| "pm-#{p["id_pm"]}" }
next if all_records_exist?(:post, post_ids)
create_posts(posts, total: total, offset: offset) do |p|
next unless user_id = user_id_from_imported_user_id(p["id_member_from"])
next if p["recipients"].blank?
recipients = p["recipients"].split(",").map { |id| user_id_from_imported_user_id(id) }.compact.uniq
next if recipients.empty?
id = "pm-#{p["id_pm"]}"
next if post_id_from_imported_post_id(id)
post = {
id: id,
created_at: Time.at(p["msgtime"]),
user_id: user_id,
raw: pre_process_raw(p["body"]),
}
users = (recipients + [user_id]).sort.uniq.join(",")
title = @htmlentities.decode(p["subject"])
if topic_id = find_pm_topic_id(users, title)
post[:topic_id] = topic_id
else
post[:archetype] = Archetype.private_message
post[:title] = title
post[:target_usernames] = User.where(id: recipients).pluck(:username)
post[:post_create_action] = proc do |action_post|
@pm_mapping[users] ||= {}
@pm_mapping[users][title] ||= []
@pm_mapping[users][title] << action_post.topic_id
end
end
post
end
end
end
def find_pm_topic_id(users, title)
return unless title.start_with?("Re: ")
return unless @pm_mapping[users]
title = title.gsub(/^(Re: )+/i, "")
return unless @pm_mapping[users][title]
@pm_mapping[users][title][-1]
end
def import_attachments
puts "", "Importing attachments..."
count = 0
last_upload_id = -1
total = mysql_query("SELECT COUNT(*) count FROM smf_attachments WHERE id_msg IS NOT NULL").first["count"]
batches(BATCH_SIZE) do |offset|
uploads = mysql_query(<<~SQL
SELECT id_attach
, id_msg
, filename
, file_hash
FROM smf_attachments
WHERE id_msg IS NOT NULL
AND id_attach > #{last_upload_id}
ORDER BY id_attach
LIMIT #{BATCH_SIZE}
SQL
).to_a
break if uploads.empty?
last_upload_id = uploads[-1]["id_attach"]
uploads.each do |u|
count += 1
next unless post = PostCustomField.joins(:post).find_by(name: "import_id", value: u["id_msg"].to_s)&.post
path = File.join(UPLOADS_DIR, "#{u["id_attach"]}_#{u["file_hash"]}")
next unless File.exists?(path) && File.size(path) > 0
if upload = create_upload(post.user_id, path, u["filename"])
html = html_for_upload(upload, u["filename"])
unless post.raw[html] || PostUpload.where(upload: upload, post: post).exists?
post.raw += "\n\n#{html}\n\n"
post.save
PostUpload.create(upload: upload, post: post)
end
end
print_status(count, total, get_start_time("attachments"))
end
end
end
def import_likes
return if mysql_query("SHOW TABLES LIKE 'smf_thank_you_post'").first.nil?
puts "", "Importing likes..."
count = 0
total = mysql_query("SELECT COUNT(*) count FROM smf_thank_you_post WHERE thx_time > 0").first["count"]
like = PostActionType.types[:like]
mysql_query("SELECT id_msg, id_member, thx_time FROM smf_thank_you_post WHERE thx_time > 0 ORDER BY id_thx_post").each do |l|
print_status(count += 1, total, get_start_time("likes"))
next unless post_id = post_id_from_imported_post_id(l["id_msg"])
next unless user_id = user_id_from_imported_user_id(l["id_member"])
next if PostAction.where(post_action_type_id: like, post_id: post_id, user_id: user_id).exists?
PostAction.create(post_action_type_id: like, post_id: post_id, user_id: user_id, created_at: Time.at(l["thx_time"]))
end
end
FEEDBACKS ||= -"feedbacks"
def import_feedbacks
return if mysql_query("SHOW TABLES LIKE 'smf_feedback'").first.nil?
puts "", "Importing feedbacks..."
User.register_custom_field_type(FEEDBACKS, :json)
count = 0
total = mysql_query("SELECT COUNT(*) count FROM smf_feedback WHERE approved").first["count"]
mysql_query(<<~SQL
SELECT feedbackid
, id_member
, feedbackmember_id
, saledate
, saletype
, salevalue
, comment_short
, comment_long
FROM smf_feedback
WHERE approved
ORDER BY feedbackid
SQL
).each do |f|
print_status(count += 1, total, get_start_time("feedbacks"))
next unless user_id_from = user_id_from_imported_user_id(f["feedbackmember_id"])
next unless user_id_to = user_id_from_imported_user_id(f["id_member"])
next unless user = User.find_by(id: user_id_to)
feedbacks = user.custom_fields[FEEDBACKS] || []
next if feedbacks.find { |ff| ff["id"] == f["feedbackid"] }
feedbacks << {
id: f["feedbackid"],
created_at: Time.at(f["saledate"]),
from: user_id_from,
type: f["saletype"],
value: f["salevalue"],
comment_short: @htmlentities.decode(f["comment_short"]).strip.presence,
comment_long: @htmlentities.decode(f["comment_long"]).strip.presence,
}
user.custom_fields[FEEDBACKS] = feedbacks.to_json
user.save_custom_fields
end
end
def import_banned_domains
puts "", "Importing banned email domains..."
blacklist = SiteSetting.email_domains_blacklist.split("|")
banned_domains = mysql_query("SELECT SUBSTRING(email_address, 3) domain FROM smf_ban_items WHERE email_address RLIKE '^%@[^%]+$' GROUP BY email_address").map { |r| r["domain"] }
SiteSetting.email_domains_blacklist = (blacklist + banned_domains).uniq.sort.join("|")
end
def import_banned_emails
puts "", "Importing banned emails..."
count = 0
banned_emails = mysql_query("SELECT email_address FROM smf_ban_items WHERE email_address RLIKE '^[^%]+@[^%]+$' GROUP BY email_address").map { |r| r["email_address"] }
banned_emails.each do |email|
print_status(count += 1, banned_emails.size, get_start_time("banned_emails"))
ScreenedEmail.find_or_create_by(email: email)
end
end
def import_banned_ips
puts "", "Importing banned IPs..."
count = 0
banned_ips = mysql_query(<<~SQL
SELECT CONCAT_WS('.', ip_low1, ip_low2, ip_low3, ip_low4) low
, CONCAT_WS('.', ip_high1, ip_high2, ip_high3, ip_high4) high
, hits
FROM smf_ban_items
WHERE (ip_low1 + ip_low2 + ip_low3 + ip_low4 + ip_high1 + ip_high2 + ip_high3 + ip_high4) > 0
GROUP BY low, high, hits;
SQL
).to_a
banned_ips.each do |r|
print_status(count += 1, banned_ips.size, get_start_time("banned_ips"))
if r["low"] == r["high"]
if !ScreenedIpAddress.where("? <<= ip_address", r["low"]).exists?
ScreenedIpAddress.create(ip_address: r["low"], match_count: r["hits"])
end
else
low_values = r["low"].split(".").map(&:to_i)
high_values = r["high"].split(".").map(&:to_i)
first_diff = low_values.zip(high_values).count { |a, b| a == b }
first_diff -= 1 if low_values[first_diff] == 0 && high_values[first_diff] == 255
prefix = low_values[0...first_diff]
suffix = [0] * (3 - first_diff)
mask = 8 * (first_diff + 1)
values = (low_values[first_diff]..high_values[first_diff])
hits = (r["hits"] / [1, values.count].max).floor
values.each do |v|
range_values = prefix + [v] + suffix
ip_address = "#{range_values.join(".")}/#{mask}"
if !ScreenedIpAddress.where("? <<= ip_address", ip_address).exists?
ScreenedIpAddress.create(ip_address: ip_address, match_count: hits)
end
end
end
end
ScreenedIpAddress.where(last_match_at: nil).update_all(last_match_at: Time.new(2000, 01, 01))
puts "", "Rolling up..."
ScreenedIpAddress.roll_up
end
IGNORED_BBCODE ||= %w{
black blue center color email flash font glow green iurl left list move red
right shadown size table time white
}
def pre_process_raw(raw)
return "" if raw.blank?
raw = @htmlentities.decode(raw)
# [acronym]
raw.gsub!(/\[acronym=([^\]]+)\](.*?)\[\/acronym\]/im) { %{<abbr title="#{$1}">#{$2}</abbr>} }
# [br]
raw.gsub!(/\[br\]/i, "\n")
raw.gsub!(/<br\s*\/?>/i, "\n")
# [hr]
raw.gsub!(/\[hr\]/i, "<hr/>")
# [sub]
raw.gsub!(/\[sub\](.*?)\[\/sub\]/im) { "<sub>#{$1}</sub>" }
# [sup]
raw.gsub!(/\[sup\](.*?)\[\/sup\]/im) { "<sup>#{$1}</sup>" }
# [html]
raw.gsub!(/\[html\]/i, "\n```html\n")
raw.gsub!(/\[\/html\]/i, "\n```\n")
# [php]
raw.gsub!(/\[php\]/i, "\n```php\n")
raw.gsub!(/\[\/php\]/i, "\n```\n")
# [code]
raw.gsub!(/\[\/?code\]/i, "\n```\n")
# [pre]
raw.gsub!(/\[\/?pre\]/i, "\n```\n")
# [tt]
raw.gsub!(/\[\/?tt\]/i, "`")
# [ftp]
raw.gsub!(/\[ftp/i, "[url")
raw.gsub!(/\[\/ftp\]/i, "[/url]")
# [me]
raw.gsub!(/\[me=([^\]]*)\](.*?)\[\/me\]/im) { "_\\* #{$1} #{$2}_" }
# [li]
raw.gsub!(/\[li\](.*?)\[\/li\]/im) { "- #{$1}" }
# puts [img] on their own line
raw.gsub!(/\[img[^\]]*\](.*?)\[\/img\]/im) { "\n#{$1}\n" }
# puts [youtube] on their own line
raw.gsub!(/\[youtube\](.*?)\[\/youtube\]/im) { "\n#{$1}\n" }
IGNORED_BBCODE.each { |code| raw.gsub!(/\[#{code}[^\]]*\](.*?)\[\/#{code}\]/im, '\1') }
# ensure [/quote] are on their own line
raw.gsub!(/\s*\[\/quote\]\s*/im, "\n[/quote]\n")
# [quote]
raw.gsub!(/\s*\[quote (.+?)\]\s/im) {
params = $1
post_id = params[/msg(\d+)/, 1]
username = params[/author=(.+) link=/, 1]
username = @old_to_new_usernames[username] if @old_to_new_usernames.has_key?(username)
if t = topic_lookup_from_imported_post_id(post_id)
%{\n[quote="#{username},post:#{t[:post_number]},topic:#{t[:topic_id]}"]\n}
else
%{\n[quote="#{username}"]\n}
end
}
# remove tapatalk mess
raw.gsub!(/Sent from .+? using \[url=.*?\].+?\[\/url\]/i, "")
raw.gsub!(/Sent from .+? using .+?\z/i, "")
# clean URLs
raw.gsub!(/\[url=(.+?)\]\1\[\/url\]/i, '\1')
raw
end
def mysql_query(sql)
@client.query(sql)
end
end
ImportScripts::Smf1.new.perform