discourse/script/import_scripts/mylittleforum.rb
Sam Saffron 30990006a9 DEV: enable frozen string literal on all files
This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging
2019-05-13 09:31:32 +08:00

449 lines
14 KiB
Ruby

# frozen_string_literal: true
require "mysql2"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'htmlentities'
# Before running this script, paste these lines into your shell,
# then use arrow keys to edit the values
=begin
export DB_HOST="localhost"
export DB_NAME="mylittleforum"
export DB_PW=""
export DB_USER="root"
export TABLE_PREFIX="forum_"
export IMPORT_AFTER="1970-01-01"
export IMAGE_BASE="http://www.example.com/forum"
export BASE="forum"
=end
class ImportScripts::MylittleforumSQL < ImportScripts::Base
# Connection and import settings. Each value is read from the environment
# variable of the same name and falls back to the default shown here; `||=`
# keeps a constant that is already defined when this file is loaded.
DB_HOST ||= ENV['DB_HOST'] || "localhost"
DB_NAME ||= ENV['DB_NAME'] || "mylittleforum"
DB_PW ||= ENV['DB_PW'] || ""
DB_USER ||= ENV['DB_USER'] || "root"
TABLE_PREFIX ||= ENV['TABLE_PREFIX'] || "forum_"
IMPORT_AFTER ||= ENV['IMPORT_AFTER'] || "1970-01-01"
IMAGE_BASE ||= ENV['IMAGE_BASE'] || ""
# NOTE(review): this default ends with "/" while create_permalinks builds
# "#{BASE}/..." URLs, which yields a double slash -- confirm which form the
# old forum URLs need (the usage comment at the top of the file exports "forum").
BASE ||= ENV['BASE'] || "forum/"
BATCH_SIZE = 1000
CONVERT_HTML = true
# Per-record debug output is opt-in: the script stays quiet unless VERBOSE=TRUE
# is exported. (QUIET used to be assigned twice, which both triggered an
# "already initialized constant" warning and made VERBOSE ineffective.)
QUIET = ENV['VERBOSE'] != "TRUE"
FORCE_HOSTNAME = ENV['FORCE_HOSTNAME']

# Site settings
SiteSetting.disable_emails = "non-staff"
SiteSetting.force_hostname = FORCE_HOSTNAME if FORCE_HOSTNAME
# Sets up the importer: announces a non-default IMPORT_AFTER cutoff, runs the
# base-class initializer, creates the HTML entity decoder and opens the MySQL
# connection described by DB_HOST / DB_USER / DB_PW / DB_NAME.
#
# If the connection cannot be opened, the error and the expected environment
# variables are printed and the process exits with a non-zero status.
def initialize
  # Plain string comparison; works because the dates are ISO formatted.
  print_warning("Importing data after #{IMPORT_AFTER}") if IMPORT_AFTER > "1970-01-01"

  super
  @htmlentities = HTMLEntities.new

  begin
    @client = Mysql2::Client.new(
      host: DB_HOST,
      username: DB_USER,
      password: DB_PW,
      database: DB_NAME
    )
  rescue StandardError => e
    # StandardError rather than Exception so that Ctrl-C and other signals
    # are not mistaken for a login failure.
    puts '=' * 50
    puts e.message
    puts <<~EOM
      Cannot log in to database.
      Hostname: #{DB_HOST}
      Username: #{DB_USER}
      Password: #{DB_PW}
      database: #{DB_NAME}
      You should set these variables:
      export DB_HOST="localhost"
      export DB_NAME="mylittleforum"
      export DB_PW=""
      export DB_USER="root"
      export TABLE_PREFIX="forum_"
      export IMPORT_AFTER="1970-01-01"
      export IMAGE_BASE="http://www.example.com/forum"
      export BASE="forum"
      Exiting.
    EOM
    # Signal the failure to the calling shell instead of exiting with 0.
    exit 1
  end
end
# Runs every import phase in its fixed order and returns the result of the
# final phase (create_permalinks).
def execute
  phases = %i[
    import_users
    import_categories
    import_topics
    import_posts
    update_tl0
    create_permalinks
  ]
  phases.map { |phase| send(phase) }.last
end
# Imports rows of the `userdata` table whose last_login is later than
# IMPORT_AFTER, BATCH_SIZE rows at a time, and hands them to create_users.
#
# Rows without an e-mail address and rows whose id is already known to
# @lookup are skipped. user_type "admin" / "mod" is mapped to the admin /
# moderator flags.
def import_users
  puts '', "creating users"

  total_count = mysql_query("SELECT count(*) count FROM #{TABLE_PREFIX}userdata WHERE last_login > '#{IMPORT_AFTER}';").first['count']

  batches(BATCH_SIZE) do |offset|
    results = mysql_query(<<~SQL)
      SELECT user_id as UserID, user_name as username,
      user_real_name as Name,
      user_email as Email,
      user_hp as website,
      user_place as Location,
      profile as bio_raw,
      last_login as DateLastActive,
      user_ip as InsertIPAddress,
      user_pw as password,
      logins as days_visited,
      registered as DateInserted,
      user_type
      FROM #{TABLE_PREFIX}userdata
      WHERE last_login > '#{IMPORT_AFTER}'
      order by UserID ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};
    SQL

    break if results.size < 1
    next if all_records_exist?(:users, results.map { |u| u['UserID'].to_i })

    create_users(results, total: total_count, offset: offset) do |user|
      next if user['Email'].blank?
      next if @lookup.user_id_from_imported_user_id(user['UserID'])

      # username = fix_username(user['username'])
      {
        id: user['UserID'],
        email: user['Email'],
        username: user['username'],
        name: user['Name'],
        created_at: user['DateInserted'].nil? ? 0 : Time.zone.at(user['DateInserted']),
        bio_raw: user['bio_raw'],
        registration_ip_address: user['InsertIPAddress'],
        # The query aliases user_hp to "website"; reading 'user_hp' here
        # always returned nil.
        website: user['website'],
        password: user['password'],
        last_seen_at: user['DateLastActive'].nil? ? 0 : Time.zone.at(user['DateLastActive']),
        location: user['Location'],
        admin: user['user_type'] == "admin",
        moderator: user['user_type'] == "mod",
      }
    end
  end
end
# Returns a copy of +username+ with characters replaced or removed so that
# only letters, digits and single underscores separate its parts. The
# argument itself is left untouched (it may be frozen or still referenced by
# the caller). A warning is printed whenever the name had to be changed.
#
# Different inputs can collapse to the same result; callers must handle
# duplicates themselves.
def fix_username(username)
  fixed = username
    .gsub(/Dr\. /, "Dr")            # "Dr. Who" -> "DrWho"
    .gsub(/[ +!\/,*()?]/, "_")      # can't have these
    .gsub(/&/, "_and_")             # no &
    .gsub(/@/, "_at_")              # no @
    .gsub(/#/, "_hash_")            # no #
    .gsub(/\'/, "")                 # no apostrophes
    .gsub(/[._]+/, "_")             # can't have 2 special in a row
    .gsub(/_+/, "_")                # collapse repeated underscores
    .gsub(/_$/, "")                 # no trailing underscore

  print_warning("#{username} --> #{fixed}") if fixed != username
  fixed
end
# Loads every row of the `categories` table (ordered by id) and passes it to
# create_categories with HTML-unescaped name and description.
def import_categories
  puts "", "importing categories..."

  # The empty first line keeps the leading newline of the query text.
  category_rows = mysql_query(<<~SQL).to_a

    SELECT id as CategoryID,
    category as Name,
    description as Description
    FROM #{TABLE_PREFIX}categories
    ORDER BY CategoryID ASC
  SQL

  create_categories(category_rows) do |row|
    import_id = row['CategoryID']
    name = CGI.unescapeHTML(row['Name'])
    description = CGI.unescapeHTML(row['Description'])

    { id: import_id, name: name, description: description }
  end
end
# Imports thread starters: rows of the `entries` table with pid = 0 and a
# time later than IMPORT_AFTER. Rows are read BATCH_SIZE at a time and passed
# to create_posts under the import id "discussion#<id>".
#
# The body is run through clean_up; a non-blank youtube_link is normalised by
# clean_youtube and appended on its own line. Posts whose author is unknown
# are attributed to the system user.
def import_topics
  puts "", "importing topics..."

  total_count = mysql_query(<<~SQL).first['count']
    SELECT count(*) count FROM #{TABLE_PREFIX}entries
    WHERE time > '#{IMPORT_AFTER}'
    AND pid = 0;
  SQL

  batches(BATCH_SIZE) do |offset|
    # `id` is a tie-breaker: with LIMIT/OFFSET paging, rows sharing the same
    # `time` could otherwise be returned in a different order per query and
    # be skipped or duplicated across batches.
    discussions = mysql_query(<<~SQL)
      SELECT id as DiscussionID,
      category as CategoryID,
      subject as Name,
      text as Body,
      time as DateInserted,
      youtube_link as youtube,
      user_id as InsertUserID
      FROM #{TABLE_PREFIX}entries
      WHERE pid = 0
      AND time > '#{IMPORT_AFTER}'
      ORDER BY time ASC, id ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};
    SQL

    break if discussions.size < 1
    next if all_records_exist?(:posts, discussions.map { |t| "discussion#" + t['DiscussionID'].to_s })

    create_posts(discussions, total: total_count, offset: offset) do |discussion|
      raw = clean_up(discussion['Body'])

      unless discussion['youtube'].blank?
        raw += "\n#{clean_youtube(discussion['youtube'])}\n"
        # Debug aid only; it used to dump every such post unconditionally.
        print_warning(raw) unless QUIET
      end

      {
        id: "discussion#" + discussion['DiscussionID'].to_s,
        user_id: user_id_from_imported_user_id(discussion['InsertUserID']) || Discourse::SYSTEM_USER_ID,
        # titles are stored with backslash-escaped double quotes
        title: discussion['Name'].gsub('\\"', '"'),
        category: category_id_from_imported_category_id(discussion['CategoryID']),
        raw: raw,
        created_at: Time.zone.at(discussion['DateInserted']),
      }
    end
  end
end
# Imports replies: rows of the `entries` table with pid > 0 and a time later
# than IMPORT_AFTER. Rows are read BATCH_SIZE at a time and passed to
# create_posts under the import id "comment#<id>".
#
# A reply is skipped when its thread ("discussion#<tid>") was not imported or
# when its body is blank. The body is cleaned exactly once; a non-blank
# youtube_link is normalised by clean_youtube and appended on its own line.
def import_posts
  puts "", "importing posts..."

  total_count = mysql_query(<<~SQL).first['count']
    SELECT count(*) count
    FROM #{TABLE_PREFIX}entries
    WHERE pid > 0
    AND time > '#{IMPORT_AFTER}';
  SQL

  batches(BATCH_SIZE) do |offset|
    # `id` is a tie-breaker: with LIMIT/OFFSET paging, rows sharing the same
    # `time` could otherwise be skipped or duplicated across batches.
    comments = mysql_query(<<~SQL)
      SELECT id as CommentID,
      tid as DiscussionID,
      text as Body,
      time as DateInserted,
      youtube_link as youtube,
      user_id as InsertUserID
      FROM #{TABLE_PREFIX}entries
      WHERE pid > 0
      AND time > '#{IMPORT_AFTER}'
      ORDER BY time ASC, id ASC
      LIMIT #{BATCH_SIZE}
      OFFSET #{offset};
    SQL

    break if comments.size < 1
    next if all_records_exist?(:posts, comments.map { |comment| "comment#" + comment['CommentID'].to_s })

    create_posts(comments, total: total_count, offset: offset) do |comment|
      topic = topic_lookup_from_imported_post_id("discussion#" + comment['DiscussionID'].to_s)
      next unless topic
      next if comment['Body'].blank?

      raw = clean_up(comment['Body'])
      unless comment['youtube'].blank?
        raw += "\n#{clean_youtube(comment['youtube'])}\n"
      end

      {
        id: "comment#" + comment['CommentID'].to_s,
        user_id: user_id_from_imported_user_id(comment['InsertUserID']) || Discourse::SYSTEM_USER_ID,
        topic_id: topic[:topic_id],
        # Already cleaned above; a second clean_up pass would decode HTML
        # entities twice and differ from how import_topics treats bodies.
        raw: raw,
        created_at: Time.zone.at(comment['DateInserted'])
      }
    end
  end
end
# Turns the content of a youtube_link column into a URL.
#
# The value is cleaned with clean_up. An <iframe> or <object> embed is
# reduced to its src attribute on a line of its own, and a protocol-relative
# "//" at the start of a line becomes "https://". If the result contains no
# "http", it is treated as a bare video id: ids shorter than 8 characters or
# containing <, > or = are rejected (empty string), anything else is appended
# to https://www.youtube.com/watch?v=.
#
# Returns the resulting string, which is empty for blank or rejected input.
def clean_youtube(youtube_raw)
  # dup: clean_up hands back a frozen "" for blank input, and gsub! raises
  # FrozenError on a frozen string.
  youtube_cooked = clean_up(youtube_raw.to_s).dup

  # get just src from <iframe> and put on a line by itself
  youtube_cooked.gsub!(/<iframe.+?src="(\S+?)".+?<\/iframe>/mix) { "\n#{$1}\n" }
  youtube_cooked.gsub!(/<object.+?src="(\S+?)".+?<\/object>/mix) { "\n#{$1}\n" }
  youtube_cooked.gsub!(/^\/\//, "https://") # make sure it has a protocol

  unless youtube_cooked.include?("http") # handle case of only youtube object number
    if youtube_cooked.length < 8 || youtube_cooked =~ /[<>=]/
      # probably not a youtube id
      youtube_cooked = ""
    else
      youtube_cooked = 'https://www.youtube.com/watch?v=' + youtube_cooked
    end
  end

  print_warning("#{'-' * 40}\nBefore: #{youtube_raw}\nAfter: #{youtube_cooked}") unless QUIET
  youtube_cooked
end
# Converts a forum post body into the markup handed to Discourse.
#
# Steps, in order: decode HTML entities, drop backslashes in front of quotes,
# rewrite [postedby], translate BBCode (b/i/u, url/link, img, div, code/pre),
# put YouTube links on their own line, strip [center]/[size]/[color], and
# turn literal "\n", "\r\n" and "\t" sequences into real whitespace. When
# CONVERT_HTML is false, chevrons outside of backtick spans are replaced by
# HTML entities.
#
# Returns a new, unfrozen string; blank input yields "".
def clean_up(raw)
  # `"".dup` because string literals in this file are frozen and callers may
  # modify the result in place.
  return "".dup if raw.blank?

  # decode HTML entities
  raw = @htmlentities.decode(raw)

  # don't \ quotes
  raw = raw.gsub('\\"', '"')
  raw = raw.gsub("\\'", "'")

  # [postedby] -> link to @user. Runs before the [b] conversion below because
  # the pattern relies on the literal [b]...[/b] pair (the result of this
  # substitution also used to be thrown away).
  raw = raw.gsub(/\[postedby\](.+?)\[b\](.+?)\[\/b\]\[\/postedby\]/i) { "#{$1}@#{$2}" }

  raw = raw.gsub(/\[b\]/i, "<strong>")
  raw = raw.gsub(/\[\/b\]/i, "</strong>")
  raw = raw.gsub(/\[i\]/i, "<em>")
  raw = raw.gsub(/\[\/i\]/i, "</em>")
  # [u] is mapped to <em> as well
  raw = raw.gsub(/\[u\]/i, "<em>")
  raw = raw.gsub(/\[\/u\]/i, "</em>")

  # bare URL & LINK
  raw = raw.gsub(/\[url\](\S+)\[\/url\]/im) { "#{$1}" }
  raw = raw.gsub(/\[link\](\S+)\[\/link\]/im) { "#{$1}" }

  # URL & LINK with text
  raw = raw.gsub(/\[url=(\S+?)\](.*?)\[\/url\]/im) { "<a href=\"#{$1}\">#{$2}</a>" }
  raw = raw.gsub(/\[link=(\S+?)\](.*?)\[\/link\]/im) { "<a href=\"#{$1}\">#{$2}</a>" }

  # images with explicit size: [img=W,H]src[/img]. Handled before the other
  # [img=...] forms, which would otherwise treat "W,H" as a local file name.
  raw = raw.gsub(/\[img=(\d+),(\d+)\]([^\]]*)\[\/img\]/im, '<img width="\1" height="\2" src="\3">')

  # remote images
  raw = raw.gsub(/\[img\](https?:.+?)\[\/img\]/im) { "<img src=\"#{$1}\">" }
  raw = raw.gsub(/\[img=(https?.+?)\](.+?)\[\/img\]/im) { "<img src=\"#{$1}\" alt=\"#{$2}\">" }

  # local images
  raw = raw.gsub(/\[img\](.+?)\[\/img\]/i) { "<img src=\"#{IMAGE_BASE}/#{$1}\">" }
  raw = raw.gsub(/\[img=(.+?)\](https?.+?)\[\/img\]/im) { "<img src=\"#{IMAGE_BASE}/#{$1}\" alt=\"#{$2}\">" }

  # [div]s are really [quote]s
  raw = raw.gsub(/\[div\]/mix, "[quote]")
  raw = raw.gsub(/\[\/div\]/mix, "[/quote]")

  # CODE: content may span several words and lines
  raw = raw.gsub(/\[code\](.+?)\[\/code\]/im) { "```\n#{$1}\n```" }
  raw = raw.gsub(/\[pre\](.+?)\[\/pre\]/im) { "```\n#{$1}\n```" }

  raw = raw.gsub(/(https:\/\/youtu\S+)/i) { "\n#{$1}\n" } # youtube links on line by themselves

  # no center
  raw = raw.gsub(/\[\/?center\]/i, "")

  # no size
  raw = raw.gsub(/\[\/?size.*?\]/i, "")

  ### FROM VANILLA:

  # fix whitespaces
  raw = raw.gsub(/(\\r)?\\n/, "\n")
    .gsub("\\t", "\t")

  unless CONVERT_HTML
    # replace all chevrons with HTML entities
    # NOTE: must be done
    # - AFTER all the "code" processing
    # - BEFORE the "quote" processing
    # (\u2603 is a temporary placeholder protecting chevrons inside backticks)
    raw = raw.gsub(/`([^`]+)`/im) { "`" + $1.gsub("<", "\u2603") + "`" }
      .gsub("<", "&lt;")
      .gsub("\u2603", "<")

    raw = raw.gsub(/`([^`]+)`/im) { "`" + $1.gsub(">", "\u2603") + "`" }
      .gsub(">", "&gt;")
      .gsub("\u2603", ">")
  end

  # Remove the color tag
  raw = raw.gsub(/\[color=[#a-z0-9]+\]/i, "")
  raw = raw.gsub(/\[\/color\]/i, "")

  ### END VANILLA:

  raw
end
# Returns a Guardian built for Discourse.system_user; the instance is created
# on first use and cached in @_staff_guardian afterwards.
def staff_guardian
  return @_staff_guardian if @_staff_guardian

  @_staff_guardian = Guardian.new(Discourse.system_user)
end
# Runs +sql+ on the Mysql2 client stored in @client and returns whatever
# @client.query returns. All database reads of this script go through here.
def mysql_query(sql)
@client.query(sql)
# Kept for reference only; this variant is disabled because of the crash noted below.
# @client.query(sql, cache_rows: false) #segfault: cache_rows: false causes segmentation fault
end
# Creates Permalink records that redirect the old forum URLs (all prefixed
# with BASE) to the imported records:
#
#   user-id-<id>.html        -> /u/<username>  (users with import_id and import_username)
#   forum_entry-id-<id>.html -> the topic (first posts) or the post (replies)
#   forum-category-<id>.html -> the category
#
# <id> is the part of the stored import_id after the last "#". Creation is
# best effort: a failure for one record is reported (unless QUIET) and the
# run continues. A dot is printed per processed record.
def create_permalinks
  puts '', 'Creating redirects...', ''

  puts '', 'Users...', ''
  User.find_each do |u|
    ucf = u.custom_fields
    next unless ucf && ucf["import_id"] && ucf["import_username"]

    create_permalink_quietly(url: "#{BASE}/user-id-#{ucf['import_id']}.html", external_url: "/u/#{u.username}")
    print '.'
  end

  puts '', 'Posts...', ''
  Post.find_each do |post|
    pcf = post.custom_fields
    next unless pcf && pcf["import_id"]

    id = pcf["import_id"].split('#').last
    url = "#{BASE}/forum_entry-id-#{id}.html"

    # post.topic_id instead of post.topic.id: no extra query per post and no
    # failure when the topic record cannot be loaded.
    if post.post_number == 1
      create_permalink_quietly(url: url, topic_id: post.topic_id)
      print_warning("forum_entry-id-#{id}.html --> /t/#{post.topic_id}") unless QUIET
    else
      create_permalink_quietly(url: url, post_id: post.id)
      print_warning("forum_entry-id-#{id}.html --> /t/#{post.topic_id}/#{post.post_number}") unless QUIET
    end
    print '.'
  end

  puts '', 'Categories...', ''
  Category.find_each do |cat|
    ccf = cat.custom_fields
    next unless id = ccf["import_id"]

    print_warning("forum-category-#{id}.html --> /c/#{cat.id}") unless QUIET
    create_permalink_quietly(url: "#{BASE}/forum-category-#{id}.html", category_id: cat.id)
    print '.'
  end
end

# Creates one Permalink from +attributes+. Errors (for example a permalink
# left over from an earlier run) do not abort the import; they are reported
# through print_warning unless QUIET is set. Returns nil on failure.
def create_permalink_quietly(attributes)
  Permalink.create(attributes)
rescue StandardError => e
  print_warning("Could not create permalink #{attributes[:url]}: #{e.message}") unless QUIET
  nil
end
# Writes the string form of +message+ to standard error, followed by a
# newline.
def print_warning(message)
  text = message.to_s
  $stderr.puts(text)
end
end
# Script entry point: build the importer (which opens the database connection
# in #initialize) and start it. `perform` is not defined in this file --
# presumably inherited from ImportScripts::Base; verify against base.rb.
ImportScripts::MylittleforumSQL.new.perform