discourse/script/bulk_import/base.rb
Sam Saffron 0c52537f10 DEV: update rubocop to version 0.77
We like to stay as close as possible to latest with rubocop cause the cops
get better.

This update required some code changes; specifically, the default is now to avoid
explicit returns where an implicit return suffices.

Also this renames a few rules
2019-12-10 11:48:39 +11:00

785 lines
26 KiB
Ruby

# frozen_string_literal: true
if ARGV.include?('bbcode-to-md')
# Replace (most) bbcode with markdown before creating posts.
# This will dramatically clean up the final posts in Discourse.
#
# In a temp dir:
#
# git clone https://github.com/nlalonde/ruby-bbcode-to-md.git
# cd ruby-bbcode-to-md
# gem build ruby-bbcode-to-md.gemspec
# gem install ruby-bbcode-to-md-*.gem
require 'ruby-bbcode-to-md'
end
require "pg"
require "set"
require "redcarpet"
require "htmlentities"
puts "Loading application..."
require_relative "../../config/environment"
require_relative '../import_scripts/base/uploader'
module BulkImport; end
class BulkImport::Base
# Literal "now()" used as the value of timestamp columns in the COPY data.
NOW ||= "now()".freeze
# Imported topic/post ids are split into a public and a private range around this value.
PRIVATE_OFFSET ||= 1 << 30
# Maps a source database charset name (taken from ENV["DB_CHARSET"]) to the
# Ruby Encoding used by normalize_charset. A nil value means no Ruby encoding
# is mapped for that charset.
# rubocop:disable Layout/HashAlignment
CHARSET_MAP = {
  "armscii8" => nil,
  "ascii"    => Encoding::US_ASCII,
  "big5"     => Encoding::Big5,
  "binary"   => Encoding::ASCII_8BIT,
  "cp1250"   => Encoding::Windows_1250,
  "cp1251"   => Encoding::Windows_1251,
  "cp1256"   => Encoding::Windows_1256,
  "cp1257"   => Encoding::Windows_1257,
  "cp850"    => Encoding::CP850,
  "cp852"    => Encoding::CP852,
  "cp866"    => Encoding::IBM866,
  "cp932"    => Encoding::Windows_31J,
  "dec8"     => nil,
  "eucjpms"  => Encoding::EucJP_ms,
  "euckr"    => Encoding::EUC_KR,
  "gb2312"   => Encoding::EUC_CN,
  "gbk"      => Encoding::GBK,
  "geostd8"  => nil,
  "greek"    => Encoding::ISO_8859_7,
  "hebrew"   => Encoding::ISO_8859_8,
  "hp8"      => nil,
  "keybcs2"  => nil,
  "koi8r"    => Encoding::KOI8_R,
  "koi8u"    => Encoding::KOI8_U,
  "latin1"   => Encoding::ISO_8859_1,
  "latin2"   => Encoding::ISO_8859_2,
  "latin5"   => Encoding::ISO_8859_9,
  "latin7"   => Encoding::ISO_8859_13,
  "macce"    => Encoding::MacCentEuro,
  "macroman" => Encoding::MacRoman,
  "sjis"     => Encoding::SHIFT_JIS,
  "swe7"     => nil,
  "tis620"   => Encoding::TIS_620,
  "ucs2"     => Encoding::UTF_16BE,
  "ujis"     => Encoding::EucJP_ms,
  "utf8"     => Encoding::UTF_8,
}
# rubocop:enable Layout/HashAlignment
# Builds the shared import state: a raw PG connection plus COPY row encoder,
# the upload helper, the HTML entity decoder, the source encoding looked up
# from ENV["DB_CHARSET"] (default "utf8") and the Markdown renderer.
def initialize
  source_charset = ENV.fetch("DB_CHARSET", "utf8")
  db_config = ActiveRecord::Base.connection_config

  @encoder = PG::TextEncoder::CopyRow.new
  # If the database needs a password, add e.g. `password: "discourse"` to the options below.
  @raw_connection = PG.connect(
    dbname: db_config[:database],
    host: db_config[:host_names]&.first,
    port: db_config[:port]
  )

  @uploader = ImportScripts::Uploader.new
  @html_entities = HTMLEntities.new
  @encoding = CHARSET_MAP[source_charset]
  @bbcode_to_md = true if use_bbcode_to_md?

  html_renderer = Redcarpet::Render::HTML.new(hard_wrap: true)
  @markdown = Redcarpet::Markdown.new(
    html_renderer,
    no_intra_emphasis: true,
    fenced_code_blocks: true,
    autolink: true
  )
end
# Entry point: runs every import phase in order and prints start/finish banners.
def run
  puts "Starting..."
  Rails.logger.level = 3 # :error, so that we don't create log files that are many GB

  import_phases = %i[
    preload_i18n
    fix_highest_post_numbers
    load_imported_ids
    load_indexes
    execute
    fix_primary_keys
  ]
  import_phases.each { |phase| send(phase) }

  puts "Done! Now run the 'import:ensure_consistency' rake task."
end
# Sets I18n.locale from ENV["LOCALE"] (falling back to the site default locale)
# and performs one translation and one transliteration call up front.
def preload_i18n
  puts "Preloading I18n..."
  locale_name = ENV.fetch("LOCALE") { SiteSettings::DefaultsProvider::DEFAULT_LOCALE }
  I18n.locale = locale_name.to_sym
  I18n.t("test")
  ActiveSupport::Inflector.transliterate("test")
end
# Runs one UPDATE that sets topics.highest_post_number to the highest
# post_number among each topic's non-deleted posts, where the two differ.
def fix_highest_post_numbers
  puts "Fixing highest post numbers..."

  update_sql = <<~SQL
    WITH X AS (
    SELECT topic_id
    , COALESCE(MAX(post_number), 0) max_post_number
    FROM posts
    WHERE deleted_at IS NULL
    GROUP BY topic_id
    )
    UPDATE topics
    SET highest_post_number = X.max_post_number
    FROM X
    WHERE id = X.topic_id
    AND highest_post_number <> X.max_post_number
  SQL

  @raw_connection.exec(update_sql)
end
# Loads the "import_id" custom fields of groups, users, categories, topics and
# posts into lookup hashes (imported id string => local id) and records the
# highest imported id seen for each kind.
#
# Topic and post ids are split into a public range (below PRIVATE_OFFSET) and a
# private range (PRIVATE_OFFSET and above). The private fallback is
# PRIVATE_OFFSET - 1, so PRIVATE_OFFSET itself is the first private id and must
# be counted in the private range.
def load_imported_ids
  puts "Loading imported group ids..."
  @groups = GroupCustomField.where(name: "import_id").pluck(:value, :group_id).to_h
  @last_imported_group_id = @groups.keys.map(&:to_i).max || -1

  puts "Loading imported user ids..."
  @users = UserCustomField.where(name: "import_id").pluck(:value, :user_id).to_h
  @last_imported_user_id = @users.keys.map(&:to_i).max || -1

  puts "Loading imported category ids..."
  @categories = CategoryCustomField.where(name: "import_id").pluck(:value, :category_id).to_h
  @last_imported_category_id = @categories.keys.map(&:to_i).max || -1

  puts "Loading imported topic ids..."
  @topics = TopicCustomField.where(name: "import_id").pluck(:value, :topic_id).to_h
  private_topic_ids, public_topic_ids = @topics.keys.map(&:to_i).partition { |id| id >= PRIVATE_OFFSET }
  @last_imported_topic_id = public_topic_ids.max || -1
  @last_imported_private_topic_id = private_topic_ids.max || (PRIVATE_OFFSET - 1)

  puts "Loading imported post ids..."
  @posts = PostCustomField.where(name: "import_id").pluck(:value, :post_id).to_h
  private_post_ids, public_post_ids = @posts.keys.map(&:to_i).partition { |id| id >= PRIVATE_OFFSET }
  @last_imported_post_id = public_post_ids.max || -1
  @last_imported_private_post_id = private_post_ids.max || (PRIVATE_OFFSET - 1)
end
# Returns the highest existing id of +klass+ (unscoped), never less than 0.
# The first record created afterwards gets this value + 1.
def last_id(klass)
  highest = klass.unscoped.maximum(:id) || 0
  highest.positive? ? highest : 0
end
# Loads the current highest ids and the uniqueness indexes (group names,
# emails, usernames, category names, post numbers) that the process_* methods
# consult while assigning ids and de-duplicating values.
def load_indexes
  puts "Loading groups indexes..."
  # Goes through last_id like every other counter so an empty table yields 0
  # instead of nil (nil would break `+= 1` and the `> 0` checks later on).
  @last_group_id = last_id(Group)
  @group_names = Group.unscoped.pluck(:name).map(&:downcase).to_set

  puts "Loading users indexes..."
  @last_user_id = last_id(User)
  @last_user_email_id = last_id(UserEmail)
  @emails = User.unscoped.joins(:user_emails).pluck(:"user_emails.email").to_set
  @usernames_lower = User.unscoped.pluck(:username_lower).to_set
  @mapped_usernames = UserCustomField.joins(:user).where(name: "import_username").pluck("user_custom_fields.value", "users.username").to_h

  puts "Loading categories indexes..."
  @last_category_id = last_id(Category)
  @category_names = Category.unscoped.pluck(:parent_category_id, :name).map { |pci, name| "#{pci}-#{name}" }.to_set

  puts "Loading topics indexes..."
  @last_topic_id = last_id(Topic)
  @highest_post_number_by_topic_id = Topic.unscoped.pluck(:id, :highest_post_number).to_h

  puts "Loading posts indexes..."
  @last_post_id = last_id(Post)
  @post_number_by_post_id = Post.unscoped.pluck(:id, :post_number).to_h
  @topic_id_by_post_id = Post.unscoped.pluck(:id, :topic_id).to_h

  puts "Loading post actions indexes..."
  @last_post_action_id = last_id(PostAction)
end
# True when "bbcode-to-md" was passed on the command line.
def use_bbcode_to_md?
  ARGV.member?("bbcode-to-md")
end
# Hook called by #run between index loading and sequence fixing.
# The base implementation only raises NotImplementedError.
def execute
  raise NotImplementedError
end
# Moves each model's primary key sequence to the last id handed out during
# the import. Sequences whose counter is not positive are left untouched.
def fix_primary_keys
  puts "Updating primary key sequences..."

  sequence_targets = [
    [Group, @last_group_id],
    [User, @last_user_id],
    [UserEmail, @last_user_email_id],
    [Category, @last_category_id],
    [Topic, @last_topic_id],
    [Post, @last_post_id],
    [PostAction, @last_post_action_id],
  ]

  outcomes = sequence_targets.map do |model, last_used_id|
    @raw_connection.exec("SELECT setval('#{model.sequence_name}', #{last_used_id})") if last_used_id > 0
  end
  outcomes.last
end
# Local group id recorded for the given imported id, or nil when unknown.
def group_id_from_imported_id(id)
  lookup_key = id.to_s
  @groups[lookup_key]
end
# Local user id recorded for the given imported id, or nil when unknown.
def user_id_from_imported_id(id)
  lookup_key = id.to_s
  @users[lookup_key]
end
# Local category id recorded for the given imported id, or nil when unknown.
def category_id_from_imported_id(id)
  lookup_key = id.to_s
  @categories[lookup_key]
end
# Local topic id recorded for the given imported id, or nil when unknown.
def topic_id_from_imported_id(id)
  lookup_key = id.to_s
  @topics[lookup_key]
end
# Local post id recorded for the given imported id, or nil when unknown.
def post_id_from_imported_id(id)
  lookup_key = id.to_s
  @posts[lookup_key]
end
# Post number of the local post that corresponds to the imported post id, or nil.
def post_number_from_imported_id(id)
  local_post_id = post_id_from_imported_id(id)
  @post_number_by_post_id[local_post_id]
end
# Topic id of the local post that corresponds to the imported post id, or nil.
def topic_id_from_imported_post_id(id)
  local_post_id = post_id_from_imported_id(id)
  @topic_id_by_post_id[local_post_id]
end
# Column lists used by the create_* methods. For each table they define both
# the COPY column order and which keys are read from a processed row.
GROUP_COLUMNS ||= %i[
  id name title bio_raw bio_cooked created_at updated_at
]

USER_COLUMNS ||= %i[
  id username username_lower name active trust_level admin moderator
  date_of_birth ip_address registration_ip_address primary_group_id
  suspended_at suspended_till last_emailed_at created_at updated_at
]

USER_EMAIL_COLUMNS ||= %i[
  id user_id email primary created_at updated_at
]

USER_STAT_COLUMNS ||= %i[
  user_id topics_entered time_read days_visited posts_read_count
  likes_given likes_received topic_reply_count new_since read_faq
  first_post_created_at post_count topic_count bounce_score
  reset_bounce_score_after
]

USER_PROFILE_COLUMNS ||= %i[
  user_id location website bio_raw bio_cooked views
]

GROUP_USER_COLUMNS ||= %i[
  group_id user_id created_at updated_at
]

CATEGORY_COLUMNS ||= %i[
  id name name_lower slug user_id description position parent_category_id
  created_at updated_at
]

TOPIC_COLUMNS ||= %i[
  id archetype title fancy_title slug user_id last_post_user_id category_id
  visible closed pinned_at views created_at bumped_at updated_at
]

POST_COLUMNS ||= %i[
  id user_id last_editor_id topic_id post_number sort_order reply_to_post_number
  like_count raw cooked hidden word_count created_at last_version_at updated_at
]

POST_ACTION_COLUMNS ||= %i[
  id post_id user_id post_action_type_id deleted_at created_at updated_at
  deleted_by_id related_post_id staff_took_action deferred_by_id targets_topic
  agreed_at agreed_by_id deferred_at disagreed_at disagreed_by_id
]

TOPIC_ALLOWED_USER_COLUMNS ||= %i[
  topic_id user_id created_at updated_at
]

TOPIC_TAG_COLUMNS ||= %i[
  topic_id tag_id created_at updated_at
]
# Bulk-creates groups: each row is mapped by the given block, passed through
# process_group and copied into the table using GROUP_COLUMNS.
def create_groups(rows, &mapper)
  create_records(rows, "group", GROUP_COLUMNS, &mapper)
end
# Bulk-creates users, then stores an "import_username" custom field for every
# user whose original username was recorded in @imported_usernames while the
# rows were processed.
def create_users(rows, &block)
  @imported_usernames = {}
  create_records(rows, "user", USER_COLUMNS, &block)

  renamed_usernames = @imported_usernames.keys
  create_custom_fields("user", "username", renamed_usernames) do |original_username|
    { record_id: @imported_usernames[original_username], value: original_username }
  end
end
# Bulk-creates user emails: rows go through the given block and
# process_user_email and are copied using USER_EMAIL_COLUMNS.
def create_user_emails(rows, &mapper)
  create_records(rows, "user_email", USER_EMAIL_COLUMNS, &mapper)
end
# Bulk-creates user stats: rows go through the given block and
# process_user_stat and are copied using USER_STAT_COLUMNS.
def create_user_stats(rows, &mapper)
  create_records(rows, "user_stat", USER_STAT_COLUMNS, &mapper)
end
# Bulk-creates user profiles: rows go through the given block and
# process_user_profile and are copied using USER_PROFILE_COLUMNS.
def create_user_profiles(rows, &mapper)
  create_records(rows, "user_profile", USER_PROFILE_COLUMNS, &mapper)
end
# Bulk-creates group memberships: rows go through the given block and
# process_group_user and are copied using GROUP_USER_COLUMNS.
def create_group_users(rows, &mapper)
  create_records(rows, "group_user", GROUP_USER_COLUMNS, &mapper)
end
# Bulk-creates categories: rows go through the given block and
# process_category and are copied using CATEGORY_COLUMNS.
def create_categories(rows, &mapper)
  create_records(rows, "category", CATEGORY_COLUMNS, &mapper)
end
# Bulk-creates topics: rows go through the given block and process_topic and
# are copied using TOPIC_COLUMNS.
def create_topics(rows, &mapper)
  create_records(rows, "topic", TOPIC_COLUMNS, &mapper)
end
# Bulk-creates posts: rows go through the given block and process_post and
# are copied using POST_COLUMNS.
def create_posts(rows, &mapper)
  create_records(rows, "post", POST_COLUMNS, &mapper)
end
# Bulk-creates post actions: rows go through the given block and
# process_post_action and are copied using POST_ACTION_COLUMNS.
def create_post_actions(rows, &mapper)
  create_records(rows, "post_action", POST_ACTION_COLUMNS, &mapper)
end
# Bulk-creates topic_allowed_users rows: rows go through the given block and
# process_topic_allowed_user and are copied using TOPIC_ALLOWED_USER_COLUMNS.
def create_topic_allowed_users(rows, &mapper)
  create_records(rows, "topic_allowed_user", TOPIC_ALLOWED_USER_COLUMNS, &mapper)
end
# Bulk-creates topic_tags rows: rows go through the given block and
# process_topic_tag and are copied using TOPIC_TAG_COLUMNS.
def create_topic_tags(rows, &mapper)
  create_records(rows, "topic_tag", TOPIC_TAG_COLUMNS, &mapper)
end
# Completes a mapped group row: assigns the next id, records the imported id,
# cleans the name and makes it unique (case-insensitively), tidies title and
# bio, and fills in timestamps. Returns the same hash.
def process_group(group)
  new_id = @last_group_id += 1
  group[:id] = new_id
  @groups[group[:imported_id].to_s] = new_id

  group[:name] = fix_name(group[:name])

  # On a clash, append "_1" and keep incrementing the suffix until it is free.
  unless @group_names.add?(group[:name].downcase)
    candidate = group[:name] + "_1"
    until @group_names.add?(candidate.downcase)
      candidate.next!
    end
    group[:name] = candidate
  end

  if group[:title].present?
    group[:title] = group[:title].scrub.strip.presence
  end
  if group[:bio_raw].present?
    group[:bio_raw] = group[:bio_raw].scrub.strip.presence
  end
  if group[:bio_raw].present?
    group[:bio_cooked] = pre_cook(group[:bio_raw])
  end

  group[:created_at] ||= NOW
  group[:updated_at] ||= group[:created_at]
  group
end
# Completes a mapped user row: assigns the next id, records the imported id,
# cleans the username and makes it unique, normalises the email and makes it
# valid and unique, and fills in defaults. Returns the same hash.
#
# Whenever the final username differs from the imported one, the pair is
# stored in @imported_usernames (original => user id) and @mapped_usernames
# (original => final username).
def process_user(user)
  @users[user[:imported_id].to_s] = user[:id] = @last_user_id += 1

  imported_username = user[:username].dup
  user[:username] = fix_name(user[:username]).presence || random_username

  # unique username_lower
  unless @usernames_lower.add?(user[:username].downcase)
    username = user[:username] + "_1"
    username.next! until @usernames_lower.add?(username.downcase)
    user[:username] = username
  end
  user[:username_lower] = user[:username].downcase

  # Recorded only after the uniqueness suffix is applied, so the mapping
  # always points at the username the user really ends up with.
  if user[:username] != imported_username
    @imported_usernames[imported_username] = user[:id]
    @mapped_usernames[imported_username] = user[:username]
  end

  # Non-mutating downcase: leaves the caller's string alone and also works
  # when the email (or the generated fallback) is a frozen string.
  user[:email] = (user[:email] || random_email).downcase

  # unique email
  user[:email] = random_email until user[:email] =~ EmailValidator.email_regex && @emails.add?(user[:email])

  user[:trust_level] ||= TrustLevel[1]
  user[:active] = true unless user.has_key?(:active)
  user[:admin] ||= false
  user[:moderator] ||= false
  user[:last_emailed_at] ||= NOW
  user[:created_at] ||= NOW
  user[:updated_at] ||= user[:created_at]
  user
end
# Completes a mapped user_email row: assigns the next id, resolves the owning
# user from :imported_user_id, marks the address as primary, fills in
# timestamps and lower-cases the email (a random one is generated when missing).
# Returns the same hash.
def process_user_email(user_email)
  user_email[:id] = @last_user_email_id += 1
  user_email[:user_id] = @users[user_email[:imported_user_id].to_s]
  user_email[:primary] = true
  user_email[:created_at] ||= NOW
  user_email[:updated_at] ||= user_email[:created_at]
  # Non-mutating downcase: safe for frozen strings and leaves the caller's
  # string untouched.
  user_email[:email] = (user_email[:email] || random_email).downcase
  user_email
end
# Completes a mapped user_stat row: resolves the user from :imported_user_id,
# defaults every missing counter to 0 and derives :topic_reply_count as
# post_count - topic_count. Returns the same hash.
def process_user_stat(user_stat)
  user_stat[:user_id] = @users[user_stat[:imported_user_id].to_s]

  # Defaults go first so the subtraction below never sees nil.
  user_stat[:post_count] ||= 0
  user_stat[:topic_count] ||= 0
  user_stat[:topic_reply_count] = user_stat[:post_count] - user_stat[:topic_count]

  user_stat[:topics_entered] ||= 0
  user_stat[:time_read] ||= 0
  user_stat[:days_visited] ||= 0
  user_stat[:posts_read_count] ||= 0
  user_stat[:likes_given] ||= 0
  user_stat[:likes_received] ||= 0
  user_stat[:new_since] ||= NOW
  user_stat[:bounce_score] ||= 0
  user_stat
end
# Completes a mapped user_profile row: scrubs and trims the raw bio (nil when
# nothing is left), pre-cooks it when present and defaults :views to 0.
# Returns the same hash.
def process_user_profile(user_profile)
  cleaned_bio = (user_profile[:bio_raw].presence || "").scrub.strip.presence
  user_profile[:bio_raw] = cleaned_bio
  if cleaned_bio.present?
    user_profile[:bio_cooked] = pre_cook(cleaned_bio)
  end
  user_profile[:views] ||= 0
  user_profile
end
# Stamps a group membership row with NOW for both timestamps (any existing
# values are overwritten). Returns the same hash.
def process_group_user(group_user)
  %i[created_at updated_at].each { |column| group_user[column] = NOW }
  group_user
end
# Completes a mapped category row: assigns an id when none is given, records
# the imported id (without overwriting an existing mapping), truncates and
# cleans the name, derives name_lower and slug, and fills in defaults.
# Returns the same hash.
def process_category(category)
  category[:id] ||= @last_category_id += 1
  @categories[category[:imported_id].to_s] ||= category[:id]

  clean_name = category[:name][0...50].scrub.strip
  category[:name] = clean_name
  # TODO: unique name
  category[:name_lower] = clean_name.downcase
  category[:slug] ||= Slug.ascii_generator(category[:name_lower])

  clean_description = (category[:description] || "").scrub.strip
  category[:description] = clean_description.presence

  category[:user_id] ||= Discourse::SYSTEM_USER_ID
  category[:created_at] ||= NOW
  category[:updated_at] ||= category[:created_at]
  category
end
# Completes a mapped topic row: assigns the next id, records the imported id,
# truncates and cleans the title, derives fancy title and slug, and fills in
# defaults. Topics other than private messages get category_id -1 when no
# category is given. Returns the same hash.
def process_topic(topic)
  new_id = @last_topic_id += 1
  topic[:id] = new_id
  @topics[topic[:imported_id].to_s] = new_id

  topic[:archetype] ||= Archetype.default

  clean_title = topic[:title][0...255].scrub.strip
  topic[:title] = clean_title
  topic[:fancy_title] ||= pre_fancy(clean_title)
  topic[:slug] ||= Slug.ascii_generator(clean_title)

  topic[:user_id] ||= Discourse::SYSTEM_USER_ID
  topic[:last_post_user_id] ||= topic[:user_id]

  unless topic[:archetype] == Archetype.private_message
    topic[:category_id] ||= -1
  end

  topic[:visible] = true unless topic.key?(:visible)
  topic[:closed] ||= false
  topic[:views] ||= 0

  topic[:created_at] ||= NOW
  topic[:bumped_at] ||= topic[:created_at]
  topic[:updated_at] ||= topic[:created_at]
  topic
end
# Completes a mapped post row: assigns the next id and post number within its
# topic, records the id/post-number/topic lookups, converts the raw text
# (process_raw, optionally bbcode_to_md), pre-cooks it, counts words and fills
# in defaults. Returns the same hash.
def process_post(post)
  new_id = @last_post_id += 1
  post[:id] = new_id
  @posts[post[:imported_id].to_s] = new_id

  post[:user_id] ||= Discourse::SYSTEM_USER_ID
  post[:last_editor_id] = post[:user_id]

  # Next post number within the topic (topics not seen before start at 1).
  topic_id = post[:topic_id]
  @highest_post_number_by_topic_id[topic_id] ||= 0
  post[:post_number] = @highest_post_number_by_topic_id[topic_id] += 1
  post[:sort_order] = post[:post_number]

  @post_number_by_post_id[post[:id]] = post[:post_number]
  @topic_id_by_post_id[post[:id]] = post[:topic_id]

  post[:raw] = (post[:raw] || "").scrub.strip.presence || "<Empty imported post>"
  post[:raw] = process_raw(post[:raw])
  if @bbcode_to_md
    # Best effort: keep the unconverted text if the converter raises.
    post[:raw] = post[:raw].bbcode_to_md(false, {}, :disable, :quote) rescue post[:raw]
  end

  post[:like_count] ||= 0
  post[:cooked] = pre_cook(post[:raw])
  post[:hidden] ||= false
  post[:word_count] = post[:raw].scan(/[[:word:]]+/).size

  post[:created_at] ||= NOW
  post[:last_version_at] = post[:created_at]
  post[:updated_at] ||= post[:created_at]
  post
end
# Completes a mapped post_action row: assigns an id when none is given and
# defaults the boolean flags and timestamps. Returns the same hash.
def process_post_action(post_action)
  post_action[:id] ||= @last_post_action_id += 1

  fallbacks = { staff_took_action: false, targets_topic: false, created_at: NOW }
  fallbacks.each { |column, fallback| post_action[column] ||= fallback }

  post_action[:updated_at] ||= post_action[:created_at]
  post_action
end
# Stamps a topic_allowed_user row with NOW for both timestamps (any existing
# values are overwritten). Returns the same hash.
def process_topic_allowed_user(topic_allowed_user)
  %i[created_at updated_at].each { |column| topic_allowed_user[column] = NOW }
  topic_allowed_user
end
# Stamps a topic_tag row with NOW for both timestamps (any existing values are
# overwritten). Returns the same hash.
def process_topic_tag(topic_tag)
  %i[created_at updated_at].each { |column| topic_tag[column] = NOW }
  topic_tag
end
# Converts the BBCode-style markup of an imported post body into the
# Markdown / quote / list markup expected by the rest of the import.
#
# Works on a copy of the argument, so the caller's string is never modified
# and frozen strings are accepted. Returns the converted string.
def process_raw(raw)
  raw = raw.dup

  # fix whitespaces
  raw.gsub!(/(\\r)?\\n/, "\n")
  raw.gsub!("\\t", "\t")

  # [HTML]...[/HTML]
  raw.gsub!(/\[HTML\]/i, "\n\n```html\n")
  raw.gsub!(/\[\/HTML\]/i, "\n```\n\n")

  # [PHP]...[/PHP]
  raw.gsub!(/\[PHP\]/i, "\n\n```php\n")
  raw.gsub!(/\[\/PHP\]/i, "\n```\n\n")

  # [HIGHLIGHT="..."]
  raw.gsub!(/\[HIGHLIGHT="?(\w+)"?\]/i) { "\n\n```#{$1.downcase}\n" }

  # [CODE]...[/CODE]
  # [HIGHLIGHT]...[/HIGHLIGHT]
  raw.gsub!(/\[\/?CODE\]/i, "\n\n```\n\n")
  raw.gsub!(/\[\/?HIGHLIGHT\]/i, "\n\n```\n\n")

  # [SAMP]...[/SAMP]
  raw.gsub!(/\[\/?SAMP\]/i, "`")

  # replace all chevrons with HTML entities
  # /!\ must be done /!\
  # - AFTER the "code" processing
  # - BEFORE the "quote" processing
  # Chevrons between backticks are parked as U+2603 so they survive the escaping.
  raw.gsub!(/`([^`]+?)`/im) { "`" + $1.gsub("<", "\u2603") + "`" }
  raw.gsub!("<", "&lt;")
  raw.gsub!("\u2603", "<")

  raw.gsub!(/`([^`]+?)`/im) { "`" + $1.gsub(">", "\u2603") + "`" }
  raw.gsub!(">", "&gt;")
  raw.gsub!("\u2603", ">")

  raw.gsub!(/\[\/?I\]/i, "*")
  raw.gsub!(/\[\/?B\]/i, "**")
  raw.gsub!(/\[\/?U\]/i, "")

  raw.gsub!(/\[\/?RED\]/i, "")
  raw.gsub!(/\[\/?BLUE\]/i, "")

  raw.gsub!(/\[AUTEUR\].+?\[\/AUTEUR\]/im, "")
  raw.gsub!(/\[VOIRMSG\].+?\[\/VOIRMSG\]/im, "")
  raw.gsub!(/\[PSEUDOID\].+?\[\/PSEUDOID\]/im, "")

  # [IMG]...[/IMG]
  raw.gsub!(/(?:\s*\[IMG\]\s*)+(.+?)(?:\s*\[\/IMG\]\s*)+/im) { "\n\n#{$1}\n\n" }

  # [IMG=url]
  raw.gsub!(/\[IMG=([^\]]*)\]/im) { "\n\n#{$1}\n\n" }

  # [URL=...]...[/URL]
  raw.gsub!(/\[URL="?(.+?)"?\](.+?)\[\/URL\]/im) { "[#{$2.strip}](#{$1})" }

  # [URL]...[/URL]
  # [MP3]...[/MP3]
  # [EMAIL]...[/EMAIL]
  # [LEFT]...[/LEFT]
  raw.gsub!(/\[\/?URL\]/i, "")
  raw.gsub!(/\[\/?MP3\]/i, "")
  raw.gsub!(/\[\/?EMAIL\]/i, "")
  raw.gsub!(/\[\/?LEFT\]/i, "")

  # [FONT=blah] and [COLOR=blah]
  raw.gsub!(/\[FONT=.*?\](.*?)\[\/FONT\]/im, "\\1")
  raw.gsub!(/\[COLOR=.*?\](.*?)\[\/COLOR\]/im, "\\1")

  raw.gsub!(/\[SIZE=.*?\](.*?)\[\/SIZE\]/im, "\\1")
  raw.gsub!(/\[H=.*?\](.*?)\[\/H\]/im, "\\1")

  # [CENTER]...[/CENTER]
  raw.gsub!(/\[CENTER\](.*?)\[\/CENTER\]/im, "\\1")

  # [INDENT]...[/INDENT]
  raw.gsub!(/\[INDENT\](.*?)\[\/INDENT\]/im, "\\1")
  raw.gsub!(/\[TABLE\](.*?)\[\/TABLE\]/im, "\\1")
  raw.gsub!(/\[TR\](.*?)\[\/TR\]/im, "\\1")
  raw.gsub!(/\[TD\](.*?)\[\/TD\]/im, "\\1")
  raw.gsub!(/\[TD="?.*?"?\](.*?)\[\/TD\]/im, "\\1")

  # [QUOTE]...[/QUOTE]
  raw.gsub!(/\[QUOTE="([^\]]+)"\]/i) { "[QUOTE=#{$1}]" }

  # Nested Quotes
  raw.gsub!(/(\[\/?QUOTE.*?\])/mi) { |q| "\n#{q}\n" }

  # raw.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { |quote|
  #   quote.gsub!(/\[QUOTE\](.+?)\[\/QUOTE\]/im) { "\n#{$1}\n" }
  #   quote.gsub!(/\n(.+?)/) { "\n> #{$1}" }
  # }

  # [QUOTE=<username>;<postid>]
  raw.gsub!(/\[QUOTE=([^;\]]+);(\d+)\]/i) do
    imported_username, imported_postid = $1, $2

    username = @mapped_usernames[imported_username] || imported_username
    post_id = post_id_from_imported_id(imported_postid)
    post_number = @post_number_by_post_id[post_id]
    topic_id = @topic_id_by_post_id[post_id]

    # Link to the quoted post only when it has already been imported.
    if post_number && topic_id
      "\n[quote=\"#{username}, post:#{post_number}, topic:#{topic_id}\"]\n"
    else
      "\n[quote=\"#{username}\"]\n"
    end
  end

  # [YOUTUBE]<id>[/YOUTUBE]
  raw.gsub!(/\[YOUTUBE\](.+?)\[\/YOUTUBE\]/i) { "\nhttps://www.youtube.com/watch?v=#{$1}\n" }
  raw.gsub!(/\[DAILYMOTION\](.+?)\[\/DAILYMOTION\]/i) { "\nhttps://www.dailymotion.com/video/#{$1}\n" }

  # [VIDEO=youtube;<id>]...[/VIDEO]
  raw.gsub!(/\[VIDEO=YOUTUBE;([^\]]+)\].*?\[\/VIDEO\]/i) { "\nhttps://www.youtube.com/watch?v=#{$1}\n" }
  raw.gsub!(/\[VIDEO=DAILYMOTION;([^\]]+)\].*?\[\/VIDEO\]/i) { "\nhttps://www.dailymotion.com/video/#{$1}\n" }

  # [SPOILER=Some hidden stuff]SPOILER HERE!![/SPOILER]
  raw.gsub!(/\[SPOILER="?(.+?)"?\](.+?)\[\/SPOILER\]/im) { "\n#{$1}\n[spoiler]#{$2}[/spoiler]\n" }

  # convert list tags to ul and list=1 tags to ol
  # (basically, we're only missing list=a here...)
  # (https://meta.discourse.org/t/phpbb-3-importer-old/17397)
  raw.gsub!(/\[list\](.*?)\[\/list\]/im, '[ul]\1[/ul]')
  raw.gsub!(/\[list=1\](.*?)\[\/list\]/im, '[ol]\1[/ol]')
  raw.gsub!(/\[list\](.*?)\[\/list:u\]/im, '[ul]\1[/ul]')
  raw.gsub!(/\[list=1\](.*?)\[\/list:o\]/im, '[ol]\1[/ol]')

  # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
  raw.gsub!(/\[\*\]\n/, '')
  raw.gsub!(/\[\*\](.*?)\[\/\*:m\]/, '[li]\1[/li]')
  raw.gsub!(/\[\*\](.*?)\n/, '[li]\1[/li]')
  raw.gsub!(/\[\*=1\]/, '')

  raw
end
# Streams +rows+ into the "#{name.pluralize}" table through one COPY.
#
# Each row is yielded to the caller's block; a falsy result skips the row.
# The mapped hash is passed to process_<name>, and the values of +columns+ are
# sent as COPY data. Row-level errors are printed and the row is skipped.
# When a <name>_id_from_imported_id method exists, an "import_id" custom field
# is written for every collected imported id. Any other error is printed with
# its backtrace and swallowed.
#
# Progress is reported per 5000 copied rows, based on the number of rows
# copied, so tables whose rows carry no :imported_id do not print on every row.
def create_records(rows, name, columns)
  start = Time.now
  imported_ids = []
  copied_rows = 0
  process_method_name = "process_#{name}"
  sql = "COPY #{name.pluralize} (#{columns.map { |c| "\"#{c}\"" }.join(",")}) FROM STDIN"

  report_progress = lambda do
    elapsed = Time.now - start
    # Guard against a zero elapsed time, which would feed NaN/Infinity to %d.
    rate = elapsed > 0 ? copied_rows / elapsed : 0
    print "\r%7d - %6d/sec".freeze % [copied_rows, rate]
  end

  @raw_connection.copy_data(sql, @encoder) do
    rows.each do |row|
      begin
        mapped = yield(row)
        next unless mapped

        processed = send(process_method_name, mapped)
        imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
        imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
        @raw_connection.put_copy_data columns.map { |c| processed[c] }

        copied_rows += 1
        report_progress.call if copied_rows % 5000 == 0
      rescue => e
        puts "\n"
        puts "ERROR: #{e.inspect}"
      end
    end
  end

  if copied_rows > 0
    report_progress.call
    puts
  end

  id_mapping_method_name = "#{name}_id_from_imported_id".freeze
  return unless respond_to?(id_mapping_method_name)

  create_custom_fields(name, "id", imported_ids) do |imported_id|
    {
      record_id: send(id_mapping_method_name, imported_id),
      value: imported_id,
    }
  end
rescue => e
  puts e.message
  puts e.backtrace.join("\n")
end
# Copies "import_<name>" custom fields into <table>_custom_fields.
# The block is called per row and returns { record_id:, value: }, or a falsy
# value to skip that row. Both timestamps are written as NOW.
def create_custom_fields(table, name, rows)
  field_name = "import_#{name}"
  copy_sql = "COPY #{table}_custom_fields (#{table}_id, name, value, created_at, updated_at) FROM STDIN"

  @raw_connection.copy_data(copy_sql, @encoder) do
    rows.each do |row|
      field = yield(row)
      next unless field

      @raw_connection.put_copy_data([field[:record_id], field_name, field[:value], NOW, NOW])
    end
  end
end
# Delegates to the uploader's create_upload and returns its result.
def create_upload(user_id, path, source_filename)
  uploader = @uploader
  uploader.create_upload(user_id, path, source_filename)
end
# Delegates to the uploader's html_for_upload and returns its result.
def html_for_upload(upload, display_filename)
  uploader = @uploader
  uploader.html_for_upload(upload, display_filename)
end
# Turns an arbitrary name into a username/group-name candidate:
# transliterates it, replaces runs of characters other than word characters,
# "." and "-" with "_", trims unwanted leading/trailing characters and
# collapses runs of "-", "_" and "." to their first character.
#
# Returns nil for nil or blank input. The argument is never modified, so
# frozen strings are accepted.
def fix_name(name)
  return if name.nil?

  # Scrub before any regex work; invalid byte sequences are replaced in a copy.
  name = name.scrub unless name.valid_encoding?
  return if name.blank?

  name = ActiveSupport::Inflector.transliterate(name)
  name.gsub!(/[^\w.-]+/, "_")
  name.gsub!(/^\W+/, "")
  name.gsub!(/[^A-Za-z0-9]+$/, "")
  name.gsub!(/([-_.]{2,})/) { $1[0] }
  name.strip!
  name
end
# Placeholder username: "Anonymous_" followed by a random hex token.
def random_username
  token = SecureRandom.hex
  "Anonymous_#{token}"
end
# Placeholder email address: a random hex token at the "ema.il" domain.
def random_email
  token = SecureRandom.hex
  "#{token}@ema.il"
end
# Produces the pre-rendered ("cooked") HTML for a raw post or bio:
# YouTube watch URLs standing on their own line become lazyYT placeholders,
# the text is rendered with the Markdown renderer, and [QUOTE]...[/QUOTE]
# blocks left in the output are turned into <aside class="quote"> markup.
#
# The +raw+ argument is not modified; the caller's text keeps its original
# content. Returns the cooked HTML string.
def pre_cook(raw)
  # Convert YouTube URLs to lazyYT DOMs before being transformed into links.
  # Non-mutating gsub, so the placeholder HTML never leaks back into +raw+.
  cooked = raw.gsub(/\nhttps\:\/\/www.youtube.com\/watch\?v=(\w+)\n/) do
    video_id = $1
    result = <<~HTML
      <div class="lazyYT" data-youtube-id="#{video_id}" data-width="480" data-height="270" data-parameters="feature=oembed&amp;wmode=opaque"></div>
    HTML
    result.strip
  end

  cooked = @markdown.render(cooked).scrub.strip

  cooked.gsub!(/\[QUOTE="?([^,"]+)(?:, post:(\d+), topic:(\d+))?"?\](.+?)\[\/QUOTE\]/im) do
    username, post_id, topic_id, quote = $1, $2, $3, $4

    quote = quote.scrub.strip
    quote.gsub!(/^(<br>\n?)+/, "")
    quote.gsub!(/(<br>\n?)+$/, "")

    # post_id/topic_id are either nil or non-empty digit strings here.
    if post_id && topic_id
      <<~HTML
        <aside class="quote" data-post="#{post_id}" data-topic="#{topic_id}">
        <div class="title">#{username}:</div>
        <blockquote>#{quote}</blockquote>
        </aside>
      HTML
    else
      <<~HTML
        <aside class="quote">
        <div class="title">#{username}:</div>
        <blockquote>#{quote}</blockquote>
        </aside>
      HTML
    end
  end

  cooked.scrub.strip
end
# HTML-escapes the title, runs it through SmartyPants, then scrubs and trims it.
def pre_fancy(title)
  escaped_title = ERB::Util.html_escape(title)
  fancy_title = Redcarpet::Render::SmartyPants.render(escaped_title)
  fancy_title.scrub.strip
end
# Returns nil for blank text; otherwise the text after charset normalisation,
# scrubbing and HTML entity decoding.
def normalize_text(text)
  return nil unless text.present?

  recoded = normalize_charset(text.presence || "")
  @html_entities.decode(recoded.scrub)
end
# Returns the text unchanged when the configured source encoding is UTF-8 or
# the text is nil/false. Otherwise transcodes it to the source encoding and
# relabels the resulting bytes as UTF-8.
def normalize_charset(text)
  return text if @encoding == Encoding::UTF_8
  return text unless text

  source_bytes = text.encode(@encoding)
  source_bytes.force_encoding(Encoding::UTF_8)
end
end