From 2dd01c61b0eed1c4bbb20872098931651387cbdf Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Mon, 4 May 2015 23:09:58 +0200 Subject: [PATCH] Improves the base importer - Move some methods into their own classes in order to make it easier to reuse them outside of classes extending the base importer. For compatibility reasons the old methods are still in the base importer and delegate to the new objects. The following methods and hashes were extracted: - all the lookup maps for existing and imported data - all the methods used for uploads and attachments - No need to store failed users and groups. This information wasn't used anyway. - Print progress instead of category names when importing categories. - Allow importers to override if bbcode_to_md should be used (until now it always used ARGV) - Allow importers to add additional site settings that automatically get restored after the importer finishes. - Show how many posts and messages are imported per minute. This should help detecting when the import is slowing down and needs to be restarted. - Use max_image_width and max_image_height from settings instead of hard-coded values for uploaded images. --- script/import_scripts/base.rb | 254 ++++++++---------- .../import_scripts/base/lookup_container.rb | 99 +++++++ script/import_scripts/base/uploader.rb | 45 ++++ 3 files changed, 261 insertions(+), 137 deletions(-) create mode 100644 script/import_scripts/base/lookup_container.rb create mode 100644 script/import_scripts/base/uploader.rb diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb index 093875493cd..8a3d30225eb 100644 --- a/script/import_scripts/base.rb +++ b/script/import_scripts/base.rb @@ -7,13 +7,13 @@ if ARGV.include?('bbcode-to-md') # git clone https://github.com/nlalonde/ruby-bbcode-to-md.git # cd ruby-bbcode-to-md # gem build ruby-bbcode-to-md.gemspec - # gem install ruby-bbcode-to-md-0.0.13.gem + # gem install ruby-bbcode-to-md-*.gem require 'ruby-bbcode-to-md' end require_relative '../../config/environment' -require_dependency 'url_helper' -require_dependency 'file_helper' +require_relative 'base/lookup_container' +require_relative 'base/uploader' module ImportScripts; end @@ -24,46 +24,13 @@ class ImportScripts::Base def initialize preload_i18n - @bbcode_to_md = true if ARGV.include?('bbcode-to-md') - @existing_groups = {} - @failed_groups = [] - @existing_users = {} - @failed_users = [] - @categories_lookup = {} - @existing_posts = {} - @topic_lookup = {} - @site_settings_during_import = nil + @lookup = ImportScripts::LookupContainer.new + @uploader = ImportScripts::Uploader.new + + @bbcode_to_md = true if use_bbcode_to_md? + @site_settings_during_import = {} @old_site_settings = {} - @start_time = Time.now - - puts "loading existing groups..." - GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id| - @existing_groups[import_id] = group_id - end - - puts "loading existing users..." - UserCustomField.where(name: 'import_id').pluck(:user_id, :value).each do |user_id, import_id| - @existing_users[import_id] = user_id - end - - puts "loading existing categories..." - CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id| - @categories_lookup[import_id] = category_id - end - - puts "loading existing posts..." - PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id| - @existing_posts[import_id] = post_id - end - - puts "loading existing topics..." - Post.joins(:topic).pluck("posts.id, posts.topic_id, posts.post_number, topics.slug").each do |p| - @topic_lookup[p[0]] = { - topic_id: p[1], - post_number: p[2], - url: Post.url(p[3], p[1], p[2]), - } - end + @start_times = {import: Time.now} end def preload_i18n @@ -87,15 +54,15 @@ class ImportScripts::Base update_topic_count_replies reset_topic_counters - elapsed = Time.now - @start_time - puts '', "Done (#{elapsed.to_s} seconds)" + elapsed = Time.now - @start_times[:import] + puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed/3600, elapsed/60%60, elapsed%60] ensure reset_site_settings end - def change_site_settings - @site_settings_during_import = { + def get_site_settings_for_import + { email_domains_blacklist: '', min_topic_title_length: 1, min_post_length: 1, @@ -106,6 +73,10 @@ class ImportScripts::Base disable_emails: true, authorized_extensions: '*' } + end + + def change_site_settings + @site_settings_during_import = get_site_settings_for_import @site_settings_during_import.each do |key, value| @old_site_settings[key] = SiteSetting.send(key) @@ -124,44 +95,42 @@ class ImportScripts::Base RateLimiter.enable end + def use_bbcode_to_md? + ARGV.include?("bbcode-to-md") + end + # Implementation will do most of its work in its execute method. # It will need to call create_users, create_categories, and create_posts. def execute raise NotImplementedError end - # Get the Discourse Post id based on the id of the source record def post_id_from_imported_post_id(import_id) - @existing_posts[import_id] || @existing_posts[import_id.to_s] + @lookup.post_id_from_imported_post_id(import_id) end - # Get the Discourse topic info (a hash) based on the id of the source record def topic_lookup_from_imported_post_id(import_id) - post_id = post_id_from_imported_post_id(import_id) - post_id ? @topic_lookup[post_id] : nil + @lookup.topic_lookup_from_imported_post_id(import_id) end - # Get the Discourse Group id based on the id of the source group def group_id_from_imported_group_id(import_id) - @existing_groups[import_id] || @existing_groups[import_id.to_s] || find_group_by_import_id(import_id).try(:id) + @lookup.group_id_from_imported_group_id(import_id) end def find_group_by_import_id(import_id) - GroupCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:group) + @lookup.find_group_by_import_id(import_id) end - # Get the Discourse User id based on the id of the source user def user_id_from_imported_user_id(import_id) - @existing_users[import_id] || @existing_users[import_id.to_s] || find_user_by_import_id(import_id).try(:id) + @lookup.user_id_from_imported_user_id(import_id) end def find_user_by_import_id(import_id) - UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user) + @lookup.find_user_by_import_id(import_id) end - # Get the Discourse Category id based on the id of the source category def category_id_from_imported_category_id(import_id) - @categories_lookup[import_id] || @categories_lookup[import_id.to_s] + @lookup.category_id_from_imported_category_id(import_id) end def create_admin(opts={}) @@ -183,31 +152,32 @@ class ImportScripts::Base # group in the original datasource. The given id will not be used # to create the Discourse group record. def create_groups(results, opts={}) - groups_created = 0 - groups_skipped = 0 + created = 0 + skipped = 0 + failed = 0 total = opts[:total] || results.size results.each do |result| g = yield(result) - if group_id_from_imported_group_id(g[:id]) - groups_skipped += 1 + if @lookup.group_id_from_imported_group_id(g[:id]) + skipped += 1 else new_group = create_group(g, g[:id]) if new_group.valid? - @existing_groups[g[:id].to_s] = new_group.id - groups_created += 1 + @lookup.add_group(g[:id].to_s, new_group) + created += 1 else - @failed_groups << g + failed += 1 puts "Failed to create group id #{g[:id]} #{new_group.name}: #{new_group.errors.full_messages}" end end - print_status groups_created + groups_skipped + @failed_groups.length + (opts[:offset] || 0), total + print_status created + skipped + failed + (opts[:offset] || 0), total end - return [groups_created, groups_skipped] + [created, skipped] end def create_group(opts, import_id) @@ -231,8 +201,9 @@ class ImportScripts::Base # user in the original datasource. The given id will not be used to # create the Discourse user record. def create_users(results, opts={}) - users_created = 0 - users_skipped = 0 + created = 0 + skipped = 0 + failed = 0 total = opts[:total] || results.size results.each do |result| @@ -240,34 +211,34 @@ class ImportScripts::Base # block returns nil to skip a user if u.nil? - users_skipped += 1 + skipped += 1 else import_id = u[:id] - if user_id_from_imported_user_id(import_id) - users_skipped += 1 + if @lookup.user_id_from_imported_user_id(import_id) + skipped += 1 elsif u[:email].present? new_user = create_user(u, import_id) if new_user.valid? && new_user.user_profile.valid? - @existing_users[import_id.to_s] = new_user.id - users_created += 1 + @lookup.add_user(import_id.to_s, new_user) + created += 1 else - @failed_users << u + failed += 1 puts "Failed to create user id: #{import_id}, username: #{new_user.username}, email: #{new_user.email}" puts "user errors: #{new_user.errors.full_messages}" puts "user_profile errors: #{new_user.user_profiler.errors.full_messages}" end else - @failed_users << u + failed += 1 puts "Skipping user id #{import_id} because email is blank" end end - print_status users_created + users_skipped + @failed_users.length + (opts[:offset] || 0), total + print_status created + skipped + failed + (opts[:offset] || 0), total end - return [users_created, users_skipped] + [created, skipped] end def create_user(opts, import_id) @@ -334,29 +305,39 @@ class ImportScripts::Base # create the Discourse category record. # Optional attributes are position, description, and parent_category_id. def create_categories(results) + created = 0 + skipped = 0 + total = results.size + results.each do |c| params = yield(c) # block returns nil to skip - next if params.nil? || category_id_from_imported_category_id(params[:id]) + if params.nil? || @lookup.category_id_from_imported_category_id(params[:id]) + skipped += 1 + else + # Basic massaging on the category name + params[:name] = "Blank" if params[:name].blank? + params[:name].strip! + params[:name] = params[:name][0..49] - # Basic massaging on the category name - params[:name] = "Blank" if params[:name].blank? - params[:name].strip! - params[:name] = params[:name][0..49] + # make sure categories don't go more than 2 levels deep + if params[:parent_category_id] + top = Category.find_by_id(params[:parent_category_id]) + top = top.parent_category while top && !top.parent_category.nil? + params[:parent_category_id] = top.id if top + end - puts "\t#{params[:name]}" + new_category = create_category(params, params[:id]) + @lookup.add_category(params[:id], new_category) - # make sure categories don't go more than 2 levels deep - if params[:parent_category_id] - top = Category.find_by_id(params[:parent_category_id]) - top = top.parent_category while top && !top.parent_category.nil? - params[:parent_category_id] = top.id if top + created += 1 end - new_category = create_category(params, params[:id]) - @categories_lookup[params[:id]] = new_category.id + print_status created + skipped, total end + + [created, skipped] end def create_category(opts, import_id) @@ -396,6 +377,7 @@ class ImportScripts::Base skipped = 0 created = 0 total = opts[:total] || results.size + start_time = get_start_time("posts-#{total}") # the post count should be unique enough to differentiate between posts and PMs results.each do |r| params = yield(r) @@ -406,18 +388,14 @@ class ImportScripts::Base else import_id = params.delete(:id).to_s - if post_id_from_imported_post_id(import_id) + if @lookup.post_id_from_imported_post_id(import_id) skipped += 1 # already imported this post else begin new_post = create_post(params, import_id) if new_post.is_a?(Post) - @existing_posts[import_id] = new_post.id - @topic_lookup[new_post.id] = { - post_number: new_post.post_number, - topic_id: new_post.topic_id, - url: new_post.url, - } + @lookup.add_post(import_id, new_post) + @lookup.add_topic(new_post) created_post(new_post) @@ -439,10 +417,10 @@ class ImportScripts::Base end end - print_status skipped + created + (opts[:offset] || 0), total + print_status(created + skipped + (opts[:offset] || 0), total, start_time) end - return [created, skipped] + [created, skipped] end def create_post(opts, import_id) @@ -463,19 +441,8 @@ class ImportScripts::Base post ? post : post_creator.errors.full_messages end - # Creates an upload. - # Expects path to be the full path and filename of the source file. def create_upload(user_id, path, source_filename) - tmp = Tempfile.new('discourse-upload') - src = File.open(path) - FileUtils.copy_stream(src, tmp) - src.close - tmp.rewind - - Upload.create_for(user_id, tmp, source_filename, tmp.size) - ensure - tmp.close rescue nil - tmp.unlink rescue nil + @uploader.create_upload(user_id, path, source_filename) end # Iterate through a list of bookmark records to be imported. @@ -484,8 +451,8 @@ class ImportScripts::Base # Required fields are :user_id and :post_id, where both ids are # the values in the original datasource. def create_bookmarks(results, opts={}) - bookmarks_created = 0 - bookmarks_skipped = 0 + created = 0 + skipped = 0 total = opts[:total] || results.size user = User.new @@ -495,23 +462,29 @@ class ImportScripts::Base params = yield(result) # only the IDs are needed, so this should be enough - user.id = user_id_from_imported_user_id(params[:user_id]) - post.id = post_id_from_imported_post_id(params[:post_id]) - - if user.id.nil? || post.id.nil? - bookmarks_skipped += 1 - puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}" + if params.nil? + skipped += 1 else - begin - PostAction.act(user, post, PostActionType.types[:bookmark]) - bookmarks_created += 1 - rescue PostAction::AlreadyActed - bookmarks_skipped += 1 - end + user.id = @lookup.user_id_from_imported_user_id(params[:user_id]) + post.id = @lookup.post_id_from_imported_post_id(params[:post_id]) - print_status bookmarks_created + bookmarks_skipped + (opts[:offset] || 0), total + if user.id.nil? || post.id.nil? + skipped += 1 + puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}" + else + begin + PostAction.act(user, post, PostActionType.types[:bookmark]) + created += 1 + rescue PostAction::AlreadyActed + skipped += 1 + end + end end + + print_status created + skipped + (opts[:offset] || 0), total end + + [created, skipped] end def close_inactive_topics(opts={}) @@ -633,23 +606,26 @@ class ImportScripts::Base end def html_for_upload(upload, display_filename) - if FileHelper.is_image?(upload.url) - embedded_image_html(upload) - else - attachment_html(upload, display_filename) - end + @uploader.html_for_upload(upload, display_filename) end def embedded_image_html(upload) - %Q[
] + @uploader.embedded_image_html(upload) end def attachment_html(upload, display_filename) - "#{display_filename} (#{number_to_human_size(upload.filesize)})" + @uploader.attachment_html(upload, display_filename) end - def print_status(current, max) - print "\r%9d / %d (%5.1f%%) " % [current, max, ((current.to_f / max.to_f) * 100).round(1)] + def print_status(current, max, start_time = nil) + if start_time.present? + elapsed_seconds = Time.now - start_time + elements_per_minute = '[%.0f items/min] ' % [current / elapsed_seconds.to_f * 60] + else + elements_per_minute = '' + end + + print "\r%9d / %d (%5.1f%%) %s" % [current, max, current / max.to_f * 100, elements_per_minute] end def print_spinner @@ -658,6 +634,10 @@ class ImportScripts::Base print "\b#{@spinner_chars[0]}" end + def get_start_time(key) + @start_times.fetch(key) {|k| @start_times[k] = Time.now} + end + def batches(batch_size) offset = 0 loop do diff --git a/script/import_scripts/base/lookup_container.rb b/script/import_scripts/base/lookup_container.rb new file mode 100644 index 00000000000..0d8070932ae --- /dev/null +++ b/script/import_scripts/base/lookup_container.rb @@ -0,0 +1,99 @@ +module ImportScripts + class LookupContainer + def initialize + puts 'loading existing groups...' + @groups = {} + GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id| + @groups[import_id] = group_id + end + + puts 'loading existing users...' + @users = {} + UserCustomField.where(name: 'import_id').pluck(:user_id, :value).each do |user_id, import_id| + @users[import_id] = user_id + end + + puts 'loading existing categories...' + @categories = {} + CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id| + @categories[import_id] = category_id + end + + puts 'loading existing posts...' + @posts = {} + PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id| + @posts[import_id] = post_id + end + + puts 'loading existing topics...' + @topics = {} + Post.joins(:topic).pluck('posts.id, posts.topic_id, posts.post_number, topics.slug').each do |p| + @topics[p[0]] = { + topic_id: p[1], + post_number: p[2], + url: Post.url(p[3], p[1], p[2]) + } + end + end + + # Get the Discourse Post id based on the id of the source record + def post_id_from_imported_post_id(import_id) + @posts[import_id] || @posts[import_id.to_s] + end + + # Get the Discourse topic info (a hash) based on the id of the source record + def topic_lookup_from_imported_post_id(import_id) + post_id = post_id_from_imported_post_id(import_id) + post_id ? @topics[post_id] : nil + end + + # Get the Discourse Group id based on the id of the source group + def group_id_from_imported_group_id(import_id) + @groups[import_id] || @groups[import_id.to_s] || find_group_by_import_id(import_id).try(:id) + end + + # Get the Discourse Group based on the id of the source group + def find_group_by_import_id(import_id) + GroupCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:group) + end + + # Get the Discourse User id based on the id of the source user + def user_id_from_imported_user_id(import_id) + @users[import_id] || @users[import_id.to_s] || find_user_by_import_id(import_id).try(:id) + end + + # Get the Discourse User based on the id of the source user + def find_user_by_import_id(import_id) + UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user) + end + + # Get the Discourse Category id based on the id of the source category + def category_id_from_imported_category_id(import_id) + @categories[import_id] || @categories[import_id.to_s] + end + + def add_group(import_id, group) + @groups[import_id] = group.id + end + + def add_user(import_id, user) + @users[import_id] = user.id + end + + def add_category(import_id, category) + @categories[import_id] = category.id + end + + def add_post(import_id, post) + @posts[import_id] = post.id + end + + def add_topic(post) + @topics[post.id] = { + post_number: post.post_number, + topic_id: post.topic_id, + url: post.url, + } + end + end +end diff --git a/script/import_scripts/base/uploader.rb b/script/import_scripts/base/uploader.rb new file mode 100644 index 00000000000..62ddac451d7 --- /dev/null +++ b/script/import_scripts/base/uploader.rb @@ -0,0 +1,45 @@ +require_dependency 'url_helper' +require_dependency 'file_helper' + +module ImportScripts + class Uploader + include ActionView::Helpers::NumberHelper + + # Creates an upload. + # Expects path to be the full path and filename of the source file. + # @return [Upload] + def create_upload(user_id, path, source_filename) + tmp = Tempfile.new('discourse-upload') + src = File.open(path) + FileUtils.copy_stream(src, tmp) + src.close + tmp.rewind + + Upload.create_for(user_id, tmp, source_filename, tmp.size) + rescue => e + Rails.logger.error("Failed to create upload: #{e}") + nil + ensure + tmp.close rescue nil + tmp.unlink rescue nil + end + + def html_for_upload(upload, display_filename) + if FileHelper.is_image?(upload.url) + embedded_image_html(upload) + else + attachment_html(upload, display_filename) + end + end + + def embedded_image_html(upload) + image_width = [upload.width, SiteSetting.max_image_width].compact.min + image_height = [upload.height, SiteSetting.max_image_height].compact.min + %Q[
] + end + + def attachment_html(upload, display_filename) + "#{display_filename} (#{number_to_human_size(upload.filesize)})" + end + end +end