From a71b219c9a18ad418e330282fde511bcb79ba7c4 Mon Sep 17 00:00:00 2001 From: Bianca Nenciu Date: Thu, 14 Jan 2021 21:44:43 +0200 Subject: [PATCH] Improvements to phpBB3 import script (#10999) * FEATURE: Import attachments * FEATURE: Add support for importing multiple forums in one * FEATURE: Add support for category and tag mapping * FEATURE: Import groups * FIX: Add spaces around images * FEATURE: Custom mapping of user rank to trust levels * FIX: Do not fail import if it cannot import polls * FIX: Optimize existing records lookup Co-authored-by: Gerhard Schlager Co-authored-by: Jarek Radosz --- script/import_scripts/base.rb | 11 +- .../import_scripts/base/lookup_container.rb | 5 + .../phpbb3/database/database.rb | 4 +- .../phpbb3/database/database_3_0.rb | 24 +- script/import_scripts/phpbb3/importer.rb | 86 +- .../phpbb3/importers/bookmark_importer.rb | 8 +- .../phpbb3/importers/category_importer.rb | 18 +- .../phpbb3/importers/importer_factory.rb | 12 +- .../phpbb3/importers/message_importer.rb | 6 +- .../phpbb3/importers/permalink_importer.rb | 2 +- .../phpbb3/importers/poll_importer.rb | 10 +- .../phpbb3/importers/post_importer.rb | 31 +- .../phpbb3/importers/user_importer.rb | 15 +- script/import_scripts/phpbb3/settings.yml | 67 ++ .../phpbb3/support/bbcode/markdown_node.rb | 88 ++ .../phpbb3/support/bbcode/xml_to_markdown.rb | 356 ++++++++ .../import_scripts/phpbb3/support/settings.rb | 33 +- .../phpbb3/support/smiley_processor.rb | 20 +- .../phpbb3/support/text_processor.rb | 72 +- .../support/bbcode/xml_to_markdown_spec.rb | 817 ++++++++++++++++++ 20 files changed, 1606 insertions(+), 79 deletions(-) create mode 100644 script/import_scripts/phpbb3/support/bbcode/markdown_node.rb create mode 100644 script/import_scripts/phpbb3/support/bbcode/xml_to_markdown.rb create mode 100644 spec/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown_spec.rb diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb index 39530d74474..aaef4efca4f 
100644 --- a/script/import_scripts/base.rb +++ b/script/import_scripts/base.rb @@ -606,10 +606,15 @@ class ImportScripts::Base skipped += 1 puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}" else - result = BookmarkManager.new(user).create(post_id: post.id) + begin + manager = BookmarkManager.new(user) + bookmark = manager.create(post_id: post.id) - created += 1 if result.errors.none? - skipped += 1 if result.errors.any? + created += 1 if manager.errors.none? + skipped += 1 if manager.errors.any? + rescue + skipped += 1 + end end end diff --git a/script/import_scripts/base/lookup_container.rb b/script/import_scripts/base/lookup_container.rb index 6c4169b916b..caef17cd5a0 100644 --- a/script/import_scripts/base/lookup_container.rb +++ b/script/import_scripts/base/lookup_container.rb @@ -57,6 +57,11 @@ module ImportScripts UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user) end + def find_username_by_import_id(import_id) + user_id = user_id_from_imported_user_id(import_id) + User.where(id: user_id).pluck(:username).first if user_id.present? 
+ end + # Get the Discourse Category id based on the id of the source category def category_id_from_imported_category_id(import_id) @categories[import_id] || @categories[import_id.to_s] diff --git a/script/import_scripts/phpbb3/database/database.rb b/script/import_scripts/phpbb3/database/database.rb index 6b34febffc4..725d932ab8f 100644 --- a/script/import_scripts/phpbb3/database/database.rb +++ b/script/import_scripts/phpbb3/database/database.rb @@ -22,13 +22,13 @@ module ImportScripts::PhpBB3 if version.start_with?('3.0') require_relative 'database_3_0' Database_3_0.new(@database_client, @database_settings) - elsif version.start_with?('3.1') + elsif version.start_with?('3.1') || version.start_with?('3.2') require_relative 'database_3_1' Database_3_1.new(@database_client, @database_settings) else raise UnsupportedVersionError, <<~MSG Unsupported version (#{version}) of phpBB detected. - Currently only 3.0.x and 3.1.x are supported by this importer. + Currently only version 3.0, 3.1 and 3.2 are supported by this importer. 
MSG end end diff --git a/script/import_scripts/phpbb3/database/database_3_0.rb b/script/import_scripts/phpbb3/database/database_3_0.rb index 0799b92ba78..a69a9f9fe23 100644 --- a/script/import_scripts/phpbb3/database/database_3_0.rb +++ b/script/import_scripts/phpbb3/database/database_3_0.rb @@ -53,6 +53,20 @@ module ImportScripts::PhpBB3 SQL end + def fetch_groups + query(<<-SQL) + SELECT g.group_id, g.group_type, g.group_name, g.group_desc + FROM #{@table_prefix}groups g + SQL + end + + def fetch_group_users + query(<<-SQL) + SELECT ug.group_id, ug.user_id, ug.group_leader + FROM #{@table_prefix}user_group ug + SQL + end + def fetch_categories query(<<-SQL) SELECT f.forum_id, f.parent_id, f.forum_name, f.forum_desc, x.first_post_time @@ -213,12 +227,20 @@ module ImportScripts::PhpBB3 SELECT b.user_id, t.topic_first_post_id FROM #{@table_prefix}bookmarks b JOIN #{@table_prefix}topics t ON (b.topic_id = t.topic_id) - WHERE b.user_id > #{last_user_id} AND b.topic_id > #{last_topic_id} + WHERE b.user_id > #{last_user_id} ORDER BY b.user_id, b.topic_id LIMIT #{@batch_size} SQL end + def get_smiley(smiley_code) + query(<<-SQL).first + SELECT emotion, smiley_url + FROM #{@table_prefix}smilies + WHERE code = '#{smiley_code}' + SQL + end + def get_config_values query(<<-SQL).first SELECT diff --git a/script/import_scripts/phpbb3/importer.rb b/script/import_scripts/phpbb3/importer.rb index ca3274002cf..f9ee6e04b98 100644 --- a/script/import_scripts/phpbb3/importer.rb +++ b/script/import_scripts/phpbb3/importer.rb @@ -27,8 +27,13 @@ module ImportScripts::PhpBB3 def execute puts '', "importing from phpBB #{@php_config[:phpbb_version]}" + SiteSetting.tagging_enabled = true if @settings.tag_mappings.present? 
+ import_users import_anonymous_users if @settings.import_anonymous_users + import_groups + import_user_groups + import_new_categories import_categories import_posts import_private_messages if @settings.import_private_messages @@ -67,12 +72,12 @@ module ImportScripts::PhpBB3 batches do |offset| rows, last_user_id = @database.fetch_users(last_user_id) + rows = rows.to_a.uniq { |row| row[:user_id] } break if rows.size < 1 - next if all_records_exist?(:users, importer.map_users_to_import_ids(rows)) - create_users(rows, total: total_count, offset: offset) do |row| begin + next if user_id_from_imported_user_id(@settings.prefix(row[:user_id])) importer.map_user(row) rescue => e log_error("Failed to map user with ID #{row[:user_id]}", e) @@ -91,10 +96,9 @@ module ImportScripts::PhpBB3 rows, last_username = @database.fetch_anonymous_users(last_username) break if rows.size < 1 - next if all_records_exist?(:users, importer.map_anonymous_users_to_import_ids(rows)) - create_users(rows, total: total_count, offset: offset) do |row| begin + next if user_id_from_imported_user_id(@settings.prefix(row[:post_username])) importer.map_anonymous_user(row) rescue => e log_error("Failed to map anonymous user with ID #{row[:user_id]}", e) @@ -103,12 +107,74 @@ module ImportScripts::PhpBB3 end end + def import_groups + puts '', 'creating groups' + rows = @database.fetch_groups + + create_groups(rows) do |row| + begin + next if row[:group_type] == 3 + + group_name = if @settings.site_name.present? + "#{@settings.site_name}_#{row[:group_name]}" + else + row[:group_name] + end[0..19].gsub(/[^a-zA-Z0-9\-_. 
]/, '_') + + bio_raw = @importers.text_processor.process_raw_text(row[:group_desc]) rescue row[:group_desc] + + { + id: @settings.prefix(row[:group_id]), + name: group_name, + full_name: row[:group_name], + bio_raw: bio_raw + } + rescue => e + log_error("Failed to map group with ID #{row[:group_id]}", e) + end + end + end + + def import_user_groups + puts '', 'creating user groups' + rows = @database.fetch_group_users + + rows.each do |row| + group_id = @lookup.group_id_from_imported_group_id(@settings.prefix(row[:group_id])) + next if !group_id + + user_id = @lookup.user_id_from_imported_user_id(@settings.prefix(row[:user_id])) + + begin + GroupUser.find_or_create_by(user_id: user_id, group_id: group_id, owner: row[:group_leader]) + rescue => e + log_error("Failed to add user #{row[:user_id]} to group #{row[:group_id]}", e) + end + end + end + + def import_new_categories + puts '', 'creating new categories' + + create_categories(@settings.new_categories) do |row| + next if row == "SKIP" + + { + id: @settings.prefix(row[:forum_id]), + name: row[:name], + parent_category_id: @lookup.category_id_from_imported_category_id(@settings.prefix(row[:parent_id])) + } + end + end + def import_categories puts '', 'creating categories' rows = @database.fetch_categories importer = @importers.category_importer create_categories(rows) do |row| + next if @settings.category_mappings[row[:forum_id].to_s] == 'SKIP' + importer.map_category(row) end end @@ -123,10 +189,9 @@ module ImportScripts::PhpBB3 rows, last_post_id = @database.fetch_posts(last_post_id) break if rows.size < 1 - next if all_records_exist?(:posts, importer.map_to_import_ids(rows)) - create_posts(rows, total: total_count, offset: offset) do |row| begin + next if post_id_from_imported_post_id(@settings.prefix(row[:post_id])) importer.map_post(row) rescue => e log_error("Failed to map post with ID #{row[:post_id]}", e) @@ -145,10 +210,9 @@ module ImportScripts::PhpBB3 rows, last_msg_id = 
@database.fetch_messages(last_msg_id) break if rows.size < 1 - next if all_records_exist?(:posts, importer.map_to_import_ids(rows)) - create_posts(rows, total: total_count, offset: offset) do |row| begin + next if post_id_from_imported_post_id(@settings.prefix("pm:#{row[:msg_id]}")) importer.map_message(row) rescue => e log_error("Failed to map message with ID #{row[:msg_id]}", e) @@ -168,7 +232,11 @@ module ImportScripts::PhpBB3 break if rows.size < 1 create_bookmarks(rows, total: total_count, offset: offset) do |row| - importer.map_bookmark(row) + begin + importer.map_bookmark(row) + rescue => e + log_error("Failed to map bookmark (#{row[:user_id]}, #{row[:topic_first_post_id]})", e) + end end end end diff --git a/script/import_scripts/phpbb3/importers/bookmark_importer.rb b/script/import_scripts/phpbb3/importers/bookmark_importer.rb index 49c3799a09e..784e6c74768 100644 --- a/script/import_scripts/phpbb3/importers/bookmark_importer.rb +++ b/script/import_scripts/phpbb3/importers/bookmark_importer.rb @@ -2,10 +2,14 @@ module ImportScripts::PhpBB3 class BookmarkImporter + def initialize(settings) + @settings = settings + end + def map_bookmark(row) { - user_id: row[:user_id], - post_id: row[:topic_first_post_id] + user_id: @settings.prefix(row[:user_id]), + post_id: @settings.prefix(row[:topic_first_post_id]) } end end diff --git a/script/import_scripts/phpbb3/importers/category_importer.rb b/script/import_scripts/phpbb3/importers/category_importer.rb index 1d4cbaa60c4..d7ad577ff3b 100644 --- a/script/import_scripts/phpbb3/importers/category_importer.rb +++ b/script/import_scripts/phpbb3/importers/category_importer.rb @@ -5,20 +5,28 @@ module ImportScripts::PhpBB3 # @param lookup [ImportScripts::LookupContainer] # @param text_processor [ImportScripts::PhpBB3::TextProcessor] # @param permalink_importer [ImportScripts::PhpBB3::PermalinkImporter] - def initialize(lookup, text_processor, permalink_importer) + # @param settings [ImportScripts::PhpBB3::Settings] + def 
initialize(lookup, text_processor, permalink_importer, settings) @lookup = lookup @text_processor = text_processor @permalink_importer = permalink_importer + @settings = settings end def map_category(row) + return if @settings.category_mappings[row[:forum_id].to_s] + + if row[:parent_id] && @settings.category_mappings[row[:parent_id].to_s] + puts "parent category (#{row[:parent_id]}) was mapped, but children was not (#{row[:forum_id]})" + end + { - id: row[:forum_id], + id: @settings.prefix(row[:forum_id]), name: CGI.unescapeHTML(row[:forum_name]), - parent_category_id: @lookup.category_id_from_imported_category_id(row[:parent_id]), + parent_category_id: @lookup.category_id_from_imported_category_id(@settings.prefix(row[:parent_id])), post_create_action: proc do |category| update_category_description(category, row) - @permalink_importer.create_for_category(category, row[:forum_id]) + @permalink_importer.create_for_category(category, row[:forum_id]) # skip @settings.prefix because ID is used in permalink generation end } end @@ -43,7 +51,7 @@ module ImportScripts::PhpBB3 end if row[:forum_desc].present? 
- changes = { raw: @text_processor.process_raw_text(row[:forum_desc]) } + changes = { raw: (@text_processor.process_raw_text(row[:forum_desc]) rescue row[:forum_desc]) } opts = { revised_at: post.created_at, bypass_bump: true } post.revise(Discourse.system_user, changes, opts) end diff --git a/script/import_scripts/phpbb3/importers/importer_factory.rb b/script/import_scripts/phpbb3/importers/importer_factory.rb index dffb4c4f7f7..b02cb92ff0d 100644 --- a/script/import_scripts/phpbb3/importers/importer_factory.rb +++ b/script/import_scripts/phpbb3/importers/importer_factory.rb @@ -32,7 +32,7 @@ module ImportScripts::PhpBB3 end def category_importer - CategoryImporter.new(@lookup, text_processor, permalink_importer) + CategoryImporter.new(@lookup, text_processor, permalink_importer, @settings) end def post_importer @@ -44,15 +44,13 @@ module ImportScripts::PhpBB3 end def bookmark_importer - BookmarkImporter.new + BookmarkImporter.new(@settings) end def permalink_importer @permalink_importer ||= PermalinkImporter.new(@settings.permalinks) end - protected - def attachment_importer AttachmentImporter.new(@database, @uploader, @settings, @phpbb_config) end @@ -62,15 +60,15 @@ module ImportScripts::PhpBB3 end def poll_importer - PollImporter.new(@lookup, @database, text_processor) + PollImporter.new(@lookup, @database, text_processor, @settings) end def text_processor - @text_processor ||= TextProcessor.new(@lookup, @database, smiley_processor, @settings) + @text_processor ||= TextProcessor.new(@lookup, @database, smiley_processor, @settings, @phpbb_config) end def smiley_processor - SmileyProcessor.new(@uploader, @settings, @phpbb_config) + SmileyProcessor.new(@uploader, @database, @settings, @phpbb_config) end end end diff --git a/script/import_scripts/phpbb3/importers/message_importer.rb b/script/import_scripts/phpbb3/importers/message_importer.rb index 8f8218d7327..efdcd9d1231 100644 --- a/script/import_scripts/phpbb3/importers/message_importer.rb +++ 
b/script/import_scripts/phpbb3/importers/message_importer.rb @@ -20,7 +20,7 @@ module ImportScripts::PhpBB3 end def map_message(row) - user_id = @lookup.user_id_from_imported_user_id(row[:author_id]) || Discourse.system_user.id + user_id = @lookup.user_id_from_imported_user_id(@settings.prefix(row[:author_id])) || Discourse.system_user.id attachments = import_attachments(row, user_id) mapped = { @@ -84,7 +84,7 @@ module ImportScripts::PhpBB3 import_user_ids = get_recipient_user_ids(row[:to_address]) import_user_ids.map! do |import_user_id| - @lookup.find_user_by_import_id(import_user_id).try(:username) + @lookup.find_user_by_import_id(@settings.prefix(import_user_id)).try(:username) end.compact end @@ -93,7 +93,7 @@ module ImportScripts::PhpBB3 end def get_import_id(msg_id) - "pm:#{msg_id}" + @settings.prefix("pm:#{msg_id}") end # Creates a sorted array consisting of the message's author and recipients. diff --git a/script/import_scripts/phpbb3/importers/permalink_importer.rb b/script/import_scripts/phpbb3/importers/permalink_importer.rb index 5afd18e9053..051604ba87e 100644 --- a/script/import_scripts/phpbb3/importers/permalink_importer.rb +++ b/script/import_scripts/phpbb3/importers/permalink_importer.rb @@ -39,7 +39,7 @@ module ImportScripts::PhpBB3 end def create_for_post(post, import_id) - return unless @settings.create_topic_links && post + return unless @settings.create_post_links && post url = "viewtopic.php?p=#{import_id}" diff --git a/script/import_scripts/phpbb3/importers/poll_importer.rb b/script/import_scripts/phpbb3/importers/poll_importer.rb index 82066bf3dea..fe43ffe65c2 100644 --- a/script/import_scripts/phpbb3/importers/poll_importer.rb +++ b/script/import_scripts/phpbb3/importers/poll_importer.rb @@ -5,10 +5,12 @@ module ImportScripts::PhpBB3 # @param lookup [ImportScripts::LookupContainer] # @param database [ImportScripts::PhpBB3::Database_3_0 | ImportScripts::PhpBB3::Database_3_1] # @param text_processor [ImportScripts::PhpBB3::TextProcessor] - 
def initialize(lookup, database, text_processor) + # @param settings [ImportScripts::PhpBB3::Settings] + def initialize(lookup, database, text_processor, settings) @lookup = lookup @database = database @text_processor = text_processor + @settings = settings end # @param poll_data [ImportScripts::PhpBB3::PollData] @@ -47,7 +49,7 @@ module ImportScripts::PhpBB3 end def get_option_text(row) - text = @text_processor.process_raw_text(row[:poll_option_text]) + text = @text_processor.process_raw_text(row[:poll_option_text]) rescue row[:poll_option_text] text.squish! text.gsub!(/^(\d+)\./, '\1\.') text @@ -55,7 +57,7 @@ module ImportScripts::PhpBB3 # @param poll_data [ImportScripts::PhpBB3::PollData] def get_poll_text(poll_data) - title = @text_processor.process_raw_text(poll_data.title) + title = @text_processor.process_raw_text(poll_data.title) rescue poll_data.title text = +"#{title}\n\n" arguments = ["results=always"] @@ -118,7 +120,7 @@ module ImportScripts::PhpBB3 rows.each do |row| option_id = mapped_option_ids[row[:poll_option_id]] - user_id = @lookup.user_id_from_imported_user_id(row[:user_id]) + user_id = @lookup.user_id_from_imported_user_id(@settings.prefix(row[:user_id])) if option_id.present? && user_id.present? PollVote.create!(poll: poll, poll_option_id: option_id, user_id: user_id) diff --git a/script/import_scripts/phpbb3/importers/post_importer.rb b/script/import_scripts/phpbb3/importers/post_importer.rb index c0c0ee15a7f..8355835568f 100644 --- a/script/import_scripts/phpbb3/importers/post_importer.rb +++ b/script/import_scripts/phpbb3/importers/post_importer.rb @@ -18,22 +18,24 @@ module ImportScripts::PhpBB3 end def map_to_import_ids(rows) - rows.map { |row| row[:post_id] } + rows.map { |row| @settings.prefix(row[:post_id]) } end def map_post(row) - imported_user_id = row[:post_username].blank? ? 
row[:poster_id] : row[:post_username] + return if @settings.category_mappings[row[:forum_id].to_s] == 'SKIP' + + imported_user_id = @settings.prefix(row[:post_username].blank? ? row[:poster_id] : row[:post_username]) user_id = @lookup.user_id_from_imported_user_id(imported_user_id) || -1 is_first_post = row[:post_id] == row[:topic_first_post_id] attachments = import_attachments(row, user_id) mapped = { - id: row[:post_id], + id: @settings.prefix(row[:post_id]), user_id: user_id, created_at: Time.zone.at(row[:post_time]), raw: @text_processor.process_post(row[:post_text], attachments), - import_topic_id: row[:topic_id] + import_topic_id: @settings.prefix(row[:topic_id]) } if is_first_post @@ -54,14 +56,18 @@ module ImportScripts::PhpBB3 def map_first_post(row, mapped) poll_data = add_poll(row, mapped) if @settings.import_polls - mapped[:category] = @lookup.category_id_from_imported_category_id(row[:forum_id]) + mapped[:category] = @lookup.category_id_from_imported_category_id(@settings.prefix(@settings.category_mappings[row[:forum_id].to_s])) || + @lookup.category_id_from_imported_category_id(@settings.prefix(row[:forum_id])) mapped[:title] = CGI.unescapeHTML(row[:topic_title]).strip[0...255] mapped[:pinned_at] = mapped[:created_at] unless row[:topic_type] == Constants::POST_NORMAL mapped[:pinned_globally] = row[:topic_type] == Constants::POST_GLOBAL mapped[:views] = row[:topic_views] mapped[:post_create_action] = proc do |post| - @permalink_importer.create_for_topic(post.topic, row[:topic_id]) - @permalink_importer.create_for_post(post, row[:post_id]) + if tags = @settings.tag_mappings[row[:forum_id].to_s].presence + DiscourseTagging.tag_topic_by_names(post.topic, staff_guardian, tags) + end + @permalink_importer.create_for_topic(post.topic, row[:topic_id]) # skip @settings.prefix because ID is used in permalink generation + @permalink_importer.create_for_post(post, row[:post_id]) # skip @settings.prefix because ID is used in permalink generation 
@poll_importer.update_poll(row[:topic_id], post, poll_data) if poll_data TopicViewItem.add(post.topic_id, row[:poster_ip], post.user_id, post.created_at, true) end @@ -70,16 +76,16 @@ module ImportScripts::PhpBB3 end def map_other_post(row, mapped) - parent = @lookup.topic_lookup_from_imported_post_id(row[:topic_first_post_id]) + parent = @lookup.topic_lookup_from_imported_post_id(@settings.prefix(row[:topic_first_post_id])) if parent.blank? - puts "Parent post #{row[:topic_first_post_id]} doesn't exist. Skipping #{row[:post_id]}: #{row[:topic_title][0..40]}" + puts "Parent post #{@settings.prefix(row[:topic_first_post_id])} doesn't exist. Skipping #{@settings.prefix(row[:post_id])}: #{row[:topic_title][0..40]}" return nil end mapped[:topic_id] = parent[:topic_id] mapped[:post_create_action] = proc do |post| - @permalink_importer.create_for_post(post, row[:post_id]) + @permalink_importer.create_for_post(post, row[:post_id]) # skip @settings.prefix because ID is used in permalink generation TopicViewItem.add(post.topic_id, row[:poster_ip], post.user_id, post.created_at, true) end @@ -91,9 +97,14 @@ module ImportScripts::PhpBB3 poll_data = PollData.new(row[:poll_title], row[:poll_max_options], row[:poll_end]) poll_raw = @poll_importer.create_raw(row[:topic_id], poll_data) + return if poll_data.options.size < 2 mapped_post[:raw] = poll_raw << "\n\n" << mapped_post[:raw] poll_data end + + def staff_guardian + @_staff_guardian ||= Guardian.new(Discourse.system_user) + end end end diff --git a/script/import_scripts/phpbb3/importers/user_importer.rb b/script/import_scripts/phpbb3/importers/user_importer.rb index 80655006e18..bf12bbc5d62 100644 --- a/script/import_scripts/phpbb3/importers/user_importer.rb +++ b/script/import_scripts/phpbb3/importers/user_importer.rb @@ -12,14 +12,18 @@ module ImportScripts::PhpBB3 end def map_users_to_import_ids(rows) - rows.map { |row| row[:user_id] } + rows.map { |row| @settings.prefix(row[:user_id]) } end def map_user(row) 
is_active_user = row[:user_inactive_reason] != Constants::INACTIVE_REGISTER + trust_level = row[:user_posts] == 0 ? TrustLevel[0] : TrustLevel[1] + trust_level = @settings.trust_level_for_posts(row[:user_posts], trust_level: trust_level) + manual_locked_trust_level = trust_level > TrustLevel[1] ? trust_level : nil + { - id: row[:user_id], + id: @settings.prefix(row[:user_id]), email: row[:user_email], username: row[:username], password: @settings.import_passwords ? row[:user_password] : nil, @@ -28,7 +32,8 @@ module ImportScripts::PhpBB3 last_seen_at: row[:user_lastvisit] == 0 ? Time.zone.at(row[:user_regdate]) : Time.zone.at(row[:user_lastvisit]), registration_ip_address: (IPAddr.new(row[:user_ip]) rescue nil), active: is_active_user, - trust_level: row[:user_posts] == 0 ? TrustLevel[0] : TrustLevel[1], + trust_level: trust_level, + manual_locked_trust_level: manual_locked_trust_level, approved: is_active_user, approved_by_id: is_active_user ? Discourse.system_user.id : nil, approved_at: is_active_user ? Time.now : nil, @@ -45,14 +50,14 @@ module ImportScripts::PhpBB3 end def map_anonymous_users_to_import_ids(rows) - rows.map { |row| row[:post_username] } + rows.map { |row| @settings.prefix(row[:post_username]) } end def map_anonymous_user(row) username = row[:post_username] { - id: username, + id: @settings.prefix(username), email: "anonymous_#{SecureRandom.hex}@no-email.invalid", username: username, name: @settings.username_as_name ? username : '', diff --git a/script/import_scripts/phpbb3/settings.yml b/script/import_scripts/phpbb3/settings.yml index e884dfc12fd..0598dc3039c 100644 --- a/script/import_scripts/phpbb3/settings.yml +++ b/script/import_scripts/phpbb3/settings.yml @@ -11,6 +11,73 @@ database: batch_size: 1000 # Don't change this unless you know what you're doing. The default (1000) should work just fine. import: + # Set this if you import multiple phpBB forums into a single Discourse forum. 
+ # + # For example, when importing multiple sites, prefix all imported IDs + # with 'first' to avoid conflicts. Subsequent import runs must have a + # different 'site_name'. + # + # site_name: first + # + site_name: + + # Create new categories + # + # For example, to create a parent category and a subcategory. + # + # new_categories: + # - forum_id: foo + # name: Foo Category + # - forum_id: bar + # name: Bar Category + # parent_id: foo + # + new_categories: + + # Category mappings + # + # For example, topics from phpBB category 1 and 2 will be imported + # in the new "Foo Category" category, topics from phpBB category 3 + # will be imported in subcategory "Bar Category", topics from phpBB + # category 4 will be merged into category 5 and category 6 will be + # skipped. + # + # category_mappings: + # 1: foo + # 2: foo + # 3: bar + # 4: 5 + # 6: SKIP + # + category_mappings: + + # Tag mappings + # + # For example, imported topics from phpBB category 1 will be tagged + # with 'first-category', etc. + # + # tag_mappings: + # 1: + # - first-category + # 2: + # - second-category + # 3: + # - third-category + # + tag_mappings: + + # Rank to trust level mapping + # + # Map phpBB 3.x post counts (rank thresholds) to trust levels. + # Users with at least 3000 posts will have TL3, etc. + # + # rank_mapping: + # trust_level_1: 200 + # trust_level_2: 1000 + # trust_level_3: 3000 + # + rank_mapping: + + # WARNING: Do not activate this option unless you know what you are doing. # It will probably break the BBCode to Markdown conversion and slows down your import. 
use_bbcode_to_md: false diff --git a/script/import_scripts/phpbb3/support/bbcode/markdown_node.rb b/script/import_scripts/phpbb3/support/bbcode/markdown_node.rb new file mode 100644 index 00000000000..5a42a1bf408 --- /dev/null +++ b/script/import_scripts/phpbb3/support/bbcode/markdown_node.rb @@ -0,0 +1,88 @@ +# frozen_string_literal: true + +module ImportScripts; end +module ImportScripts::PhpBB3; end + +module ImportScripts::PhpBB3::BBCode + LINEBREAK_AUTO = :auto + LINEBREAK_HARD = :hard + LINEBREAK_HTML = :html + + class MarkdownNode + # @return [String] + attr_reader :xml_node_name + + # @return [MarkdownNode] + attr_reader :parent + + # @return [Array] + attr_reader :children + + # @return [Array] + attr_accessor :previous_sibling + + # @return [Array] + attr_accessor :next_sibling + + # @return [String] + attr_accessor :text + + # @return [String] + attr_accessor :prefix + + # @return [String] + attr_accessor :postfix + + # @return [Integer] + attr_accessor :prefix_linebreaks + + # @return [Integer] + attr_accessor :postfix_linebreaks + + # @return [Symbol] + attr_accessor :prefix_linebreak_type + + # @return [Symbol] + attr_accessor :postfix_linebreak_type + + # @return [String] + attr_accessor :prefix_children + + # @param xml_node_name [String] + # @param parent [MarkdownNode] + def initialize(xml_node_name:, parent:) + @xml_node_name = xml_node_name + + @text = +"" + @prefix = +"" + @postfix = +"" + + @prefix_linebreaks = 0 + @postfix_linebreaks = 0 + + @prefix_linebreak_type = LINEBREAK_AUTO + @postfix_linebreak_type = LINEBREAK_AUTO + + @parent = parent + @children = [] + + if @parent + @previous_sibling = @parent.children.last + @previous_sibling.next_sibling = self if @previous_sibling + @parent.children << self + end + end + + def enclosed_with=(text) + @prefix = @postfix = text + end + + def skip_children + @children = nil + end + + def to_s + "name: #{xml_node_name}, prefix: #{prefix}, text: #{text}, children: #{children.size}, postfix: 
#{postfix}" + end + end +end diff --git a/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown.rb b/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown.rb new file mode 100644 index 00000000000..972c4a6c453 --- /dev/null +++ b/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown.rb @@ -0,0 +1,356 @@ +# frozen_string_literal: true + +require 'nokogiri' +require_relative 'markdown_node' + +module ImportScripts::PhpBB3::BBCode + class XmlToMarkdown + def initialize(xml, opts = {}) + @username_from_user_id = opts[:username_from_user_id] + @smilie_to_emoji = opts[:smilie_to_emoji] + @quoted_post_from_post_id = opts[:quoted_post_from_post_id] + @upload_md_from_file = opts[:upload_md_from_file] + @url_replacement = opts[:url_replacement] + @allow_inline_code = opts.fetch(:allow_inline_code, false) + @traditional_linebreaks = opts.fetch(:traditional_linebreaks, false) + + @doc = Nokogiri::XML(xml) + @list_stack = [] + end + + def convert + preprocess_xml + + md_root = MarkdownNode.new(xml_node_name: "ROOT", parent: nil) + visit(@doc.root, md_root) + to_markdown(md_root).rstrip + end + + private + + IGNORED_ELEMENTS = ["s", "e", "i"] + ELEMENTS_WITHOUT_LEADING_WHITESPACES = ["LIST", "LI"] + ELEMENTS_WITH_HARD_LINEBREAKS = ["B", "I", "U"] + EXPLICIT_LINEBREAK_THRESHOLD = 2 + + def preprocess_xml + @doc.traverse do |node| + if node.is_a? Nokogiri::XML::Text + node.content = node.content.gsub(/\A\n+\s*/, "") + node.content = node.content.lstrip if remove_leading_whitespaces?(node) + node.remove if node.content.empty? 
+ elsif IGNORED_ELEMENTS.include?(node.name) + node.remove + end + end + end + + def remove_leading_whitespaces?(xml_node) + parent = xml_node.parent + return false unless parent + + ELEMENTS_WITHOUT_LEADING_WHITESPACES.include?(parent.name) && + parent.children.first == xml_node + end + + def visit(xml_node, md_parent) + visitor = "visit_#{xml_node.name}" + visitor_exists = respond_to?(visitor, include_all: true) + + if visitor_exists && md_parent.children + md_node = create_node(xml_node, md_parent) + send(visitor, xml_node, md_node) + end + + xml_node.children.each { |xml_child| visit(xml_child, md_node || md_parent) } + + after_hook = "after_#{xml_node.name}" + if respond_to?(after_hook, include_all: true) + send(after_hook, xml_node, md_node) + end + end + + def create_node(xml_node, md_parent) + if xml_node.name == "br" + last_child = md_parent.children.last + return last_child if last_child&.xml_node_name == "br" + end + + MarkdownNode.new(xml_node_name: xml_node.name, parent: md_parent) + end + + def visit_text(xml_node, md_node) + md_node.text << text(xml_node) + end + + def visit_B(xml_node, md_node) + if xml_node.parent&.name != 'B' + md_node.enclosed_with = "**" + end + end + + def visit_I(xml_node, md_node) + if xml_node.parent&.name != 'I' + md_node.enclosed_with = "_" + end + end + + def visit_U(xml_node, md_node) + if xml_node.parent&.name != 'U' + md_node.prefix = "[u]" + md_node.postfix = "[/u]" + end + end + + def visit_CODE(xml_node, md_node) + content = xml_node.content + + if !@allow_inline_code || content.include?("\n") + md_node.prefix = "```text\n" + md_node.postfix = "\n```" + else + md_node.enclosed_with = "`" + end + + md_node.text = content.rstrip + md_node.skip_children + md_node.prefix_linebreaks = md_node.postfix_linebreaks = 2 + md_node.prefix_linebreak_type = LINEBREAK_HTML + end + + def visit_LIST(xml_node, md_node) + md_node.prefix_linebreaks = md_node.postfix_linebreaks = @list_stack.size == 0 ? 
2 : 1 + md_node.prefix_linebreak_type = LINEBREAK_HTML if @list_stack.size == 0 + + @list_stack << { + unordered: xml_node.attribute('type').nil?, + item_count: 0 + } + end + + def after_LIST(xml_node, md_node) + @list_stack.pop + end + + def visit_LI(xml_node, md_node) + list = @list_stack.last + depth = @list_stack.size - 1 + + list[:item_count] += 1 + + indentation = ' ' * 2 * depth + symbol = list[:unordered] ? '*' : "#{list[:item_count]}." + + md_node.prefix = "#{indentation}#{symbol} " + md_node.postfix_linebreaks = 1 + end + + def visit_IMG(xml_node, md_node) + md_node.text = +"![](#{xml_node.attribute('src')})" + md_node.prefix_linebreaks = md_node.postfix_linebreaks = 2 + md_node.skip_children + end + + def visit_URL(xml_node, md_node) + original_url = xml_node.attribute('url').to_s + url = CGI.unescapeHTML(original_url) + url = @url_replacement.call(url) if @url_replacement + + if xml_node.content.strip == original_url + md_node.text = url + md_node.skip_children + else + md_node.prefix = "[" + md_node.postfix = "](#{url})" + end + end + + def visit_EMAIL(xml_node, md_node) + md_node.prefix = "<" + md_node.postfix = ">" + end + + def visit_br(xml_node, md_node) + md_node.postfix_linebreaks += 1 + + if md_node.postfix_linebreaks > 1 && ELEMENTS_WITH_HARD_LINEBREAKS.include?(xml_node.parent&.name) + md_node.postfix_linebreak_type = LINEBREAK_HARD + end + end + + def visit_E(xml_node, md_node) + if @smilie_to_emoji + md_node.text = @smilie_to_emoji.call(xml_node.content) + md_node.skip_children + end + end + + def visit_QUOTE(xml_node, md_node) + if post = quoted_post(xml_node) + md_node.prefix = %Q{[quote="#{post[:username]}, post:#{post[:post_number]}, topic:#{post[:topic_id]}"]\n} + md_node.postfix = "\n[/quote]" + elsif username = quoted_username(xml_node) + md_node.prefix = %Q{[quote="#{username}"]\n} + md_node.postfix = "\n[/quote]" + else + md_node.prefix_children = "> " + end + + md_node.prefix_linebreaks = md_node.postfix_linebreaks = 2 + 
md_node.prefix_linebreak_type = LINEBREAK_HTML + end + + def quoted_post(xml_node) + if @quoted_post_from_post_id + post_id = to_i(xml_node.attr("post_id")) + @quoted_post_from_post_id.call(post_id) if post_id + end + end + + def quoted_username(xml_node) + if @username_from_user_id + user_id = to_i(xml_node.attr("user_id")) + username = @username_from_user_id.call(user_id) if user_id + end + + username = xml_node.attr("author") unless username + username + end + + def to_i(string) + string.to_i if string&.match(/\A\d+\z/) + end + + def visit_ATTACHMENT(xml_node, md_node) + filename = xml_node.attr("filename") + index = to_i(xml_node.attr("index")) + + md_node.text = @upload_md_from_file.call(filename, index) if @upload_md_from_file + md_node.prefix_linebreaks = md_node.postfix_linebreaks = 1 + md_node.skip_children + end + + def visit_SIZE(xml_node, md_node) + size = to_i(xml_node.attr("size")) + return if size.nil? + + if size.between?(1, 99) + md_node.prefix = '' + md_node.postfix = '' + elsif size.between?(101, 200) + md_node.prefix = '' + md_node.postfix = '' + end + end + + def text(xml_node, escape_markdown: true) + text = CGI.unescapeHTML(xml_node.text) + # text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" } if escape_markdown + text + end + + # @param md_parent [MarkdownNode] + def to_markdown(md_parent) + markdown = +"" + + md_parent.children.each do |md_node| + prefix = md_node.prefix + text = md_node.children&.any? ? to_markdown(md_node) : md_node.text + postfix = md_node.postfix + + parent_prefix = prefix_from_parent(md_parent) + + if parent_prefix && md_node.xml_node_name != "br" && (md_parent.prefix_children || !markdown.empty?) 
+ prefix = "#{parent_prefix}#{prefix}" + end + + if md_node.xml_node_name != "CODE" + text, prefix, postfix = hoist_whitespaces!(markdown, text, prefix, postfix) + end + + add_linebreaks!(markdown, md_node.prefix_linebreaks, md_node.prefix_linebreak_type, parent_prefix) + markdown << prefix + markdown << text + markdown << postfix + add_linebreaks!(markdown, md_node.postfix_linebreaks, md_node.postfix_linebreak_type, parent_prefix) + end + + markdown + end + + def hoist_whitespaces!(markdown, text, prefix, postfix) + text = text.lstrip if markdown.end_with?("\n") + + unless prefix.empty? + if starts_with_whitespace?(text) && !ends_with_whitespace?(markdown) + prefix = "#{text[0]}#{prefix}" + end + text = text.lstrip + end + + unless postfix.empty? + if ends_with_whitespace?(text) + postfix = "#{postfix}#{text[-1]}" + end + text = text.rstrip + end + + [text, prefix, postfix] + end + + def prefix_from_parent(md_parent) + while md_parent + return md_parent.prefix_children if md_parent.prefix_children + md_parent = md_parent.parent + end + end + + def add_linebreaks!(markdown, required_linebreak_count, linebreak_type, prefix = nil) + return if required_linebreak_count == 0 || markdown.empty? + + existing_linebreak_count = markdown[/(?:\\?\n|
\n)*\z/].count("\n") + + if linebreak_type == LINEBREAK_HTML + max_linebreak_count = [existing_linebreak_count, required_linebreak_count - 1].max + 1 + required_linebreak_count = max_linebreak_count if max_linebreak_count > EXPLICIT_LINEBREAK_THRESHOLD + end + + return if existing_linebreak_count >= required_linebreak_count + + rstrip!(markdown) + alternative_linebreak_start_index = required_linebreak_count > EXPLICIT_LINEBREAK_THRESHOLD ? 1 : 2 + + required_linebreak_count.times do |index| + linebreak = linebreak(linebreak_type, index, alternative_linebreak_start_index, required_linebreak_count) + + markdown << (linebreak == "\n" ? prefix.rstrip : prefix) if prefix && index > 0 + markdown << linebreak + end + end + + def rstrip!(markdown) + markdown.gsub!(/\s*(?:\\?\n|
\n)*\z/, '') + end + + def linebreak(linebreak_type, linebreak_index, alternative_linebreak_start_index, required_linebreak_count) + use_alternative_linebreak = linebreak_index >= alternative_linebreak_start_index + is_last_linebreak = linebreak_index + 1 == required_linebreak_count + + return "
\n" if linebreak_type == LINEBREAK_HTML && + use_alternative_linebreak && is_last_linebreak + + return "\\\n" if linebreak_type == LINEBREAK_HARD || + @traditional_linebreaks || use_alternative_linebreak + + "\n" + end + + def starts_with_whitespace?(text) + text.match?(/\A\s/) + end + + def ends_with_whitespace?(text) + text.match?(/\s\z/) + end + end +end diff --git a/script/import_scripts/phpbb3/support/settings.rb b/script/import_scripts/phpbb3/support/settings.rb index 8b1ebf706f8..ee01a66e3ac 100644 --- a/script/import_scripts/phpbb3/support/settings.rb +++ b/script/import_scripts/phpbb3/support/settings.rb @@ -1,14 +1,23 @@ # frozen_string_literal: true +require 'csv' require 'yaml' +require_relative '../../base' module ImportScripts::PhpBB3 class Settings def self.load(filename) yaml = YAML::load_file(filename) - Settings.new(yaml) + Settings.new(yaml.deep_stringify_keys.with_indifferent_access) end + attr_reader :site_name + + attr_reader :new_categories + attr_reader :category_mappings + attr_reader :tag_mappings + attr_reader :rank_mapping + attr_reader :import_anonymous_users attr_reader :import_attachments attr_reader :import_private_messages @@ -34,6 +43,14 @@ module ImportScripts::PhpBB3 def initialize(yaml) import_settings = yaml['import'] + + @site_name = import_settings['site_name'] + + @new_categories = import_settings['new_categories'] + @category_mappings = import_settings['category_mappings'] + @tag_mappings = import_settings['tag_mappings'] + @rank_mapping = import_settings['rank_mapping'] + @import_anonymous_users = import_settings['anonymous_users'] @import_attachments = import_settings['attachments'] @import_private_messages = import_settings['private_messages'] @@ -58,6 +75,20 @@ module ImportScripts::PhpBB3 @database = DatabaseSettings.new(yaml['database']) end + + def prefix(val) + @site_name.present? && val.present? ? "#{@site_name}:#{val}" : val + end + + def trust_level_for_posts(rank, trust_level: 0) + if @rank_mapping.present? 
+ @rank_mapping.each do |key, value| + trust_level = [trust_level, key.gsub('trust_level_', '').to_i].max if rank >= value + end + end + + trust_level + end end class DatabaseSettings diff --git a/script/import_scripts/phpbb3/support/smiley_processor.rb b/script/import_scripts/phpbb3/support/smiley_processor.rb index 29abc14f8ad..36df4035bb1 100644 --- a/script/import_scripts/phpbb3/support/smiley_processor.rb +++ b/script/import_scripts/phpbb3/support/smiley_processor.rb @@ -3,10 +3,12 @@ module ImportScripts::PhpBB3 class SmileyProcessor # @param uploader [ImportScripts::Uploader] + # @param database [ImportScripts::PhpBB3::Database_3_0 | ImportScripts::PhpBB3::Database_3_1] # @param settings [ImportScripts::PhpBB3::Settings] # @param phpbb_config [Hash] - def initialize(uploader, settings, phpbb_config) + def initialize(uploader, database, settings, phpbb_config) @uploader = uploader + @database = database @smilies_path = File.join(settings.base_dir, phpbb_config[:smilies_path]) @smiley_map = {} @@ -16,12 +18,16 @@ module ImportScripts::PhpBB3 def replace_smilies(text) # :) is encoded as :) - text.gsub!(/(.*?)/) do - smiley = $1 + text.gsub!(/.*?/) do + emoji($1) + end + end - @smiley_map.fetch(smiley) do - upload_smiley(smiley, $2, $3, $4) || smiley_as_text(smiley) - end + def emoji(smiley_code) + @smiley_map.fetch(smiley_code) do + smiley = @database.get_smiley(smiley_code) + emoji = upload_smiley(smiley_code, smiley[:smiley_url], smiley_code, smiley[:emotion]) if smiley + emoji || smiley_as_text(smiley_code) end end @@ -36,7 +42,7 @@ module ImportScripts::PhpBB3 [':o', ':-o', ':eek:'] => ':astonished:', [':shock:'] => ':open_mouth:', [':?', ':-?', ':???:'] => ':confused:', - ['8-)', ':cool:'] => ':sunglasses:', + ['8)', '8-)', ':cool:'] => ':sunglasses:', [':lol:'] => ':laughing:', [':x', ':-x', ':mad:'] => ':angry:', [':P', ':-P', ':razz:'] => ':stuck_out_tongue:', diff --git a/script/import_scripts/phpbb3/support/text_processor.rb 
b/script/import_scripts/phpbb3/support/text_processor.rb index 158864b1c9e..b5b9d9a54e3 100644 --- a/script/import_scripts/phpbb3/support/text_processor.rb +++ b/script/import_scripts/phpbb3/support/text_processor.rb @@ -1,48 +1,75 @@ # frozen_string_literal: true +require_relative 'bbcode/xml_to_markdown' + module ImportScripts::PhpBB3 class TextProcessor # @param lookup [ImportScripts::LookupContainer] # @param database [ImportScripts::PhpBB3::Database_3_0 | ImportScripts::PhpBB3::Database_3_1] # @param smiley_processor [ImportScripts::PhpBB3::SmileyProcessor] # @param settings [ImportScripts::PhpBB3::Settings] - def initialize(lookup, database, smiley_processor, settings) + # @param phpbb_config [Hash] + def initialize(lookup, database, smiley_processor, settings, phpbb_config) @lookup = lookup @database = database @smiley_processor = smiley_processor @he = HTMLEntities.new + @use_xml_to_markdown = phpbb_config[:phpbb_version].start_with?('3.2') @settings = settings @new_site_prefix = settings.new_site_prefix create_internal_link_regexps(settings.original_site_prefix) end - def process_raw_text(raw) - text = raw.dup - text = CGI.unescapeHTML(text) + def process_raw_text(raw, attachments = nil) + if @use_xml_to_markdown + unreferenced_attachments = attachments&.dup - clean_bbcodes(text) - if @settings.use_bbcode_to_md - text = bbcode_to_md(text) + converter = BBCode::XmlToMarkdown.new( + raw, + username_from_user_id: lambda { |user_id| @lookup.find_username_by_import_id(user_id) }, + smilie_to_emoji: lambda { |smilie| @smiley_processor.emoji(smilie).dup }, + quoted_post_from_post_id: lambda { |post_id| @lookup.topic_lookup_from_imported_post_id(post_id) }, + upload_md_from_file: (lambda do |filename, index| + unreferenced_attachments[index] = nil + attachments.fetch(index, filename).dup + end if attachments), + url_replacement: nil, + allow_inline_code: false + ) + + text = converter.convert + + text.gsub!(@short_internal_link_regexp) do |link| + 
replace_internal_link(link, $1, $2) + end + + add_unreferenced_attachments(text, unreferenced_attachments) + else + text = raw.dup + text = CGI.unescapeHTML(text) + + clean_bbcodes(text) + if @settings.use_bbcode_to_md + text = bbcode_to_md(text) + end + process_smilies(text) + process_links(text) + process_lists(text) + process_code(text) + fix_markdown(text) + process_attachments(text, attachments) if attachments.present? + + text end - process_smilies(text) - process_links(text) - process_lists(text) - process_code(text) - fix_markdown(text) - text end def process_post(raw, attachments) - text = process_raw_text(raw) - text = process_attachments(text, attachments) if attachments.present? - text + process_raw_text(raw, attachments) rescue raw end def process_private_msg(raw, attachments) - text = process_raw_text(raw) - text = process_attachments(text, attachments) if attachments.present? - text + process_raw_text(raw, attachments) rescue raw end protected @@ -139,6 +166,12 @@ module ImportScripts::PhpBB3 attachments.fetch(index, real_filename) end + add_unreferenced_attachments(text, unreferenced_attachments) + end + + def add_unreferenced_attachments(text, unreferenced_attachments) + return text unless unreferenced_attachments + unreferenced_attachments = unreferenced_attachments.compact text << "\n" << unreferenced_attachments.join("\n") unless unreferenced_attachments.empty? 
text @@ -161,6 +194,7 @@ module ImportScripts::PhpBB3 def fix_markdown(text) text.gsub!(/(\n*\[\/?quote.*?\]\n*)/mi) { |q| "\n#{q.strip}\n" } + text.gsub!(/^!\[[^\]]*\]\([^\]]*\)$/i) { |img| "\n#{img.strip}\n" } # space out images single on line text end end diff --git a/spec/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown_spec.rb b/spec/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown_spec.rb new file mode 100644 index 00000000000..dc4e813b981 --- /dev/null +++ b/spec/script/import_scripts/phpbb3/support/bbcode/xml_to_markdown_spec.rb @@ -0,0 +1,817 @@ +# frozen_string_literal: true + +require 'rails_helper' +require Rails.root.join('script/import_scripts/phpbb3/support/bbcode/xml_to_markdown') + +RSpec.describe ImportScripts::PhpBB3::BBCode::XmlToMarkdown do + def convert(xml, opts = {}) + described_class.new(xml, opts).convert + end + + it "converts unformatted text" do + xml = 'unformatted text' + expect(convert(xml)).to eq('unformatted text') + end + + it "converts nested formatting" do + xml = '[i]this is italic[b] and bold[/b] text[/i]' + expect(convert(xml)).to eq('_this is italic **and bold** text_') + end + + context "bold text" do + it "converts bold text" do + xml = '[b]this is bold text[/b]' + expect(convert(xml)).to eq('**this is bold text**') + end + + it "converts multi-line bold text" do + xml = <<~XML + [b]this is bold text
+ on two lines[/b]

+
+ [b]this is bold text
+
+
+ with two empty lines[/b]
+ XML + + expect(convert(xml)).to eq(<<~MD.chomp) + **this is bold text + on two lines** + + **this is bold text\\ + \\ + \\ + with two empty lines** + MD + end + + it "ignores duplicate bold text" do + xml = '[b][b]this is bold text[/b][/b]' + expect(convert(xml)).to eq('**this is bold text**') + end + end + + context "italic text" do + it "converts italic text" do + xml = '[i]this is italic text[/i]' + expect(convert(xml)).to eq('_this is italic text_') + end + + it "converts multi-line italic text" do + xml = <<~XML + [i]this is italic text
+ on two lines[/i]

+
+ [i]this is italic text
+
+
+ with two empty lines[/i]
+ XML + + expect(convert(xml)).to eq(<<~MD.chomp) + _this is italic text + on two lines_ + + _this is italic text\\ + \\ + \\ + with two empty lines_ + MD + end + + it "ignores duplicate italic text" do + xml = '[i][i]this is italic text[/i][/i]' + expect(convert(xml)).to eq('_this is italic text_') + end + end + + context "underlined text" do + it "converts underlined text" do + xml = '[u]this is underlined text[/u]' + expect(convert(xml)).to eq('[u]this is underlined text[/u]') + end + + it "converts multi-line underlined text" do + xml = <<~XML + [u]this is underlined text
+ on two lines[/u]

+
+ [u]this is underlined text
+
+
+ with two empty lines[/u]
+ XML + + expect(convert(xml)).to eq(<<~MD.chomp) + [u]this is underlined text + on two lines[/u] + + [u]this is underlined text\\ + \\ + \\ + with two empty lines[/u] + MD + end + + it "ignores duplicate underlined text" do + xml = '[u][u]this is underlined text[/u][/u]' + expect(convert(xml)).to eq('[u]this is underlined text[/u]') + end + end + + context "code blocks" do + context "inline code blocks enabled" do + let(:opts) { { allow_inline_code: true } } + + it "converts single line code blocks" do + xml = '[code]one line of code[/code]' + expect(convert(xml, opts)).to eq('`one line of code`') + end + end + + context "inline code blocks disabled" do + it "converts single line code blocks" do + xml = 'foo [code]some code[/code] bar' + + expect(convert(xml)).to eq(<<~MD.chomp) + foo + + ```text + some code + ``` + + bar + MD + end + end + + it "converts multi-line code blocks" do + xml = <<~XML + [code] + /\_/\ + ( o.o ) + > ^ < + [/code] + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + ```text + /\_/\ + ( o.o ) + > ^ < + ``` + MD + end + + it "adds leading and trailing linebreaks to code blocks" do + xml = <<~XML + text before code block
+ + [code] + foo + + bar + [/code] + + text after code block
+ XML + + expect(convert(xml)).to eq(<<~MD.chomp) + text before code block + + ```text + foo + + bar + ``` + + text after code block + MD + end + end + + context "lists" do + it "converts unordered lists" do + xml = <<~XML + [list] +
  • [*]Red
  • +
  • [*]Blue
  • +
  • [*]Yellow
  • + [/list]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + * Red + * Blue + * Yellow + MD + end + + it "converts ordered lists" do + xml = <<~XML + [list=1] +
  • [*]Go to the shops
  • +
  • [*]Buy a new computer
  • +
  • [*]Swear at computer when it crashes
  • + [/list]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + 1. Go to the shops + 2. Buy a new computer + 3. Swear at computer when it crashes + MD + end + + it "converts all types of ordered lists into regular ordered lists" do + xml = <<~XML + [list=A] +
  • [*]The first possible answer
  • +
  • [*]The second possible answer
  • +
  • [*]The third possible answer
  • + [/list]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + 1. The first possible answer + 2. The second possible answer + 3. The third possible answer + MD + end + + it "adds leading and trailing linebreaks to lists if needed" do + xml = <<~XML + foo + [list] +
  • [*]Red
  • +
  • [*]Blue
  • +
  • [*]Yellow
  • + [/list]
    + bar
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + foo + + * Red + * Blue + * Yellow + + bar + MD + end + + it "converts nested lists" do + xml = <<~XML + [list] +
  • [*]Option 1 + [list] +
  • [*]Option 1.1
  • +
  • [*]Option 1.2
  • + [/list]
    +
  • [*]Option 2 + [list] +
  • [*]Option 2.1 + [list=1] +
  • [*] Red
  • +
  • [*] Blue
  • + [/list] +
  • [*]Option 2.2
  • + [/list] + [/list]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + * Option 1 + * Option 1.1 + * Option 1.2 + * Option 2 + * Option 2.1 + 1. Red + 2. Blue + * Option 2.2 + MD + end + + it "handles nested elements and linebreaks in list items" do + xml = <<~XML + [list]
  • [*]some text [b][i]foo[/i][/b]
    + or [b][i]bar[/i][/b] more text
  • [/list]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + * some text **_foo_** + or **_bar_** more text + MD + end + end + + context "images" do + it "converts image" do + xml = <<~XML + [img] + + https://example.com/foo.png + [/img] + XML + + expect(convert(xml)).to eq('![](https://example.com/foo.png)') + end + + it "converts image with link" do + xml = <<~XML + [url=https://example.com/] + [img] + https://example.com/foo.png + [/img][/url] + XML + + expect(convert(xml)).to eq('[![](https://example.com/foo.png)](https://example.com/)') + end + end + + context "links" do + it "converts links created without BBCode" do + xml = 'https://en.wikipedia.org/wiki/Capybara' + expect(convert(xml)).to eq('https://en.wikipedia.org/wiki/Capybara') + end + + it "converts links created with BBCode" do + xml = '[url]https://en.wikipedia.org/wiki/Capybara[/url]' + expect(convert(xml)).to eq('https://en.wikipedia.org/wiki/Capybara') + end + + it "converts links with link text" do + xml = '[url=https://en.wikipedia.org/wiki/Capybara]Capybara[/url]' + expect(convert(xml)).to eq('[Capybara](https://en.wikipedia.org/wiki/Capybara)') + end + + it "converts internal links" do + opts = { + url_replacement: lambda do |url| + if url == 'http://forum.example.com/viewtopic.php?f=2&t=2' + 'https://discuss.example.com/t/welcome-topic/18' + end + end + } + + xml = 'http://forum.example.com/viewtopic.php?f=2&t=2' + expect(convert(xml, opts)).to eq('https://discuss.example.com/t/welcome-topic/18') + end + + it "converts email links created without BBCode" do + xml = 'foo.bar@example.com' + expect(convert(xml)).to eq('') + end + + it "converts email links created with BBCode" do + xml = '[email]foo.bar@example.com[/email]' + expect(convert(xml)).to eq('') + end + + it "converts truncated, long links" do + xml = <<~XML + + [url] + http://answers.yahoo.com/question/index?qid=20070920134223AAkkPli + [/url] + XML + + expect(convert(xml)).to 
eq('http://answers.yahoo.com/question/index?qid=20070920134223AAkkPli') + end + + it "converts BBCodes inside link text" do + xml = <<~XML + [url=http://example.com] + [b]Hello [i]world[/i]![/b] + [/url] + XML + + expect(convert(xml)).to eq('[**Hello _world_!**](http://example.com)') + end + end + + context "quotes" do + it "converts simple quote" do + xml = <<~XML + [quote]Lorem
    + ipsum[/quote]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + > Lorem + > ipsum + MD + end + + it "converts quote with line breaks" do + xml = <<~XML + [quote]First paragraph
    +
    + Second paragraph
    +
    +
    + Third paragraph[/quote]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + > First paragraph + > + > Second paragraph + > \\ + > \\ + > Third paragraph + MD + end + + it "converts quote with line breaks and nested formatting" do + xml = <<~XML + [quote] + [i]this is italic
    + [b]and bold
    + text
    + [/b]
    on multiple
    +
    +
    + lines[/i]
    + [/quote]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + > _this is italic + > **and bold + > text** + > on multiple\\ + > \\ + > \\ + > lines_ + MD + end + + it "converts quote with author attribute" do + xml = '[quote="Mr. Blobby"]Lorem ipsum[/quote]' + + expect(convert(xml)).to eq(<<~MD.chomp) + [quote="Mr. Blobby"] + Lorem ipsum + [/quote] + MD + end + + it "converts quote with author attribute and line breaks" do + xml = <<~XML + [quote="Mr. Blobby"]First paragraph
    +
    + Second paragraph
    +
    + Third paragraph[/quote]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + [quote="Mr. Blobby"] + First paragraph + + Second paragraph + + Third paragraph + [/quote] + MD + end + + context "with user_id attribute" do + let(:opts) { { username_from_user_id: lambda { |user_id| user_id == 48 ? "mr_blobby" : nil } } } + + it "uses the correct username when the user exists" do + xml = '[quote="Mr. Blobby" user_id=48]Lorem ipsum[/quote]' + + expect(convert(xml, opts)).to eq(<<~MD.chomp) + [quote="mr_blobby"] + Lorem ipsum + [/quote] + MD + end + + it "uses the author name when the user does not exist" do + xml = '[quote="Mr. Blobby" user_id=48]Lorem ipsum[/quote]' + + expect(convert(xml, opts)).to eq(<<~MD.chomp) + [quote="Mr. Blobby"] + Lorem ipsum + [/quote] + MD + end + + it "creates a blockquote when the user does not exist and the author is missing" do + xml = '[quote=user_id=48]Lorem ipsum[/quote]' + expect(convert(xml, opts)).to eq("> Lorem ipsum") + end + end + + context "with post_id attribute" do + let(:opts) do + { quoted_post_from_post_id: lambda { |post_id| { username: 'mr_blobby', post_number: 3, topic_id: 951 } if post_id == 43 } } + end + + it "uses information from the quoted post if the post exists" do + xml = <<~XML + + [quote="Mr. Blobby" post_id=43 time=1534626128 user_id=48]Lorem ipsum[/quote] + + XML + + expect(convert(xml, opts)).to eq(<<~MD.chomp) + [quote="mr_blobby, post:3, topic:951"] + Lorem ipsum + [/quote] + MD + end + + it "uses other attributes when post doesn't exist" do + xml = <<~XML + + [quote="Mr. Blobby" post_id=44 time=1534626128 user_id=48]Lorem ipsum[/quote] + + XML + + expect(convert(xml, opts)).to eq(<<~MD.chomp) + [quote="Mr. Blobby"] + Lorem ipsum + [/quote] + MD + end + end + + it "converts nested quotes" do + xml = <<~XML + Multiple nested quotes:
    + + + [quote=user3] + + [quote=user2] + + [quote=user1] + [b]foo [i]and[/i] bar[/b] + [/quote] + + + Lorem ipsum + [/quote] + + + nested quotes + [/quote] + + + Text after quotes. +
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + Multiple nested quotes: + + [quote="user3"] + [quote="user2"] + [quote="user1"] + **foo _and_ bar** + [/quote] + + Lorem ipsum + [/quote] + + nested quotes + [/quote] + + Text after quotes. + MD + end + end + + it "converts smilies" do + opts = { + smilie_to_emoji: lambda do |smilie| + case smilie + when ':D' + ':smiley:' + when ':eek:' + ':astonished:' + end + end + } + + xml = ':D :eek:' + expect(convert(xml, opts)).to eq(":smiley: :astonished:") + end + + context "attachments" do + it "converts attachments" do + opts = { + upload_md_from_file: lambda do |filename, index| + url = \ + case index + when 0 then + "upload://hash2.png" + when 1 then + "upload://hash1.png" + end + + "![#{filename}|231x231](#{url})" + end + } + + xml = <<~XML + Multiple attachments: + [attachment=1]image1.png[/attachment] + This is an inline image.
    +
    + And another one: + [attachment=0]image2.png[/attachment]
    + XML + + expect(convert(xml, opts)).to eq(<<~MD.chomp) + Multiple attachments: + ![image1.png|231x231](upload://hash1.png) + This is an inline image. + + And another one: + ![image2.png|231x231](upload://hash2.png) + MD + end + end + + context "line breaks" do + it "converts line breaks" do + xml = <<~XML + Lorem ipsum dolor sit amet.
    +
    + Consetetur sadipscing elitr.
    +
    +
    + Sed diam nonumy eirmod tempor.
    +
    +
    +
    +
    + Invidunt ut labore et dolore.
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + Lorem ipsum dolor sit amet. + + Consetetur sadipscing elitr. + \\ + \\ + Sed diam nonumy eirmod tempor. + \\ + \\ + \\ + \\ + Invidunt ut labore et dolore. + MD + end + + it "uses hard linebreaks when tradition line breaks are enabled" do + xml = <<~XML + Lorem ipsum dolor sit amet.
    + Consetetur sadipscing elitr.
    +
    + Sed diam nonumy eirmod tempor.
    +
    +
    +
    + Invidunt ut labore et dolore.
    + XML + + expect(convert(xml, traditional_linebreaks: true)).to eq(<<~MD.chomp) + Lorem ipsum dolor sit amet.\\ + Consetetur sadipscing elitr.\\ + \\ + Sed diam nonumy eirmod tempor.\\ + \\ + \\ + \\ + Invidunt ut labore et dolore. + MD + end + + it "uses
    in front of block elements" do + xml = <<~XML + text before 4 empty lines
    +
    +
    +
    + + [code]some code[/code] + text before 3 empty lines
    +
    +
    + + [list] +
  • [*] item 1
  • +
  • [*] item 2
  • + [/list]
    + text before 2 empty lines
    +
    + + [list] +
  • [*] item 1
  • +
  • [*] item 2
  • + [/list]
    + XML + + expect(convert(xml)).to eq(<<~MD.chomp) + text before 4 empty lines + \\ + \\ + \\ +
    + ```text + some code + ``` + + text before 3 empty lines + \\ + \\ +
    + * item 1 + * item 2 + + text before 2 empty lines + \\ +
    + * item 1 + * item 2 + MD + end + end + + context "whitespace" do + it "doesn't strip whitespaces from inline tags" do + xml = <<~XML + Lorem[b] ipsum [/b]dolor
    + [i] sit [/i]amet,
    + consetetur[b] sadipscing [/b]
    + XML + + expect(convert(xml)).to eq(<<~MD.rstrip) + Lorem **ipsum** dolor + _sit_ amet, + consetetur **sadipscing** + MD + end + + it "preserves whitespace between tags" do + xml = "foo [b]bold[/b] [i]italic[/i] [u]underlined[/u] bar" + expect(convert(xml)).to eq("foo **bold** _italic_ [u]underlined[/u] bar") + end + end + + context "unknown element" do + it "converts an unknown element right below the root element" do + xml = '[unknown]foo[/unknown]' + expect(convert(xml)).to eq('foo') + end + + it "converts an unknown element inside a known element" do + xml = '[b][unknown]bar[/unknown][/b]' + expect(convert(xml)).to eq('**bar**') + end + end + + context "font size" do + it "converts sizes to either or " do + xml = <<~XML + [size=50]very small[/size]
    + [size=85]small[/size]
    + [size=150]large[/size]
    + [size=200]very large[/size]
    + XML + + expect(convert(xml)).to eq(<<~MD.rstrip) + very small + small + large + very large + MD + end + + it "ignores invalid sizes" do + xml = <<~XML + [size=-50]negative number[/size]
    + [size=0]zero[/size]
    + [size=300]too large[/size]
    + [size=abc]not a number[/size]
    + [size]no size[/size]
    + XML + + expect(convert(xml)).to eq(<<~MD.rstrip) + negative number + zero + too large + not a number + no size + MD + end + end +end