From bd7cbcd8f8dbff84efab67bef196a05f41c9feda Mon Sep 17 00:00:00 2001 From: Arpit Jalan <arpit@techapj.com> Date: Wed, 13 Jan 2021 23:10:00 +0530 Subject: [PATCH] Improve Vanilla import script. (#11701) - import groups and group users - import uploads/attachments - improved code tag parsing - improved text formatting - mark topics as solved --- script/import_scripts/vanilla_body_parser.rb | 43 +++-- script/import_scripts/vanilla_mysql.rb | 150 +++++++++++++++++- .../vanilla_body_parser_spec.rb | 4 +- 3 files changed, 182 insertions(+), 15 deletions(-) diff --git a/script/import_scripts/vanilla_body_parser.rb b/script/import_scripts/vanilla_body_parser.rb index af9d2c4da1e..ba4608e3ff8 100644 --- a/script/import_scripts/vanilla_body_parser.rb +++ b/script/import_scripts/vanilla_body_parser.rb @@ -24,6 +24,17 @@ class VanillaBodyParser private def clean_up(text) + # <pre class="CodeBlock">...</pre> + text = text.gsub(/\<pre class="CodeBlock"\>(.*?)\<\/pre\>/im) { "\n```\n#{$1}\n```\n" } + # <pre>...</pre> + text = text.gsub(/\<pre\>(.*?)\<\/pre\>/im) { "\n```\n#{$1}\n```\n" } + # <code></code> + text = text.gsub("\<code\>\</code\>", "").gsub(/\<code\>(.*?)\<\/code\>/im) { "#{$1}" } + # <div class="Quote">...</div> + text = text.gsub(/\<div class="Quote"\>(.*?)\<\/div\>/im) { "\n[quote]\n#{$1}\n[/quote]\n" } + # [code], [quote] + text = text.gsub(/\[\/?code\]/i, "\n```\n").gsub(/\[quote.*?\]/i, "\n" + '\0' + "\n").gsub(/\[\/quote\]/i, "\n" + '\0' + "\n") + text.gsub(/<\/?font[^>]*>/, '').gsub(/<\/?span[^>]*>/, '').gsub(/<\/?div[^>]*>/, '').gsub(/^ +/, '').gsub(/ +/, ' ') end @@ -58,7 +69,7 @@ class VanillaBodyParser return parse_quote(insert) if quoting embed = embed_type.in? ['image', 'link', 'file'] - parse_embed(insert) if embed + parse_embed(insert, embed_type) if embed end def parse_mention(mention) @@ -87,9 +98,6 @@ class VanillaBodyParser # In the Quill format used by Vanilla Forums, a line is rendered as `code` # when it's followed by a fragment with attributes: {'code-block': true}. - # So we open our ``` block when the next fragment has a 'code-block' - # attribute and the previous one didn't and we close the ``` block when - # the second next fragment does not contain the 'code-block' attribute def parse_code(text, fragment, index) next_fragment = next_fragment(index) @@ -98,18 +106,27 @@ class VanillaBodyParser previous_fragment = previous_fragment(index) previous_code = previous_fragment.dig(:attributes, :'code-block') - # if next is code and previous is not, prepend ``` - text = "\n```#{text}" unless previous_code + if previous_code + text = text.gsub(/\\n(.*?)\\n/) { "\n```\n#{$1}\n```\n" } + else + last_pos = text.rindex(/\n/) + + if last_pos + array = [text[0..last_pos].strip, text[last_pos + 1 .. text.length].strip] + text = array.join("\n```\n") + else + text = "\n```\n#{text}" + end + end end current_code = fragment.dig(:attributes, :'code-block') - if current_code second_next_fragment = second_next_fragment(index) second_next_code = second_next_fragment.dig(:attributes, :'code-block') # if current is code and 2 after is not, prepend ``` - text = "\n```#{text}" unless second_next_code + text = "\n```\n#{text}" unless second_next_code end text @@ -174,7 +191,7 @@ class VanillaBodyParser "[quote#{quote_info}]\n#{embed[:body]}\n[/quote]\n\n""" end - def parse_embed(insert) + def parse_embed(insert, embed_type) embed = insert.dig(:'embed-external', :data) url = embed[:url] @@ -193,7 +210,13 @@ class VanillaBodyParser end end - "\n[#{embed[:name]}](#{url})\n" + if embed_type == "link" + "\n[#{embed[:name]}](#{url})\n" + elsif embed_type == "image" + "\n<img src=\"#{url}\" alt=\"#{embed[:name]}\">\n" + else + "\n<a href=\"#{url}\">#{embed[:name]}</a>\n" + end end def normalize(full_text) diff --git a/script/import_scripts/vanilla_mysql.rb b/script/import_scripts/vanilla_mysql.rb index f2b4e6ef583..1a290ecd13b 100644 --- a/script/import_scripts/vanilla_mysql.rb +++ b/script/import_scripts/vanilla_mysql.rb @@ -45,16 +45,37 @@ class ImportScripts::VanillaSQL < ImportScripts::Base SiteSetting.max_tags_per_topic = 10 end + import_groups import_users import_avatars + import_group_users import_categories import_topics import_posts import_messages update_tl0 + mark_topics_as_solved create_permalinks + import_attachments + end + + def import_groups + puts "", "importing groups..." + + groups = mysql_query <<-SQL + SELECT RoleID, Name + FROM #{TABLE_PREFIX}Role + ORDER BY RoleID + SQL + + create_groups(groups) do |group| + { + id: group["RoleID"], + name: @htmlentities.decode(group["Name"]).strip + } + end end def import_users @@ -147,7 +168,7 @@ class ImportScripts::VanillaSQL < ImportScripts::Base photo_real_filename = nil parts = photo.squeeze("/").split("/") - if parts[0] == "cf:" + if parts[0] =~ /^[a-z0-9]{2}:/ photo_path = "#{ATTACHMENTS_BASE_DIR}/#{parts[2..-2].join('/')}".squeeze("/") elsif parts[0] == "~cf" photo_path = "#{ATTACHMENTS_BASE_DIR}/#{parts[1..-2].join('/')}".squeeze("/") @@ -200,6 +221,24 @@ class ImportScripts::VanillaSQL < ImportScripts::Base nil end + def import_group_users + puts "", "importing group users..." + + group_users = mysql_query(" + SELECT RoleID, UserID + FROM #{TABLE_PREFIX}UserRole + ").to_a + + group_users.each do |row| + user_id = user_id_from_imported_user_id(row["UserID"]) + group_id = group_id_from_imported_group_id(row["RoleID"]) + + if user_id && group_id + GroupUser.find_or_create_by(user_id: user_id, group_id: group_id) + end + end + end + def import_categories puts "", "importing categories..." @@ -272,7 +311,7 @@ class ImportScripts::VanillaSQL < ImportScripts::Base batches(BATCH_SIZE) do |offset| comments = mysql_query( "SELECT CommentID, DiscussionID, Body, Format, - DateInserted, InsertUserID + DateInserted, InsertUserID, QnA FROM #{TABLE_PREFIX}Comment WHERE CommentID > #{@last_post_id} ORDER BY CommentID ASC @@ -286,13 +325,20 @@ class ImportScripts::VanillaSQL < ImportScripts::Base next unless t = topic_lookup_from_imported_post_id("discussion#" + comment['DiscussionID'].to_s) next if comment['Body'].blank? user_id = user_id_from_imported_user_id(comment['InsertUserID']) || Discourse::SYSTEM_USER_ID - { + + mapped = { id: "comment#" + comment['CommentID'].to_s, user_id: user_id, topic_id: t[:topic_id], raw: VanillaBodyParser.new(comment, user_id).parse, created_at: Time.zone.at(comment['DateInserted']) } + + if comment['QnA'] == "Accepted" + mapped[:custom_fields] = { is_accepted_answer: "true" } + end + + mapped end end end @@ -395,6 +441,104 @@ class ImportScripts::VanillaSQL < ImportScripts::Base end end + def import_attachments + if ATTACHMENTS_BASE_DIR && File.exists?(ATTACHMENTS_BASE_DIR) + puts "", "importing attachments" + + start = Time.now + count = 0 + + # https://us.v-cdn.net/1234567/uploads/editor/xyz/image.jpg + cdn_regex = /https:\/\/us.v-cdn.net\/1234567\/uploads\/(\S+\/(\w|-)+.\w+)/i + # [attachment=10109:Screen Shot 2012-04-01 at 3.47.35 AM.png] + attachment_regex = /\[attachment=(\d+):(.*?)\]/i + + Post.where("raw LIKE '%/us.v-cdn.net/%' OR raw LIKE '%[attachment%'").find_each do |post| + count += 1 + print "\r%7d - %6d/sec" % [count, count.to_f / (Time.now - start)] + new_raw = post.raw.dup + + new_raw.gsub!(attachment_regex) do |s| + matches = attachment_regex.match(s) + attachment_id = matches[1] + file_name = matches[2] + next unless attachment_id + + r = mysql_query("SELECT Path, Name FROM #{TABLE_PREFIX}Media WHERE MediaID = #{attachment_id};").first + next if r.nil? + path = r["Path"] + name = r["Name"] + next unless path.present? + + path.gsub!("s3://content/", "") + path.gsub!("s3://uploads/", "") + file_path = "#{ATTACHMENTS_BASE_DIR}/#{path}" + + if File.exists?(file_path) + upload = create_upload(post.user.id, file_path, File.basename(file_path)) + if upload && upload.errors.empty? + # upload.url + filename = name || file_name || File.basename(file_path) + html_for_upload(upload, normalize_text(filename)) + else + puts "Error: Upload did not persist for #{post.id} #{attachment_id}!" + end + else + puts "Couldn't find file for #{attachment_id}. Skipping." + next + end + end + + new_raw.gsub!(cdn_regex) do |s| + matches = cdn_regex.match(s) + attachment_id = matches[1] + + file_path = "#{ATTACHMENTS_BASE_DIR}/#{attachment_id}" + + if File.exists?(file_path) + upload = create_upload(post.user.id, file_path, File.basename(file_path)) + if upload && upload.errors.empty? + upload.url + else + puts "Error: Upload did not persist for #{post.id} #{attachment_id}!" + end + else + puts "Couldn't find file for #{attachment_id}. Skipping." + next + end + end + + if new_raw != post.raw + begin + PostRevisor.new(post).revise!(post.user, { raw: new_raw }, skip_revision: true, skip_validations: true, bypass_bump: true) + rescue + puts "PostRevisor error for #{post.id}" + post.raw = new_raw + post.save(validate: false) + end + end + end + end + end + + def mark_topics_as_solved + puts "", "Marking topics as solved..." + + DB.exec <<~SQL + INSERT INTO topic_custom_fields (name, value, topic_id, created_at, updated_at) + SELECT 'accepted_answer_post_id', pcf.post_id, p.topic_id, p.created_at, p.created_at + FROM post_custom_fields pcf + JOIN posts p ON p.id = pcf.post_id + WHERE pcf.name = 'is_accepted_answer' AND pcf.value = 'true' + AND NOT EXISTS ( + SELECT 1 + FROM topic_custom_fields x + WHERE x.topic_id = p.topic_id AND x.name = 'accepted_answer_post_id' + ) + ON CONFLICT DO NOTHING + SQL + end + end ImportScripts::VanillaSQL.new.perform diff --git a/spec/script/import_scripts/vanilla_body_parser_spec.rb b/spec/script/import_scripts/vanilla_body_parser_spec.rb index 771abab86d5..f01ce76c6be 100644 --- a/spec/script/import_scripts/vanilla_body_parser_spec.rb +++ b/spec/script/import_scripts/vanilla_body_parser_spec.rb @@ -90,7 +90,7 @@ this starts with spaces but IS NOT a quote''' it 'keeps uploaded files as links' do parsed = VanillaBodyParser.new({ 'Format' => 'Rich', 'Body' => rich_bodies[:upload_file].to_json }, user_id).parse - expect(parsed).to eq "This is a PDF I've uploaded:\n\n[original_name_of_file.pdf](https:\/\/vanilla.sampleforum.org\/uploads\/393\/5QR3BX57K7HM.pdf)" + expect(parsed).to eq "This is a PDF I've uploaded:\n\n<a href=\"https://vanilla.sampleforum.org/uploads/393/5QR3BX57K7HM.pdf\">original_name_of_file.pdf</a>" end it 'supports complex formatting' do @@ -100,7 +100,7 @@ this starts with spaces but IS NOT a quote''' it 'support code blocks' do parsed = VanillaBodyParser.new({ 'Format' => 'Rich', 'Body' => rich_bodies[:code_block].to_json }, user_id).parse - expect(parsed).to eq "Here's a monospaced block:\n\n```this line should be monospaced\nthis one too, with extra spaces#{' ' * 4}\n```\n\nbut not this one" + expect(parsed).to eq "Here's a monospaced block:\n\n```\nthis line should be monospaced\nthis one too, with extra spaces#{' ' * 4}\n```\n\nbut not this one" end end end