FIX: improvements for vanilla bulk import (#10212)

Adjustments to the base:
1. PG connection doesn't require host - it was broken on import droplet
2. Drop `topic_reply_count` - it was removed here - https://github.com/discourse/discourse/blob/master/db/post_migrate/20200513185052_drop_topic_reply_count.rb
3. Error with `backtrace.join("\n")` -> `e.backtrace.join("\n")`
4. Correctly link the user and avatar in the quote block

Adjustments to vanilla:
1. Top-level Vanilla categories are valid categories
2. Posts have a `format` column, which should be used to decide whether the body is HTML or Markdown
3. Remove non-UTF-8 characters
4. Remove unsupported HTML elements such as `font`, `span`, `sub`, and `u`
This commit is contained in:
Krzysztof Kotlarek 2020-07-14 15:58:27 +10:00 committed by GitHub
parent cc01297f1f
commit 93ff54e184
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 76 additions and 28 deletions

View File

@ -76,7 +76,7 @@ class BulkImport::Base
charset = ENV["DB_CHARSET"] || "utf8"
db = ActiveRecord::Base.connection_config
@encoder = PG::TextEncoder::CopyRow.new
@raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
@raw_connection = PG.connect(dbname: db[:database], port: db[:port])
@uploader = ImportScripts::Uploader.new
@html_entities = HTMLEntities.new
@encoding = CHARSET_MAP[charset]
@ -283,7 +283,7 @@ class BulkImport::Base
USER_STAT_COLUMNS ||= %i{
user_id topics_entered time_read days_visited posts_read_count
likes_given likes_received topic_reply_count new_since read_faq
likes_given likes_received new_since read_faq
first_post_created_at post_count topic_count bounce_score
reset_bounce_score_after
}
@ -441,14 +441,12 @@ class BulkImport::Base
def process_user_stat(user_stat)
user_stat[:user_id] = @users[user_stat[:imported_user_id].to_i]
user_stat[:topic_reply_count] = user_stat[:post_count] - user_stat[:topic_count]
user_stat[:topics_entered] ||= 0
user_stat[:time_read] ||= 0
user_stat[:days_visited] ||= 0
user_stat[:posts_read_count] ||= 0
user_stat[:likes_given] ||= 0
user_stat[:likes_received] ||= 0
user_stat[:topic_reply_count] ||= 0
user_stat[:new_since] ||= NOW
user_stat[:post_count] ||= 0
user_stat[:topic_count] ||= 0
@ -546,7 +544,8 @@ class BulkImport::Base
topic_tag
end
def process_raw(raw)
def process_raw(original_raw)
raw = original_raw.dup
# fix whitespaces
raw.gsub!(/(\\r)?\\n/, "\n")
raw.gsub!("\\t", "\t")
@ -699,7 +698,7 @@ class BulkImport::Base
rescue => e
puts "\n"
puts "ERROR: #{e.message}"
puts backtrace.join("\n")
puts e.backtrace.join("\n")
end
end
end
@ -782,17 +781,25 @@ class BulkImport::Base
quote.gsub!(/^(<br>\n?)+/, "")
quote.gsub!(/(<br>\n?)+$/, "")
user = User.find_by(username: username)
if post_id.present? && topic_id.present?
<<-HTML
<aside class="quote" data-post="#{post_id}" data-topic="#{topic_id}">
<div class="title">#{username}:</div>
<div class="title">
<div class="quote-controls"></div>
#{user ? user_avatar(user) : username}:
</div>
<blockquote>#{quote}</blockquote>
</aside>
HTML
else
<<-HTML
<aside class="quote">
<div class="title">#{username}:</div>
<aside class="quote no-group" data-username="#{username}">
<div class="title">
<div class="quote-controls"></div>
#{user ? user_avatar(user) : username}:
</div>
<blockquote>#{quote}</blockquote>
</aside>
HTML
@ -802,6 +809,11 @@ class BulkImport::Base
cooked.scrub.strip
end
# Builds the HTML shown in a quote title: a 20x20 avatar <img> followed by
# the username.
#
# user - object responding to #avatar_template and #username.
#
# The image URL is the user's avatar template with every "{size}"
# placeholder replaced by "45".
#
# Returns the HTML String.
def user_avatar(user)
  avatar_src = user.avatar_template.gsub("{size}", "45")
  %(<img alt="" width="20" height="20" src="#{avatar_src}" class="avatar"> #{user.username})
end
# Produces the "fancy" form of a title.
#
# title - the title text to convert.
#
# The title is HTML-escaped first, then passed through Redcarpet's
# SmartyPants renderer; the result is scrubbed and stripped of
# surrounding whitespace.
#
# Returns the converted String.
def pre_fancy(title)
  escaped = ERB::Util.html_escape(title)
  fancy = Redcarpet::Render::SmartyPants.render(escaped)
  fancy.scrub.strip
end

View File

@ -188,7 +188,7 @@ class BulkImport::Vanilla < BulkImport::Base
now = Time.zone.now
create_user_stats(users) do |row|
next unless @users[row['UserID'].to_s] # shouldn't need this but it can be NULL :<
next unless @users[row['UserID'].to_i] # shouldn't need this but it can be NULL :<
{
imported_id: row['UserID'],
@ -371,9 +371,8 @@ class BulkImport::Vanilla < BulkImport::Base
# Throw the -1 level categories away since they contain no topics.
# Use the next level as root categories.
root_category_ids = Set.new(categories.select { |c| c["ParentCategoryID"] == -1 }.map { |c| c["CategoryID"] })
top_level_categories = categories.select { |c| root_category_ids.include?(c["ParentCategoryID"]) }
top_level_categories = categories.select { |c| c["ParentCategoryID"].blank? || c['ParentCategoryID'] == -1 }
# Depth = 2
create_categories(top_level_categories) do |category|
@ -432,13 +431,13 @@ class BulkImport::Vanilla < BulkImport::Base
def import_topics
puts "", "Importing topics..."
topics_sql = "SELECT DiscussionID, CategoryID, Name, Body, DateInserted, InsertUserID, Announce
topics_sql = "SELECT DiscussionID, CategoryID, Name, Body, DateInserted, InsertUserID, Announce, Format
FROM #{TABLE_PREFIX}Discussion
WHERE DiscussionID > #{@last_imported_topic_id}
ORDER BY DiscussionID ASC"
create_topics(mysql_stream(topics_sql)) do |row|
{
data = {
imported_id: row["DiscussionID"],
title: normalize_text(row["Name"]),
category_id: category_id_from_imported_id(row["CategoryID"]) ||
@ -447,18 +446,20 @@ class BulkImport::Vanilla < BulkImport::Base
created_at: Time.zone.at(row['DateInserted']),
pinned_at: row['Announce'] == 0 ? nil : Time.zone.at(row['DateInserted'])
}
(data[:user_id].present? && data[:title].present?) ? data : false
end
puts "", "importing first posts..."
create_posts(mysql_stream(topics_sql)) do |row|
{
data = {
imported_id: "d-" + row['DiscussionID'].to_s,
topic_id: topic_id_from_imported_id(row["DiscussionID"]),
topic_id: topic_id_from_imported_id(row['DiscussionID']),
user_id: user_id_from_imported_id(row["InsertUserID"]),
created_at: Time.zone.at(row['DateInserted']),
raw: clean_up(row["Body"])
raw: clean_up(row['Body'], row['Format'])
}
data[:topic_id].present? ? data : false
end
puts '', 'converting deep categories to tags...'
@ -477,7 +478,7 @@ class BulkImport::Vanilla < BulkImport::Base
puts "", "Importing posts..."
posts = mysql_stream(
"SELECT CommentID, DiscussionID, Body, DateInserted, InsertUserID
"SELECT CommentID, DiscussionID, Body, DateInserted, InsertUserID, Format
FROM #{TABLE_PREFIX}Comment
WHERE CommentID > #{@last_imported_post_id}
ORDER BY CommentID ASC")
@ -489,9 +490,9 @@ class BulkImport::Vanilla < BulkImport::Base
{
imported_id: row['CommentID'],
topic_id: topic_id,
user_id: user_id_from_imported_id(row["InsertUserID"]),
user_id: user_id_from_imported_id(row['InsertUserID']),
created_at: Time.zone.at(row['DateInserted']),
raw: clean_up(row["Body"])
raw: clean_up(row['Body'], row['Format'])
}
end
end
@ -572,7 +573,7 @@ class BulkImport::Vanilla < BulkImport::Base
puts "", "importing private replies..."
private_posts_sql = "
SELECT ConversationID, MessageID, Body, InsertUserID, DateInserted
SELECT ConversationID, MessageID, Body, InsertUserID, DateInserted, Format
FROM GDN_ConversationMessage
WHERE ConversationID > #{@last_imported_private_topic_id - PRIVATE_OFFSET}
ORDER BY ConversationID ASC, MessageID ASC"
@ -585,7 +586,7 @@ class BulkImport::Vanilla < BulkImport::Base
topic_id: topic_id,
user_id: user_id_from_imported_id(row['InsertUserID']),
created_at: Time.zone.at(row['DateInserted']),
raw: clean_up(row['Body'])
raw: clean_up(row['Body'], row['Format'])
}
end
end
@ -650,13 +651,48 @@ class BulkImport::Vanilla < BulkImport::Base
end
end
def clean_up(raw)
# post id is sometimes prefixed with "c-"
raw.gsub!(/\[QUOTE="([^;]+);c-(\d+)"\]/i) { "[QUOTE=#{$1};#{$2}]" }
raw = raw.delete("\u0000")
raw = process_raw_text(raw)
# Normalizes an imported post body and hands it to process_raw_text.
#
# raw    - the post body String.
# format - the value of the row's Format column. 'Html' bodies are treated
#          as HTML; any other value is first rendered from Markdown to HTML.
#
# Returns whatever process_raw_text returns for the cleaned-up body. The
# string passed in as raw is not modified.
def clean_up(raw, format)
  # Work on a copy so the caller's string is never mutated and frozen
  # strings are accepted.
  text = raw.dup

  # Drop byte sequences that are not valid UTF-8.
  text.encode!("utf-8", "utf-8", invalid: :replace, undef: :replace, replace: "")

  # An element containing only &nbsp; becomes a blank line.
  text.gsub!(/<(.+)>&nbsp;<\/\1>/, "\n\n")

  html =
    if format == 'Html'
      text
    else
      renderer = Redcarpet::Markdown.new(Redcarpet::Render::HTML, autolink: true, tables: true)
      renderer.render(text)
    end

  doc = Nokogiri::HTML5.fragment(html)

  # Rewrite <blockquote rel="..."> as a [QUOTE] tag, using the username of
  # the user whose name matches the rel attribute when one exists.
  doc.css("blockquote").each do |bq|
    name = bq["rel"]
    user = User.find_by(name: name)
    bq.replace %{<br>[QUOTE="#{user&.username || name}"]\n#{bq.inner_html}\n[/QUOTE]<br>}
  end

  # Unwrap these elements, keeping their contents. Each tag gets its own
  # pass, walked in reverse so nested elements are replaced before the
  # element that contains them.
  %w[font span sub u].each do |tag|
    doc.css(tag).reverse.each do |node|
      node.replace node.inner_html
    end
  end

  markdown = format == 'Html' ? ReverseMarkdown.convert(doc.to_html) : doc.to_html

  # post id is sometimes prefixed with "c-"
  markdown.gsub!(/\[QUOTE="([^;]+);c-(\d+)"\]/i) { "[QUOTE=#{$1};#{$2}]" }

  process_raw_text(markdown)
end
def process_raw_text(raw)