mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 09:42:07 +08:00
c83914e2e5
Various migration scripts define a normalize_raw method to do custom processing of post contents before storing them in Post.raw and other fields. These methods normally do not handle nil inputs, even though nil values are a relatively common occurrence in data dumps. Since the method is called from several points in a migration script, the current experience of running one is that it fails multiple times at different points, forcing you to fix the data or apply logic hacks and then restart each time.
This PR generalizes the handling of nil input by returning a <missing> string.
Pros:
- no more messy repeated crashes + restarts
- consistency
Cons:
- it might hide data issues
OTOH we can't print a warning in that method, because it would flood the console: it is called from inside loops.
* FIX: zendesk import script: support nil inputs in normalize_raw
* FIX: return '<missing>' instead of empty string; do it for all methods
475 lines
12 KiB
Ruby
475 lines
12 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# Zendesk importer
|
|
#
|
|
# This one uses their API.
|
|
|
|
require "open-uri"
|
|
require "reverse_markdown"
|
|
require_relative "base"
|
|
require_relative "base/generic_database"
|
|
|
|
# Call it like this:
|
|
# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN
|
|
class ImportScripts::ZendeskApi < ImportScripts::Base
|
|
# Batch size handed to the intermediate GenericDatabase.
BATCH_SIZE = 1000

# Errors treated as transient while downloading an image; they are rescued
# and retried in normalize_raw. Frozen so the list cannot be mutated at runtime.
HTTP_ERRORS = [
  EOFError,
  Errno::ECONNRESET,
  Errno::EINVAL,
  Net::HTTPBadResponse,
  Net::HTTPHeaderSyntaxError,
  Net::ProtocolError,
  Timeout::Error,
  OpenURI::HTTPError,
  OpenSSL::SSL::SSLError,
].freeze

# Maximum number of retries after a transient failure.
MAX_RETRIES = 5

# Scratch file each downloaded image is written to before it is uploaded.
# Placeholder value: must be edited before running the script.
IMAGE_DOWNLOAD_PATH = "replace-me"

# Zendesk subdomain used to recognise help-center image URLs.
# Placeholder value: must be edited before running the script.
SUBDOMAIN = "replace-me"
|
|
|
|
# Stores the API coordinates and opens the intermediate import database.
#
# source_url - base URL of the Zendesk site; API paths are appended to it.
# path       - directory handed to GenericDatabase.
# auth_email - email used to build the Basic auth credentials.
# auth_token - API token used to build the Basic auth credentials.
def initialize(source_url, path, auth_email, auth_token)
  super()

  @source_url, @path = source_url, path
  @auth_email, @auth_token = auth_email, auth_token

  # recreate: true is passed through to GenericDatabase unchanged.
  @db = ImportScripts::GenericDatabase.new(path, batch_size: BATCH_SIZE, recreate: true)
end
|
|
|
|
# Runs the import: first downloads everything from the Zendesk API into the
# intermediate database, then creates the Discourse records from it.
#
# The steps run in dependency order: topics look up their category and
# author, posts look up their topic, likes look up their post and user.
# Returns whatever the final step (import_likes) returns.
def execute
  fetch_from_api

  steps = %i[import_categories import_users import_topics import_posts import_likes]
  steps.map { |step| send(step) }.last
end
|
|
|
|
# Downloads categories, topics, posts and users (in that order) into the
# intermediate database and finally asks it to sort the posts by creation
# date. Users come last because fetch_users collects the ids to request from
# the topic, post and like tables.
def fetch_from_api
  %i[fetch_categories fetch_topics fetch_posts fetch_users].each { |step| send(step) }

  @db.sort_posts_by_created_at
end
|
|
|
|
# Downloads the community "topics" from the API and stores each one as a
# category row in the intermediate database.
def fetch_categories
  puts "", "fetching categories..."

  get_from_api("/api/v2/community/topics.json", "topics", show_status: true) do |category|
    attributes = {
      id: category["id"],
      name: category["name"],
      description: category["description"],
      position: category["position"],
      url: category["html_url"],
    }

    @db.insert_category(**attributes)
  end
end
|
|
|
|
# Downloads the community "posts" from the API and stores each one as a topic
# row. When a post has votes, the voters are fetched as well; otherwise
# like_user_ids is stored as nil.
def fetch_topics
  puts "", "fetching topics..."

  get_from_api("/api/v2/community/posts.json", "posts", show_status: true) do |topic|
    voters =
      if topic["vote_count"] > 0
        fetch_likes("/api/v2/community/posts/#{topic["id"]}/votes.json")
      end

    attributes = {
      id: topic["id"],
      title: topic["title"],
      raw: topic["details"],
      category_id: topic["topic_id"],
      closed: topic["closed"],
      user_id: topic["author_id"],
      created_at: topic["created_at"],
      url: topic["html_url"],
      like_user_ids: voters,
    }

    @db.insert_topic(**attributes)
  end
end
|
|
|
|
# Downloads the comments of every stored topic and saves them as post rows.
# Topics are read back from the intermediate database batch by batch; progress
# is reported once per topic.
def fetch_posts
  puts "", "fetching posts..."
  processed_topics = 0
  topic_total = @db.count_topics
  started_at = Time.now
  last_id = ""

  batches do |_offset|
    topics, last_id = @db.fetch_topics(last_id)
    break if topics.empty?

    topics.each do |topic|
      topic_id = topic["id"]

      get_from_api("/api/v2/community/posts/#{topic_id}/comments.json", "comments") do |comment|
        # Voters are only requested for comments that actually have votes.
        voters =
          if comment["vote_count"] > 0
            fetch_likes("/api/v2/community/posts/#{topic_id}/comments/#{comment["id"]}/votes.json")
          end

        attributes = {
          id: comment["id"],
          raw: comment["body"],
          topic_id: topic_id,
          user_id: comment["author_id"],
          created_at: comment["created_at"],
          url: comment["html_url"],
          like_user_ids: voters,
        }

        @db.insert_post(**attributes)
      end

      processed_topics += 1
      print_status(processed_topics, topic_total, started_at)
    end
  end
end
|
|
|
|
# Downloads every user referenced by a stored topic, post or like and saves
# them as user rows. Users are requested from the API 50 ids at a time.
def fetch_users
  puts "", "fetching users..."

  referenced_users_sql = <<~SQL
    SELECT user_id FROM topic
    UNION
    SELECT user_id FROM post
    UNION
    SELECT user_id FROM like
  SQL
  user_ids = @db.execute_sql(referenced_users_sql).map { |row| row["user_id"] }

  fetched = 0
  user_total = user_ids.size
  started_at = Time.now

  user_ids.each_slice(50) do |id_chunk|
    get_from_api("/api/v2/users/show_many.json?ids=#{id_chunk.join(",")}", "users") do |user|
      photo = user["photo"]

      attributes = {
        id: user["id"],
        email: user["email"],
        name: user["name"],
        created_at: user["created_at"],
        last_seen_at: user["last_login_at"],
        active: user["active"],
        # Users without a photo get no avatar path.
        avatar_path: photo.present? ? photo["content_url"] : nil,
      }

      @db.insert_user(**attributes)

      fetched += 1
      print_status(fetched, user_total, started_at)
    end
  end
end
|
|
|
|
# Fetches the votes behind the given API path.
#
# url - API path of a votes endpoint (appended to the source URL by get_from_api).
#
# Returns the user ids of all votes that have an id and a value of 1.
def fetch_likes(url)
  voter_ids = []

  get_from_api(url, "votes") do |vote|
    next unless vote["id"].present? && vote["value"] == 1

    voter_ids.push(vote["user_id"])
  end

  voter_ids
end
|
|
|
|
# Creates a Discourse category for every stored category row and, once a
# category exists, a permalink from its old Zendesk path unless one is
# already present.
def import_categories
  puts "", "creating categories"

  create_categories(@db.fetch_categories) do |category_row|
    add_permalink =
      proc do |category|
        old_path = remove_domain(category_row["url"])
        Permalink.create(url: old_path, category_id: category.id) unless permalink_exists?(old_path)
      end

    {
      id: category_row["id"],
      name: category_row["name"],
      description: category_row["description"],
      position: category_row["position"],
      post_create_action: add_permalink,
    }
  end
end
|
|
|
|
# Creates Discourse users from the stored user rows, batch by batch. Batches
# whose users all exist already are skipped. After a user is created, the
# avatar is imported when the row has one; avatar failures are ignored.
def import_users
  puts "", "creating users"
  user_total = @db.count_users
  last_id = ""

  batches do |offset|
    users, last_id = @db.fetch_users(last_id)
    break if users.empty?

    imported_ids = users.map { |user| user["id"] }
    next if all_records_exist?(:users, imported_ids)

    create_users(users, total: user_total, offset: offset) do |user|
      import_avatar =
        proc do |created_user|
          avatar_url = user["avatar_path"]

          if avatar_url.present?
            begin
              UserAvatar.import_url_for_user(avatar_url, created_user)
            rescue StandardError
              # Best effort: a broken avatar must not abort the user import.
              nil
            end
          end
        end

      {
        id: user["id"],
        email: user["email"],
        name: user["name"],
        created_at: user["created_at"],
        last_seen_at: user["last_seen_at"],
        # The intermediate database returns 1 for an active user.
        active: user["active"] == 1,
        post_create_action: import_avatar,
      }
    end
  end
end
|
|
|
|
# Creates Discourse topics (their first posts) from the stored topic rows,
# batch by batch. Batches whose topics all exist already are skipped. Each
# created topic gets a permalink from its old Zendesk path unless one exists.
def import_topics
  puts "", "creating topics"
  topic_total = @db.count_topics
  last_id = ""

  batches do |offset|
    topics, last_id = @db.fetch_topics(last_id)
    break if topics.empty?

    import_ids = topics.map { |topic| import_topic_id(topic["id"]) }
    next if all_records_exist?(:posts, import_ids)

    create_posts(topics, total: topic_total, offset: offset) do |topic|
      # Titles are trimmed and capped at 255 characters; blank ones get a placeholder.
      title =
        if topic["title"].present?
          topic["title"].strip[0...255]
        else
          "Topic title missing"
        end

      add_permalink =
        proc do |post|
          old_path = remove_domain(topic["url"])
          Permalink.create(url: old_path, topic_id: post.topic.id) unless permalink_exists?(old_path)
        end

      {
        id: import_topic_id(topic["id"]),
        title: title,
        # Unknown authors fall back to the system user, both for uploads and ownership.
        raw:
          normalize_raw(
            topic["raw"],
            user_id_from_imported_user_id(topic["user_id"]) || Discourse.system_user.id,
          ),
        category: category_id_from_imported_category_id(topic["category_id"]),
        user_id: user_id_from_imported_user_id(topic["user_id"]) || Discourse.system_user.id,
        created_at: topic["created_at"],
        closed: topic["closed"] == 1,
        post_create_action: add_permalink,
      }
    end
  end
end
|
|
|
|
# Builds the import id under which a topic's first post is registered: the
# Zendesk topic id prefixed with "T".
def import_topic_id(topic_id)
  format("T%s", topic_id)
end
|
|
|
|
# Creates Discourse posts from the stored (sorted) post rows, batch by batch.
# Rows whose topic was not imported are printed and skipped. Each created post
# gets a permalink from its old Zendesk path unless one exists.
def import_posts
  puts "", "creating posts"
  post_total = @db.count_posts
  last_row_id = 0

  batches do |offset|
    posts, last_row_id = @db.fetch_sorted_posts(last_row_id)
    break if posts.empty?

    create_posts(posts, total: post_total, offset: offset) do |post_row|
      topic = topic_lookup_from_imported_post_id(import_topic_id(post_row["topic_id"]))

      if topic.nil?
        p "MISSING TOPIC #{post_row["topic_id"]}"
        p post_row
        next
      end

      add_permalink =
        proc do |post|
          old_path = remove_domain(post_row["url"])
          Permalink.create(url: old_path, post_id: post.id) unless permalink_exists?(old_path)
        end

      {
        id: post_row["id"],
        # Unknown authors fall back to the system user, both for uploads and ownership.
        raw:
          normalize_raw(
            post_row["raw"],
            user_id_from_imported_user_id(post_row["user_id"]) || Discourse.system_user.id,
          ),
        user_id: user_id_from_imported_user_id(post_row["user_id"]) || Discourse.system_user.id,
        topic_id: topic[:topic_id],
        created_at: post_row["created_at"],
        post_create_action: add_permalink,
      }
    end
  end
end
|
|
|
|
# Replays the stored likes, batch by batch. A like is applied when both its
# post (a topic's first post or a regular post) and its user were imported;
# otherwise it is reported and skipped. Errors raised while liking are printed
# and do not stop the import.
def import_likes
  puts "", "importing likes..."
  started_at = Time.now
  processed = 0
  like_total = @db.count_likes
  last_row_id = 0

  batches do |_offset|
    likes, last_row_id = @db.fetch_likes(last_row_id)
    break if likes.empty?

    likes.each do |like|
      # Likes on a topic point at the topic's first post, which is registered
      # under the "T"-prefixed import id.
      topic_id = like["topic_id"]
      import_id = topic_id ? import_topic_id(topic_id) : like["post_id"]

      post = import_id && Post.find_by(id: post_id_from_imported_post_id(import_id))
      user = User.find_by(id: user_id_from_imported_user_id(like["user_id"]))

      if post && user
        begin
          PostActionCreator.like(user, post)
        rescue => e
          puts "error acting on post #{e}"
        end
      else
        puts "Skipping Like from #{like["user_id"]} on topic #{like["topic_id"]} / post #{like["post_id"]}"
      end

      processed += 1
      print_status(processed, like_total, started_at)
    end
  end
end
|
|
|
|
# Converts a Zendesk HTML body into Markdown and re-hosts images served from
# the Zendesk help center as uploads.
#
# raw     - HTML body of the topic or post; may be nil or blank.
# user_id - id of the user who will own any uploads that get created.
#
# Returns the Markdown string, or "<missing>" when raw is blank. An image that
# cannot be downloaded or uploaded keeps its original Markdown link, so a
# single bad image neither loses content nor aborts the whole import.
def normalize_raw(raw, user_id)
  return "<missing>" if raw.blank?

  # NOTE(review): '\n' is single-quoted, so this removes the literal two
  # characters backslash + n, not real newlines; confirm that is intended.
  raw = raw.gsub('\n', "")
  raw = ReverseMarkdown.convert(raw)

  # Process images, after the ReverseMarkdown they look like
  # ![](https://<sub-domain>.zendesk.com/<hash>.<image-format>)
  # The subdomain is escaped so it is always matched literally.
  image_pattern =
    %r{!\[\]\((https://#{Regexp.escape(SUBDOMAIN)}\.zendesk\.com/hc/user_images/([^).]+\.[^)]+))\)}i

  raw.gsub!(image_pattern) do |original_markup|
    image_url = $1
    filename = $2
    attempts = 0

    begin
      URI
        .parse(image_url)
        .open do |image|
          # IMAGE_DOWNLOAD_PATH is a scratch file, overwritten by every downloaded image
          File.open(IMAGE_DOWNLOAD_PATH, "wb") { |file| file.write(image.read) }
        end
    rescue *HTTP_ERRORS
      if attempts < MAX_RETRIES
        attempts += 1
        sleep(2)
        retry
      end

      puts "Error downloading image"
      # Keep the original link instead of replacing the image with nothing.
      next original_markup
    end

    upload = create_upload(user_id, IMAGE_DOWNLOAD_PATH, filename)

    if upload&.persisted?
      html_for_upload(upload, filename)
    else
      puts "Error creating image upload"
      # Fall back to the original link; do not terminate the import.
      original_markup
    end
  end

  raw
end
|
|
|
|
# Removes the first occurrence of the source URL from the given URL, turning
# an absolute Zendesk URL into a path.
def remove_domain(url)
  domain = @source_url
  url.sub(domain) { "" }
end
|
|
|
|
# Returns true when a Permalink with the given url already exists, false
# otherwise. Uses an existence query, so no Permalink record is loaded.
def permalink_exists?(url)
  Permalink.exists?(url: url)
end
|
|
|
|
# Returns the memoized Net::HTTP client for the source URL. Open and read
# timeouts are 30 seconds; SSL is enabled when the source URL uses https.
def connection
  return @_connection if @_connection

  target = URI.parse(@source_url)

  @_connection =
    Net::HTTP
      .new(target.host, target.port)
      .tap do |client|
        client.open_timeout = 30
        client.read_timeout = 30
        client.use_ssl = (target.scheme == "https")
      end
end
|
|
|
|
# Returns the memoized value for the Authorization header: HTTP Basic auth
# built from "<email>/token:<token>".
def authorization
  return @_authorization if @_authorization

  credentials = [@auth_email, "token:#{@auth_token}"].join("/")
  @_authorization = "Basic " + Base64.strict_encode64(credentials)
end
|
|
|
|
# Requests a paginated API collection and yields every row of every page.
#
# path        - API path appended to the source URL for the first request.
# array_name  - key of the JSON array that holds the rows.
# show_status - when true, progress is printed after each page.
#
# Follows "next_page" until it is missing. A failing request is retried up to
# MAX_RETRIES times with a 20 second pause in between.
#
# Raises RuntimeError when a page still cannot be fetched after all retries.
def get_from_api(path, array_name, show_status: false)
  url = "#{@source_url}#{path}"
  start_time = Time.now

  while url
    get = Net::HTTP::Get.new(url)
    get["User-Agent"] = "Discourse Zendesk Importer"
    get["Authorization"] = authorization

    # Reset per page: a stale response from the previous page must never be
    # parsed again when the current request keeps failing.
    response = nil
    retry_count = 0

    while response.nil? && retry_count < MAX_RETRIES
      begin
        response = connection.request(get)
        puts("Retry successful!") if retry_count > 0
      rescue => e
        puts "Request failed #{url}. Waiting and will retry. #{e.class.name} #{e.message}"
        sleep(20)
        retry_count += 1
      end
    end

    raise "Request failed #{url} after #{MAX_RETRIES} attempts" if response.nil?

    json = JSON.parse(response.body)

    json[array_name].each { |row| yield row }

    url = json["next_page"]

    if show_status
      if json["page"] && json["page_count"]
        print_status(json["page"], json["page_count"], start_time)
      else
        print "."
      end
    end
  end
end
|
|
end
|
|
|
|
# Command-line entry point: expects exactly four arguments, the second of
# which must be an existing directory. Otherwise the usage is printed and the
# script exits with status 1.
valid_arguments = ARGV.length == 4 && Dir.exist?(ARGV[1])

unless valid_arguments
  usage_lines = [
    "",
    "Usage:",
    "",
    "bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN",
    "",
  ]
  puts usage_lines
  exit 1
end

# ARGV holds exactly SOURCE_URL, DIRNAME, AUTH_EMAIL and AUTH_TOKEN here.
ImportScripts::ZendeskApi.new(*ARGV).perform
|