discourse/script/import_scripts/zendesk_api.rb
Leonardo Mosquera c83914e2e5
FIX: fix normalize_raw method for nil inputs in migration scripts (#22304)
Various migration scripts define a normalize_raw method that does custom processing of post contents before they are stored in Post.raw and other fields.

These methods normally do not handle nil inputs, even though nil values are a relatively common occurrence in data dumps.

Since this method is called from several points in a migration script, the current experience is that the script fails repeatedly at different points, forcing you to fix the data or apply logic hacks each time and then restart.

This PR generalizes the handling of nil input by returning a "<missing>" string instead (sketched below).

Pros:

    no more messy repeated crashes + restarts
    consistency

Cons:

    it might hide data issues
        OTOH we can't print a warning in that method, because it is called from inside loops and would flood the console.

* FIX: zendesk import script: support nil inputs in normalize_raw
* FIX: return '<missing>' instead of empty string; do it for all methods
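
For reference, this is the shape of the guard described above, as a minimal sketch (the method name, arguments, and placeholder string match the script below; the rest of the body is elided):

    def normalize_raw(raw, user_id)
      # nil/blank post bodies are common in data dumps; substitute a placeholder
      # rather than crashing partway through the import
      return "<missing>" if raw.blank?
      # ... the script's usual processing of the contents follows ...
    end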
2023-06-29 13:22:47 -03:00


# frozen_string_literal: true

# Zendesk importer
#
# This one uses their API.

require "open-uri"
require "reverse_markdown"
require_relative "base"
require_relative "base/generic_database"

# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN
class ImportScripts::ZendeskApi < ImportScripts::Base
  BATCH_SIZE = 1000
  HTTP_ERRORS = [
    EOFError,
    Errno::ECONNRESET,
    Errno::EINVAL,
    Net::HTTPBadResponse,
    Net::HTTPHeaderSyntaxError,
    Net::ProtocolError,
    Timeout::Error,
    OpenURI::HTTPError,
    OpenSSL::SSL::SSLError,
  ]
  MAX_RETRIES = 5
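
  # Placeholders to fill in before running: IMAGE_DOWNLOAD_PATH is a scratch file path that
  # downloaded images are written to, and SUBDOMAIN is the Zendesk subdomain used to match
  # hosted image URLs.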
  IMAGE_DOWNLOAD_PATH = "replace-me"
  SUBDOMAIN = "replace-me"

  def initialize(source_url, path, auth_email, auth_token)
    super()
    @source_url = source_url
    @path = path
    @auth_email = auth_email
    @auth_token = auth_token
    @db = ImportScripts::GenericDatabase.new(@path, batch_size: BATCH_SIZE, recreate: true)
  end
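
  # Two-phase import: fetch everything from the Zendesk API into the local intermediate
  # database first, then create the Discourse records from that data.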
  def execute
    fetch_from_api
    import_categories
    import_users
    import_topics
    import_posts
    import_likes
  end

  def fetch_from_api
    fetch_categories
    fetch_topics
    fetch_posts
    fetch_users
    @db.sort_posts_by_created_at
  end

  def fetch_categories
    puts "", "fetching categories..."
    get_from_api("/api/v2/community/topics.json", "topics", show_status: true) do |row|
      @db.insert_category(
        id: row["id"],
        name: row["name"],
        description: row["description"],
        position: row["position"],
        url: row["html_url"],
      )
    end
  end

  def fetch_topics
    puts "", "fetching topics..."
    get_from_api("/api/v2/community/posts.json", "posts", show_status: true) do |row|
      if row["vote_count"] > 0
        like_user_ids = fetch_likes("/api/v2/community/posts/#{row["id"]}/votes.json")
      end
      @db.insert_topic(
        id: row["id"],
        title: row["title"],
        raw: row["details"],
        category_id: row["topic_id"],
        closed: row["closed"],
        user_id: row["author_id"],
        created_at: row["created_at"],
        url: row["html_url"],
        like_user_ids: like_user_ids,
      )
    end
  end

  def fetch_posts
    puts "", "fetching posts..."
    current_count = 0
    total_count = @db.count_topics
    start_time = Time.now
    last_id = ""
    batches do |offset|
      rows, last_id = @db.fetch_topics(last_id)
      break if rows.empty?
      rows.each do |topic_row|
        get_from_api(
          "/api/v2/community/posts/#{topic_row["id"]}/comments.json",
          "comments",
        ) do |row|
          if row["vote_count"] > 0
            like_user_ids =
              fetch_likes(
                "/api/v2/community/posts/#{topic_row["id"]}/comments/#{row["id"]}/votes.json",
              )
          end
          @db.insert_post(
            id: row["id"],
            raw: row["body"],
            topic_id: topic_row["id"],
            user_id: row["author_id"],
            created_at: row["created_at"],
            url: row["html_url"],
            like_user_ids: like_user_ids,
          )
        end
        current_count += 1
        print_status(current_count, total_count, start_time)
      end
    end
  end

  def fetch_users
    puts "", "fetching users..."
    user_ids = @db.execute_sql(<<~SQL).map { |row| row["user_id"] }
      SELECT user_id FROM topic
      UNION
      SELECT user_id FROM post
      UNION
      SELECT user_id FROM like
    SQL
    current_count = 0
    total_count = user_ids.size
    start_time = Time.now
    while !user_ids.empty?
      get_from_api(
        "/api/v2/users/show_many.json?ids=#{user_ids.shift(50).join(",")}",
        "users",
      ) do |row|
        @db.insert_user(
          id: row["id"],
          email: row["email"],
          name: row["name"],
          created_at: row["created_at"],
          last_seen_at: row["last_login_at"],
          active: row["active"],
          avatar_path: row["photo"].present? ? row["photo"]["content_url"] : nil,
        )
        current_count += 1
        print_status(current_count, total_count, start_time)
      end
    end
  end

  def fetch_likes(url)
    user_ids = []
    get_from_api(url, "votes") do |row|
      user_ids << row["user_id"] if row["id"].present? && row["value"] == 1
    end
    user_ids
  end

  def import_categories
    puts "", "creating categories"
    rows = @db.fetch_categories
    create_categories(rows) do |row|
      {
        id: row["id"],
        name: row["name"],
        description: row["description"],
        position: row["position"],
        post_create_action:
          proc do |category|
            url = remove_domain(row["url"])
            Permalink.create(url: url, category_id: category.id) unless permalink_exists?(url)
          end,
      }
    end
  end

  def import_users
    puts "", "creating users"
    total_count = @db.count_users
    last_id = ""
    batches do |offset|
      rows, last_id = @db.fetch_users(last_id)
      break if rows.empty?
      next if all_records_exist?(:users, rows.map { |row| row["id"] })
      create_users(rows, total: total_count, offset: offset) do |row|
        {
          id: row["id"],
          email: row["email"],
          name: row["name"],
          created_at: row["created_at"],
          last_seen_at: row["last_seen_at"],
          active: row["active"] == 1,
          post_create_action:
            proc do |user|
              if row["avatar_path"].present?
                begin
                  UserAvatar.import_url_for_user(row["avatar_path"], user)
                rescue StandardError
                  nil
                end
              end
            end,
        }
      end
    end
  end

  def import_topics
    puts "", "creating topics"
    total_count = @db.count_topics
    last_id = ""
    batches do |offset|
      rows, last_id = @db.fetch_topics(last_id)
      break if rows.empty?
      next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row["id"]) })
      create_posts(rows, total: total_count, offset: offset) do |row|
        {
          id: import_topic_id(row["id"]),
          title: row["title"].present? ? row["title"].strip[0...255] : "Topic title missing",
          raw:
            normalize_raw(
              row["raw"],
              user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
            ),
          category: category_id_from_imported_category_id(row["category_id"]),
          user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
          created_at: row["created_at"],
          closed: row["closed"] == 1,
          post_create_action:
            proc do |post|
              url = remove_domain(row["url"])
              Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url)
            end,
        }
      end
    end
  end
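
  # Topics and comments both become Discourse posts, so topic import ids are prefixed
  # with "T" to keep them from colliding with comment ids.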
  def import_topic_id(topic_id)
    "T#{topic_id}"
  end

  def import_posts
    puts "", "creating posts"
    total_count = @db.count_posts
    last_row_id = 0
    batches do |offset|
      rows, last_row_id = @db.fetch_sorted_posts(last_row_id)
      break if rows.empty?
      create_posts(rows, total: total_count, offset: offset) do |row|
        topic = topic_lookup_from_imported_post_id(import_topic_id(row["topic_id"]))
        if topic.nil?
          p "MISSING TOPIC #{row["topic_id"]}"
          p row
          next
        end
        {
          id: row["id"],
          raw:
            normalize_raw(
              row["raw"],
              user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
            ),
          user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
          topic_id: topic[:topic_id],
          created_at: row["created_at"],
          post_create_action:
            proc do |post|
              url = remove_domain(row["url"])
              Permalink.create(url: url, post_id: post.id) unless permalink_exists?(url)
            end,
        }
      end
    end
  end

  def import_likes
    puts "", "importing likes..."
    start_time = Time.now
    current_count = 0
    total_count = @db.count_likes
    last_row_id = 0
    batches do |offset|
      rows, last_row_id = @db.fetch_likes(last_row_id)
      break if rows.empty?
      rows.each do |row|
        import_id = row["topic_id"] ? import_topic_id(row["topic_id"]) : row["post_id"]
        post = Post.find_by(id: post_id_from_imported_post_id(import_id)) if import_id
        user = User.find_by(id: user_id_from_imported_user_id(row["user_id"]))
        if post && user
          begin
            PostActionCreator.like(user, post) if user && post
          rescue => e
            puts "error acting on post #{e}"
          end
        else
          puts "Skipping Like from #{row["user_id"]} on topic #{row["topic_id"]} / post #{row["post_id"]}"
        end
        current_count += 1
        print_status(current_count, total_count, start_time)
      end
    end
  end
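
  # Converts a Zendesk HTML body to Markdown and re-uploads inline hosted images.
  # Blank or nil input is replaced with a "<missing>" placeholder so a single bad row
  # does not abort the whole import (see the commit message above).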
  def normalize_raw(raw, user_id)
    return "<missing>" if raw.blank?
    raw = raw.gsub('\n', "")
    raw = ReverseMarkdown.convert(raw)
    # Process images. After the ReverseMarkdown conversion they look like:
    # ![](https://<sub-domain>.zendesk.com/<hash>.<image-format>)
    raw.gsub!(%r{!\[\]\((https://#{SUBDOMAIN}\.zendesk\.com/hc/user_images/([^).]+\.[^)]+))\)}i) do
      image_url = $1
      filename = $2
      attempts = 0
      begin
        URI
          .parse(image_url)
          .open do |image|
            # IMAGE_DOWNLOAD_PATH is a scratch file; its contents are overwritten with each downloaded image
            File.open(IMAGE_DOWNLOAD_PATH, "wb") { |file| file.write(image.read) }
          end
      rescue *HTTP_ERRORS => e
        if attempts < MAX_RETRIES
          attempts += 1
          sleep(2)
          retry
        else
          puts "Error downloading image"
        end
        next
      end
      upl_obj = create_upload(user_id, IMAGE_DOWNLOAD_PATH, filename)
      if upl_obj&.persisted?
        html = html_for_upload(upl_obj, filename)
        html
      else
        puts "Error creating image upload"
        "![](#{$1})"
        exit
      end
    end
    raw
  end

  def remove_domain(url)
    url.sub(@source_url, "")
  end

  def permalink_exists?(url)
    Permalink.find_by(url: url)
  end

  def connection
    @_connection ||=
      begin
        connect_uri = URI.parse(@source_url)
        http = Net::HTTP.new(connect_uri.host, connect_uri.port)
        http.open_timeout = 30
        http.read_timeout = 30
        http.use_ssl = connect_uri.scheme == "https"
        http
      end
  end

  def authorization
    @_authorization ||=
      begin
        auth_str = "#{@auth_email}/token:#{@auth_token}"
        "Basic #{Base64.strict_encode64(auth_str)}"
      end
  end
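
  # Fetches a paginated collection from the Zendesk API, yielding each row of the named
  # array and following "next_page" links until none remain. Each request is attempted
  # up to 5 times, sleeping 20 seconds between attempts.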
  def get_from_api(path, array_name, show_status: false)
    url = "#{@source_url}#{path}"
    start_time = Time.now
    while url
      get = Net::HTTP::Get.new(url)
      get["User-Agent"] = "Discourse Zendesk Importer"
      get["Authorization"] = authorization
      retry_count = 0
      begin
        while retry_count < 5
          begin
            response = connection.request(get)
            puts("Retry successful!") if retry_count > 0
            break
          rescue => e
            puts "Request failed #{url}. Waiting and will retry. #{e.class.name} #{e.message}"
            sleep(20)
            retry_count += 1
          end
        end
      end
      json = JSON.parse(response.body)
      json[array_name].each { |row| yield row }
      url = json["next_page"]
      if show_status
        if json["page"] && json["page_count"]
          print_status(json["page"], json["page_count"], start_time)
        else
          print "."
        end
      end
    end
  end
end

unless ARGV.length == 4 && Dir.exist?(ARGV[1])
  puts "",
       "Usage:",
       "",
       "bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN",
       ""
  exit 1
end

ImportScripts::ZendeskApi.new(ARGV[0], ARGV[1], ARGV[2], ARGV[3]).perform