discourse/script/import_scripts/zendesk_api.rb
Sam Saffron 30990006a9 DEV: enable frozen string literal on all files
This reduces chances of errors where consumers of strings mutate inputs
and reduces memory usage of the app.

Test suite passes now, but there may be some stuff left, so we will run
a few sites on a branch prior to merging
2019-05-13 09:31:32 +08:00

309 lines
8.0 KiB
Ruby

# frozen_string_literal: true
# Zendesk importer
#
# This one uses their API.
require 'reverse_markdown'
require_relative 'base'
require_relative 'base/generic_database'
# Call it like this:
# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN
class ImportScripts::ZendeskApi < ImportScripts::Base
BATCH_SIZE = 1000
# Prepares the importer and the intermediate database that stages API data.
#
# @param source_url [String] base URL of the Zendesk site; API paths are
#   appended to it and it is stripped from URLs when permalinks are created
# @param path [String] location handed to ImportScripts::GenericDatabase
# @param auth_email [String] e-mail part of the API credentials
# @param auth_token [String] token part of the API credentials
def initialize(source_url, path, auth_email, auth_token)
  super()

  @source_url, @path = source_url, path
  @auth_email, @auth_token = auth_email, auth_token

  # NOTE(review): recreate: true is passed on every run; presumably this
  # discards data staged by an earlier run — confirm in GenericDatabase.
  @db = ImportScripts::GenericDatabase.new(@path, batch_size: BATCH_SIZE, recreate: true)
end
# Runs the complete import, one stage after the other.
# NOTE(review): presumably invoked by ImportScripts::Base#perform, which is
# what the script calls at the bottom of this file — confirm in base.rb.
def execute
# Stage everything from the Zendesk API in the intermediate database (@db);
# the import_* steps below only read from that database.
fetch_from_api
# The order matters: topics look up imported categories and users
# (category_id_from_imported_category_id, user_id_from_imported_user_id),
# and posts look up imported users and topics.
import_categories
import_users
import_topics
import_posts
end
# Downloads everything the import needs from the Zendesk API and stores it in
# the intermediate database (@db).
#
# Runs four stages in order: categories, topics, posts (the comments of every
# stored topic) and users. Afterwards the database is asked to sort the stored
# posts by creation time.
def fetch_from_api
  fetch_categories_from_api
  fetch_topics_from_api
  fetch_posts_from_api
  fetch_users_from_api

  @db.sort_posts_by_created_at
end

# Stores every Zendesk community "topic" as a category row.
private def fetch_categories_from_api
  puts '', 'fetching categories...'

  get_from_api('/api/v2/community/topics.json', 'topics') do |row|
    @db.insert_category(
      id: row['id'],
      name: row['name'],
      description: row['description'],
      position: row['position'],
      url: row['html_url']
    )
  end
end

# Stores every Zendesk community "post" as a topic row.
private def fetch_topics_from_api
  puts '', 'fetching topics...'

  get_from_api('/api/v2/community/posts.json', 'posts') do |row|
    @db.insert_topic(
      id: row['id'],
      title: row['title'],
      raw: row['details'],
      category_id: row['topic_id'],
      closed: row['closed'],
      user_id: row['author_id'],
      created_at: row['created_at'],
      url: row['html_url']
    )
  end
end

# Walks through the stored topics in batches and stores the comments of each
# one as post rows.
private def fetch_posts_from_api
  puts '', 'fetching posts...'

  total_count = @db.count_topics
  start_time = Time.now
  last_id = ''

  batches do |offset|
    rows, last_id = @db.fetch_topics(last_id)
    break if rows.empty?

    print_status(offset, total_count, start_time)

    rows.each do |topic_row|
      comments_path = "/api/v2/community/posts/#{topic_row['id']}/comments.json"

      get_from_api(comments_path, 'comments', show_status: false) do |row|
        @db.insert_post(
          id: row['id'],
          raw: row['body'],
          topic_id: topic_row['id'],
          user_id: row['author_id'],
          created_at: row['created_at'],
          url: row['html_url']
        )
      end
    end
  end
end

# Stores the users that authored at least one stored topic or post.
private def fetch_users_from_api
  puts '', 'fetching users...'

  users_per_request = 50

  author_ids = ["SELECT user_id FROM topic", "SELECT user_id FROM post"].flat_map do |sql|
    @db.execute_sql(sql).map { |row| row['user_id'] }
  end

  # Rows without an author are dropped: a nil among the integers would make
  # the sort raise ArgumentError and would put an empty id into the request.
  user_ids = author_ids.compact.map(&:to_i).uniq.sort

  total_users = user_ids.size
  start_time = Time.now

  user_ids.each_slice(users_per_request).with_index do |ids, batch_index|
    # Progress is the number of ids requested before this batch.
    print_status(batch_index * users_per_request, total_users, start_time)

    get_from_api("/api/v2/users/show_many.json?ids=#{ids.join(',')}", 'users', show_status: false) do |row|
      @db.insert_user(
        id: row['id'],
        email: row['email'],
        name: row['name'],
        created_at: row['created_at'],
        last_seen_at: row['last_login_at'],
        active: row['active']
      )
    end
  end
end
# Creates a Discourse category for every stored category row. After a category
# has been created, a permalink for its old Zendesk URL is added unless one
# already exists.
def import_categories
  puts "", "creating categories"

  create_categories(@db.fetch_categories) do |category_row|
    add_permalink = proc do |category|
      old_path = remove_domain(category_row['url'])
      Permalink.create(url: old_path, category_id: category.id) unless permalink_exists?(old_path)
    end

    {
      id: category_row['id'],
      name: category_row['name'],
      description: category_row['description'],
      position: category_row['position'],
      post_create_action: add_permalink
    }
  end
end
# Creates Discourse users from the stored user rows, one batch at a time.
# A batch is skipped when all of its ids are already known as imported.
def import_users
  puts "", "creating users"

  user_count = @db.count_users
  cursor = ''

  batches do |offset|
    user_rows, cursor = @db.fetch_users(cursor)
    break if user_rows.empty?

    imported_ids = user_rows.map { |user_row| user_row['id'] }
    next if all_records_exist?(:users, imported_ids)

    create_users(user_rows, total: user_count, offset: offset) do |user_row|
      {
        id: user_row['id'],
        email: user_row['email'],
        name: user_row['name'],
        created_at: user_row['created_at'],
        last_seen_at: user_row['last_seen_at'],
        # The stored flag is compared with the integer 1 to get a boolean.
        active: user_row['active'] == 1
      }
    end
  end
end
# Creates Discourse topics from the stored topic rows, one batch at a time.
# Topics are created through create_posts under a prefixed import id (see
# import_topic_id). After creation a permalink for the old Zendesk URL is
# added unless one already exists.
def import_topics
  puts "", "creating topics"

  topic_count = @db.count_topics
  cursor = ''

  batches do |offset|
    topic_rows, cursor = @db.fetch_topics(cursor)
    break if topic_rows.empty?

    import_ids = topic_rows.map { |topic_row| import_topic_id(topic_row['id']) }
    next if all_records_exist?(:posts, import_ids)

    create_posts(topic_rows, total: topic_count, offset: offset) do |topic_row|
      # Titles are trimmed and cut to 255 characters; blank ones get a placeholder.
      title =
        if topic_row['title'].present?
          topic_row['title'].strip[0...255]
        else
          "Topic title missing"
        end

      add_permalink = proc do |post|
        old_path = remove_domain(topic_row['url'])
        Permalink.create(url: old_path, topic_id: post.topic.id) unless permalink_exists?(old_path)
      end

      {
        id: import_topic_id(topic_row['id']),
        title: title,
        raw: normalize_raw(topic_row['raw']),
        category: category_id_from_imported_category_id(topic_row['category_id']),
        # Authors that were not imported fall back to the system user.
        user_id: user_id_from_imported_user_id(topic_row['user_id']) || Discourse.system_user.id,
        created_at: topic_row['created_at'],
        closed: topic_row['closed'] == 1,
        post_create_action: add_permalink
      }
    end
  end
end
# Builds the import id of a topic by prefixing its Zendesk id with "T".
# NOTE(review): topics and comments are both created through create_posts, so
# the prefix presumably keeps their import ids apart — confirm.
def import_topic_id(topic_id)
  format('T%s', topic_id)
end
# Creates Discourse posts from the stored post rows, one batch at a time, in
# the order returned by fetch_sorted_posts. A row whose topic was not imported
# is printed and skipped. After creation a permalink for the old Zendesk URL
# is added unless one already exists.
def import_posts
  puts "", "creating posts"

  post_count = @db.count_posts
  last_seen_row_id = 0

  batches do |offset|
    post_rows, last_seen_row_id = @db.fetch_sorted_posts(last_seen_row_id)
    break if post_rows.empty?

    create_posts(post_rows, total: post_count, offset: offset) do |post_row|
      parent_topic = topic_lookup_from_imported_post_id(import_topic_id(post_row['topic_id']))

      if parent_topic.nil?
        p "MISSING TOPIC #{post_row['topic_id']}"
        p post_row
        # The block yields nil for this row, so no post data is returned for it.
        next
      end

      add_permalink = proc do |post|
        old_path = remove_domain(post_row['url'])
        Permalink.create(url: old_path, post_id: post.id) unless permalink_exists?(old_path)
      end

      {
        id: post_row['id'],
        raw: normalize_raw(post_row['raw']),
        # Authors that were not imported fall back to the system user.
        user_id: user_id_from_imported_user_id(post_row['user_id']) || Discourse.system_user.id,
        topic_id: parent_topic[:topic_id],
        created_at: post_row['created_at'],
        post_create_action: add_permalink
      }
    end
  end
end
# Converts the HTML body of a Zendesk topic or comment into Markdown.
#
# @param raw [String, nil] body as stored from the API
# @return [String] the Markdown produced by ReverseMarkdown, or "<missing>"
#   when the body is nil or contains only whitespace
def normalize_raw(raw)
  # A nil body would otherwise raise NoMethodError on gsub and abort the
  # import; a placeholder keeps the record importable.
  return "<missing>" if raw.nil? || raw.strip.empty?

  # The single-quoted '\n' is the literal two-character sequence backslash + n,
  # not a newline character, so real line breaks are left untouched.
  # NOTE(review): presumably intentional (escaped newlines in the HTML) — confirm.
  without_escaped_newlines = raw.gsub('\n', '')

  ReverseMarkdown.convert(without_escaped_newlines)
end
# Removes the first occurrence of the source URL from the given URL, which
# leaves the path for URLs that start with it. URLs that do not contain the
# source URL are returned unchanged.
def remove_domain(url)
  before, _source_url, after = url.partition(@source_url)
  before + after
end
# Tells whether a permalink with the given URL is already stored.
#
# @param url [String] permalink URL to look for
# @return [Boolean] true when a Permalink record with that URL exists
def permalink_exists?(url)
  # Callers only use the result as a condition, so the record itself does not
  # need to be loaded; exists? just asks whether a matching row is there.
  Permalink.exists?(url: url)
end
# Returns the HTTP client for the source URL's host and port. The client is
# built on first use and reused afterwards; it uses 30 second open and read
# timeouts and enables SSL when the source URL scheme is "https".
def connection
  return @_connection if @_connection

  endpoint = URI.parse(@source_url)

  @_connection = Net::HTTP.new(endpoint.host, endpoint.port).tap do |http|
    http.open_timeout = 30
    http.read_timeout = 30
    http.use_ssl = endpoint.scheme == "https"
  end
end
# Returns the value of the HTTP Authorization header: "Basic " followed by the
# strict Base64 encoding of "<auth_email>/token:<auth_token>". The value is
# computed once and reused afterwards.
def authorization
  return @_authorization if @_authorization

  credentials = [@auth_email, "token:#{@auth_token}"].join("/")
  @_authorization = "Basic " + Base64.strict_encode64(credentials)
end
# Requests a paginated collection from the Zendesk API and yields every row.
#
# Follows the "next_page" URL of each response until there is none. A request
# that raises is attempted up to five times with a 20 second pause in between.
#
# @param path [String] API path that is appended to the source URL
# @param array_name [String] key of the JSON array that holds the rows
# @param show_status [Boolean] whether to print progress after each page
# @yieldparam row [Object] one element of the parsed array
# @raise [RuntimeError] when every attempt of a request fails, or when a
#   response does not contain the expected array
def get_from_api(path, array_name, show_status: true)
  max_attempts = 5
  retry_delay = 20 # seconds

  url = "#{@source_url}#{path}"
  start_time = Time.now

  while url
    get = Net::HTTP::Get.new(url)
    get['User-Agent'] = 'Discourse Zendesk Importer'
    get['Authorization'] = authorization

    attempts = 0
    begin
      attempts += 1
      response = connection.request(get)
      puts("Retry successful!") if attempts > 1
    rescue => e
      failure = "#{e.class.name} #{e.message}"

      # Give up with a descriptive error instead of carrying on without a
      # response, which would fail later with NoMethodError on nil.
      raise "Request failed #{url} after #{attempts} attempts. #{failure}" if attempts >= max_attempts

      puts "Request failed #{url}. Waiting and will retry. #{failure}"
      sleep(retry_delay)
      retry
    end

    json = JSON.parse(response.body)
    rows = json[array_name]

    # Error responses do not carry the requested array; report them clearly.
    raise "Response for #{url} has no '#{array_name}' array (HTTP #{response.code})" if rows.nil?

    rows.each { |row| yield row }

    url = json['next_page']
    next unless show_status

    if json['page'] && json['page_count']
      print_status(json['page'], json['page_count'], start_time)
    else
      print '.'
    end
  end
end
end
# Command-line entry point: expects exactly four arguments, the second of
# which must be an existing directory; otherwise prints the usage and exits
# with status 1.
if ARGV.length != 4 || !Dir.exist?(ARGV[1])
  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN", ""
  exit 1
end

source_url, dirname, auth_email, auth_token = ARGV
ImportScripts::ZendeskApi.new(source_url, dirname, auth_email, auth_token).perform