From 15f657309a9b457ff315482da2fe8fbbed9caa02 Mon Sep 17 00:00:00 2001
From: Neil Lalonde
Date: Tue, 28 Aug 2018 10:21:39 -0400
Subject: [PATCH] FEATURE: Zendesk importer that uses its API to get data

---
 script/import_scripts/base.rb               |   2 +-
 .../import_scripts/base/generic_database.rb |   4 +
 script/import_scripts/zendesk_api.rb        | 306 ++++++++++++++++++
 3 files changed, 311 insertions(+), 1 deletion(-)
 create mode 100644 script/import_scripts/zendesk_api.rb

diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb
index 71df638ed61..880150c11f5 100644
--- a/script/import_scripts/base.rb
+++ b/script/import_scripts/base.rb
@@ -873,7 +873,7 @@ class ImportScripts::Base
     @start_times.fetch(key) { |k| @start_times[k] = Time.now }
   end
 
-  def batches(batch_size)
+  def batches(batch_size = 1000)
     offset = 0
     loop do
       yield offset
diff --git a/script/import_scripts/base/generic_database.rb b/script/import_scripts/base/generic_database.rb
index 5b7df59cf4b..39284a4d345 100644
--- a/script/import_scripts/base/generic_database.rb
+++ b/script/import_scripts/base/generic_database.rb
@@ -125,6 +125,10 @@ module ImportScripts
       @db.execute(sql)
     end
 
+    def get_first_value(sql)
+      @db.get_first_value(sql)
+    end
+
     private
 
     def configure_database
diff --git a/script/import_scripts/zendesk_api.rb b/script/import_scripts/zendesk_api.rb
new file mode 100644
index 00000000000..ddea6f1cb5f
--- /dev/null
+++ b/script/import_scripts/zendesk_api.rb
@@ -0,0 +1,306 @@
+# Zendesk importer
+#
+# This one uses their API.
+
+require 'reverse_markdown'
+require_relative 'base'
+require_relative 'base/generic_database'
+
+# Call it like this:
+# RAILS_ENV=production bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN
+class ImportScripts::Zendesk < ImportScripts::Base
+  BATCH_SIZE = 1000
+
+  def initialize(source_url, path, auth_email, auth_token)
+    super()
+
+    @source_url = source_url
+    @path = path
+    @auth_email = auth_email
+    @auth_token = auth_token
+    @db = ImportScripts::GenericDatabase.new(@path, batch_size: BATCH_SIZE, recreate: true)
+  end
+
+  def execute
+    fetch_from_api
+
+    import_categories
+    import_users
+    import_topics
+    import_posts
+  end
+
+  def fetch_from_api
+    puts '', 'fetching categories...'
+
+    get_from_api('/api/v2/community/topics.json', 'topics') do |row|
+      @db.insert_category(
+        id: row['id'],
+        name: row['name'],
+        description: row['description'],
+        position: row['position'],
+        url: row['html_url']
+      )
+    end
+
+    puts '', 'fetching topics...'
+
+    get_from_api('/api/v2/community/posts.json', 'posts') do |row|
+      @db.insert_topic(
+        id: row['id'],
+        title: row['title'],
+        raw: row['details'],
+        category_id: row['topic_id'],
+        closed: row['closed'],
+        user_id: row['author_id'],
+        created_at: row['created_at'],
+        url: row['html_url']
+      )
+    end
+
+    puts '', 'fetching posts...'
+    total_count = @db.count_topics
+    start_time = Time.now
+    last_id = ''
+
+    batches do |offset|
+      rows, last_id = @db.fetch_topics(last_id)
+      break if rows.empty?
+
+      print_status(offset, total_count, start_time)
+
+      rows.each do |topic_row|
+        get_from_api("/api/v2/community/posts/#{topic_row['id']}/comments.json", 'comments', show_status: false) do |row|
+          @db.insert_post(
+            id: row['id'],
+            raw: row['body'],
+            topic_id: topic_row['id'],
+            user_id: row['author_id'],
+            created_at: row['created_at'],
+            url: row['html_url']
+          )
+        end
+      end
+    end
+
+    puts '', 'fetching users...'
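+    # Topic and post payloads only carry author ids, so gather the distinct
+    # user ids already stored above and fetch the full accounts through the
+    # users show_many endpoint, 50 ids per request.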
+ + results = @db.execute_sql("SELECT user_id FROM topic") + user_ids = results.map { |h| h['user_id']&.to_i } + results = @db.execute_sql("SELECT user_id FROM post") + user_ids += results.map { |h| h['user_id']&.to_i } + user_ids.uniq! + user_ids.sort! + + total_users = user_ids.size + start_time = Time.now + + while !user_ids.empty? + print_status(total_users - user_ids.size, total_users, start_time) + get_from_api("/api/v2/users/show_many.json?ids=#{user_ids.shift(50).join(',')}", 'users', show_status: false) do |row| + @db.insert_user( + id: row['id'], + email: row['email'], + name: row['name'], + created_at: row['created_at'], + last_seen_at: row['last_login_at'], + active: row['active'] + ) + end + end + + @db.sort_posts_by_created_at + end + + def import_categories + puts "", "creating categories" + rows = @db.fetch_categories + + create_categories(rows) do |row| + { + id: row['id'], + name: row['name'], + description: row['description'], + position: row['position'], + post_create_action: proc do |category| + url = remove_domain(row['url']) + Permalink.create(url: url, category_id: category.id) unless permalink_exists?(url) + end + } + end + end + + def import_users + puts "", "creating users" + total_count = @db.count_users + last_id = '' + + batches do |offset| + rows, last_id = @db.fetch_users(last_id) + break if rows.empty? + + next if all_records_exist?(:users, rows.map { |row| row['id'] }) + + create_users(rows, total: total_count, offset: offset) do |row| + { + id: row['id'], + email: row['email'], + name: row['name'], + created_at: row['created_at'], + last_seen_at: row['last_seen_at'], + active: row['active'] == 1 + } + end + end + end + + def import_topics + puts "", "creating topics" + total_count = @db.count_topics + last_id = '' + + batches do |offset| + rows, last_id = @db.fetch_topics(last_id) + break if rows.empty? + + next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row['id']) }) + + create_posts(rows, total: total_count, offset: offset) do |row| + { + id: import_topic_id(row['id']), + title: row['title'].present? ? row['title'].strip[0...255] : "Topic title missing", + raw: normalize_raw(row['raw']), + category: category_id_from_imported_category_id(row['category_id']), + user_id: user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id, + created_at: row['created_at'], + closed: row['closed'] == 1, + post_create_action: proc do |post| + url = remove_domain(row['url']) + Permalink.create(url: url, topic_id: post.topic.id) unless permalink_exists?(url) + end + } + end + end + end + + def import_topic_id(topic_id) + "T#{topic_id}" + end + + def import_posts + puts "", "creating posts" + total_count = @db.count_posts + last_row_id = 0 + + batches do |offset| + rows, last_row_id = @db.fetch_posts(last_row_id) + break if rows.empty? + + create_posts(rows, total: total_count, offset: offset) do |row| + topic = topic_lookup_from_imported_post_id(import_topic_id(row['topic_id'])) + + if topic.nil? 
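+          # Parent topic was never imported; log the orphaned comment and skip it.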
+          p "MISSING TOPIC #{row['topic_id']}"
+          p row
+          next
+        end
+
+        {
+          id: row['id'],
+          raw: normalize_raw(row['raw']),
+          user_id: user_id_from_imported_user_id(row['user_id']) || Discourse.system_user.id,
+          topic_id: topic[:topic_id],
+          created_at: row['created_at'],
+          post_create_action: proc do |post|
+            url = remove_domain(row['url'])
+            Permalink.create(url: url, post_id: post.id) unless permalink_exists?(url)
+          end
+        }
+      end
+    end
+  end
+
+  def normalize_raw(raw)
+    raw = raw.gsub('\n', '')
+    raw = ReverseMarkdown.convert(raw)
+    raw
+  end
+
+  def remove_domain(url)
+    url.sub(@source_url, "")
+  end
+
+  def permalink_exists?(url)
+    Permalink.find_by(url: url)
+  end
+
+  def connection
+    @_connection ||= begin
+      connect_uri = URI.parse(@source_url)
+
+      http = Net::HTTP.new(connect_uri.host, connect_uri.port)
+      http.open_timeout = 30
+      http.read_timeout = 30
+      http.use_ssl = connect_uri.scheme == "https"
+
+      http
+    end
+  end
+
+  def authorization
+    @_authorization ||= begin
+      auth_str = "#{@auth_email}/token:#{@auth_token}"
+      "Basic #{Base64.strict_encode64(auth_str)}"
+    end
+  end
+
+  def get_from_api(path, array_name, show_status: true)
+    url = "#{@source_url}#{path}"
+    start_time = Time.now
+
+    while url
+      get = Net::HTTP::Get.new(url)
+      get['User-Agent'] = 'Discourse Zendesk Importer'
+      get['Authorization'] = authorization
+
+      retry_count = 0
+
+      begin
+        response = connection.request(get)
+        puts("Retry successful!") if retry_count > 0
+      rescue => e
+        puts "Request failed #{url}. Waiting and will retry. #{e.class.name} #{e.message}"
+        retry_count += 1
+        raise if retry_count >= 5
+        sleep(20)
+        retry
+      end
+
+      json = JSON.parse(response.body)
+
+      json[array_name].each do |row|
+        yield row
+      end
+
+      url = json['next_page']
+
+      if show_status
+        if json['page'] && json['page_count']
+          print_status(json['page'], json['page_count'], start_time)
+        else
+          print '.'
+        end
+      end
+    end
+  end
+end
+
+unless ARGV.length == 4 && Dir.exist?(ARGV[1])
+  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/zendesk_api.rb SOURCE_URL DIRNAME AUTH_EMAIL AUTH_TOKEN", ""
+  exit 1
+end
+
+ImportScripts::Zendesk.new(ARGV[0], ARGV[1], ARGV[2], ARGV[3]).perform
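
The new script authenticates with Zendesk's email/token scheme: the Authorization header is "Basic " followed by the Base64-encoded string "AUTH_EMAIL/token:AUTH_TOKEN" (see the authorization method above). Below is a minimal sketch for verifying the credentials before kicking off a long import, assuming the standard /api/v2/users/me.json endpoint; the hostname, email, and token are placeholders.

  require 'net/http'
  require 'uri'
  require 'base64'

  source_url = 'https://example.zendesk.com'   # placeholder: the SOURCE_URL argument
  auth_email = 'agent@example.com'             # placeholder: the AUTH_EMAIL argument
  auth_token = 'REPLACE_WITH_API_TOKEN'        # placeholder: the AUTH_TOKEN argument

  uri = URI.parse("#{source_url}/api/v2/users/me.json")
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == 'https'

  get = Net::HTTP::Get.new(uri.request_uri)
  get['User-Agent'] = 'Discourse Zendesk Importer'
  get['Authorization'] = "Basic #{Base64.strict_encode64("#{auth_email}/token:#{auth_token}")}"

  response = http.request(get)
  puts "HTTP #{response.code}"   # 200 means the credentials are accepted

A 200 response confirms the token works; a 401 points at a bad email/token pair.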