discourse/script/import_scripts/mbox/importer.rb
Gerhard Schlager 0a88232e87 DEV: Improve mbox import script
* Better documentation of settings
* Add option to exclude trimmed parts of emails (enabled by default) so that email addresses are not revealed
2020-03-14 00:00:36 +01:00

220 lines
5.4 KiB
Ruby

# frozen_string_literal: true
require_relative '../base'
require_relative 'support/database'
require_relative 'support/indexer'
require_relative 'support/settings'
module ImportScripts::Mbox
class Importer < ImportScripts::Base
# Loads the import settings, runs the base importer's initializer and creates
# the Database wrapper configured from those settings.
#
# @param settings_filename location of the settings file, passed to Settings.load
def initialize(settings_filename)
  # @settings is assigned before super() on purpose-looking order:
  # get_site_settings_for_import reads it (presumably called from the base
  # initializer — TODO confirm).
  @settings = Settings.load(settings_filename)
  super()

  data_dir = @settings.data_dir
  batch_size = @settings.batch_size
  @database = Database.new(data_dir, batch_size)
end
# Extends the site settings supplied by the superclass with the two values this
# importer needs: staged users are enabled and the HTML preference for incoming
# email follows the import settings.
#
# @return the object returned by super, with both keys assigned
def get_site_settings_for_import
  super.tap do |site_settings|
    site_settings[:enable_staged_users] = true
    site_settings[:incoming_email_prefer_html] = @settings.prefer_html
  end
end
protected
# Entry point of the import run. The message index is always built first; the
# actual import of categories, users and posts only happens when the settings
# do not ask for an index-only run.
def execute
  index_messages

  if @settings.index_only
    # Only sets @skip_updates here (presumably consumed by the base importer —
    # confirm) and skips every import step.
    @skip_updates = true
    return true
  end

  # Tags can only be assigned to topics when tagging is switched on.
  SiteSetting.tagging_enabled = true if @settings.tags.present?

  import_categories
  import_users
  import_posts
end
# Announces the indexing step and runs an Indexer built from the importer's
# database and settings.
#
# @return whatever Indexer#execute returns
def index_messages
  puts '', 'creating index'
  Indexer.new(@database, @settings).execute
end
# Creates one category per row returned by the database. The category name
# doubles as the import id.
def import_categories
  puts '', 'creating categories'

  create_categories(@database.fetch_categories) do |category|
    name = category['name']
    { id: name, name: name }
  end
end
# Imports users batch by batch. Each call to the database is given the cursor
# returned by the previous call and hands back the rows plus the next cursor.
# The loop ends on the first empty batch; batches whose emails are all known
# already are skipped.
def import_users
  puts '', 'creating users'

  user_total = @database.count_users
  cursor = ''

  batches do |offset|
    rows, cursor = @database.fetch_users(cursor)
    break if rows.empty?

    emails = rows.map { |row| row['email'] }
    next if all_records_exist?(:users, emails)

    create_users(rows, total: user_total, offset: offset) do |row|
      # The email address serves as the import id of the user.
      {
        id: row['email'],
        email: row['email'],
        name: row['name'],
        trust_level: @settings.trust_level,
        staged: @settings.staged,
        active: !@settings.staged,
        created_at: to_time(row['date_of_first_message'])
      }
    end
  end
end
# Delegates to the superclass implementation using the batch size from the
# import settings. The caller's block is forwarded implicitly by super.
def batches
  batch_size = @settings.batch_size
  super(batch_size)
end
# Imports indexed messages as topics and posts, batch by batch. The database is
# queried with the row-id cursor it returned for the previous batch. The loop
# ends on the first empty batch; batches whose messages were all imported
# already are skipped.
def import_posts
  puts '', 'creating topics and posts'

  message_total = @database.count_messages
  cursor = 0

  batches do |offset|
    rows, cursor = @database.fetch_messages(cursor)
    break if rows.empty?

    message_ids = rows.map { |row| row['msg_id'] }
    next if all_records_exist?(:posts, message_ids)

    create_posts(rows, total: message_total, offset: offset) do |row|
      begin
        if row['email_date'].blank?
          puts "Date is missing. Skipping #{row['msg_id']}"
          nil
        else
          # Messages without an In-Reply-To value start a new topic.
          row['in_reply_to'].blank? ? map_first_post(row) : map_reply(row)
        end
      rescue => error
        # A message that cannot be mapped is reported; puts returns nil, so
        # the block yields nil for it, just like the missing-date case.
        puts "Failed to map post for #{row['msg_id']}", error, error.backtrace.join("\n")
      end
    end
  end
end
# Builds the attributes shared by first posts and replies.
#
# The author is the imported user matching the sender's email address; when no
# such user is known the system user is used instead. After the post has been
# created, the :post_create_action callback records the incoming email for it.
#
# @param row message row from the index database
# @return [Hash] post attributes
def map_post(row)
  author_id = user_id_from_imported_user_id(row['from_email'])
  author_id ||= Discourse::SYSTEM_USER_ID

  {
    id: row['msg_id'],
    user_id: author_id,
    created_at: to_time(row['email_date']),
    raw: format_raw(row, author_id),
    raw_email: row['raw_message'],
    via_email: true,
    post_create_action: proc { |post| create_incoming_email(post, row) }
  }
end
# Produces the raw post text for a message.
#
# Starts from the stored body (empty string when missing). If the message has
# attachments, the raw email is handed to Email::Receiver so the attachments
# are added on behalf of the post's author. The elided (trimmed) part is
# appended only when it exists and the settings ask to show it.
#
# @param row message row from the index database
# @param user_id id of the user the post is attributed to
# @return the post text
def format_raw(row, user_id)
  text = row['body'] || ''
  trimmed = row['elided']

  if row['attachment_count'].positive?
    text = Email::Receiver.new(row['raw_message']).add_attachments(text, User.find(user_id))
  end

  return text unless trimmed.present? && @settings.show_trimmed_content

  "#{text}#{Email::Receiver.elided_html(trimmed)}"
end
# Maps a message that starts a new topic to the attributes of its first post.
#
# Tags and subject prefixes are stripped from a copy of the subject so that
# row['subject'] keeps its original value. The callback built by map_post
# passes the same row to create_incoming_email once the post exists, and that
# method reads row['subject'].
#
# @param row message row from the index database; reads 'subject' and 'category'
# @return [Hash] result of map_post extended with :category, :title (stripped,
#   at most 255 characters) and, when at least one tag matched, :tags
def map_first_post(row)
  # remove_tags! edits its argument in place, hence the copy.
  subject = row['subject'].dup
  tags = remove_tags!(subject)

  mapped = map_post(row)
  mapped[:category] = category_id_from_imported_category_id(row['category'])
  mapped[:title] = subject.strip[0...255]
  mapped[:tags] = tags if tags.present?
  mapped
end
# Maps a message that replies to another message to the attributes of a post
# in the parent's topic.
#
# When the parent message is not known to the lookup, the reply is reported
# and skipped. The subject excerpt in that report tolerates a missing subject,
# so a subject-less orphan reply is skipped with the regular message instead of
# raising NoMethodError while building it.
#
# @param row message row from the index database; reads 'in_reply_to',
#   'msg_id' and 'subject'
# @return [Hash, nil] result of map_post extended with :topic_id, or nil when
#   the parent message does not exist
def map_reply(row)
  parent = @lookup.topic_lookup_from_imported_post_id(row['in_reply_to'])

  if parent.blank?
    subject_excerpt = row['subject'].to_s[0..40]
    puts "Parent message #{row['in_reply_to']} doesn't exist. Skipping #{row['msg_id']}: #{subject_excerpt}"
    return nil
  end

  mapped = map_post(row)
  mapped[:topic_id] = parent[:topic_id]
  mapped
end
# Strips subject prefixes and every configured tag pattern from the subject,
# modifying the string in place, and collects the names of the tags that
# matched.
#
# Removing a tag can uncover another prefix or tag, so the passes repeat until
# a pass leaves the subject's length unchanged.
#
# @param subject [String] subject line; mutated
# @return [Array] unique names of the matched tags, in order of first match;
#   patterns without a name are removed from the subject but not reported
def remove_tags!(subject)
  found = []
  remove_prefixes!(subject)

  loop do
    length_before = subject.length

    @settings.tags.each do |tag|
      next unless subject.sub!(tag[:regex], "")
      found << tag[:name] if tag[:name].present?
    end

    break if subject.length == length_before

    remove_prefixes!(subject)
  end

  found.uniq
end
# Removes leading subject prefixes, as described by the settings' prefix
# pattern, from the subject in place.
#
# @param subject [String] subject line; mutated
def remove_prefixes!(subject)
  # A subject can carry several prefixes: keep removing one match at a time
  # (sub! returns nil once nothing matches) and trim whitespace after each.
  subject.strip! while subject.sub!(@settings.subject_prefix_regex, "")
end
# Records the original email for a freshly created post by creating an
# IncomingEmail that links the message to the post, its topic and its user.
#
# @param post the created post; reads user_id, topic_id and id
# @param row message row from the index database
# @return whatever IncomingEmail.create returns
def create_incoming_email(post, row)
  email_attributes = {
    message_id: row['msg_id'],
    raw: row['raw_message'],
    subject: row['subject'],
    from_address: row['from_email'],
    user_id: post.user_id,
    topic_id: post.topic_id,
    post_id: post.id
  }

  IncomingEmail.create(**email_attributes)
end
# Converts a stored timestamp into a time in the application's time zone.
#
# @param timestamp value accepted by Time.zone.at, or nil/false
# @return the converted time, or nil when no timestamp was given
def to_time(timestamp)
  return nil unless timestamp

  Time.zone.at(timestamp)
end
end
end