DEV: Improve mbox import script

* Better documentation of settings
* Add option to exclude trimmed parts of emails (enabled by default) to not reveal email addresses
This commit is contained in:
Gerhard Schlager 2020-03-13 23:59:14 +01:00
parent 36062f43c8
commit 0a88232e87
4 changed files with 40 additions and 22 deletions

View File

@ -8,11 +8,7 @@ end
module ImportScripts
module Mbox
require_relative 'mbox/support/settings'
@settings = Settings.load(ARGV[0])
require_relative 'mbox/importer'
Importer.new(@settings).perform
Importer.new(ARGV[0]).perform
end
end

View File

@ -7,9 +7,8 @@ require_relative 'support/settings'
module ImportScripts::Mbox
class Importer < ImportScripts::Base
# @param settings_filename [String] path of the settings file that is loaded with Settings.load
def initialize(settings)
@settings = settings
def initialize(settings_filename)
@settings = Settings.load(settings_filename)
super()
@database = Database.new(@settings.data_dir, @settings.batch_size)
@ -139,7 +138,10 @@ module ImportScripts::Mbox
body = receiver.add_attachments(body, user)
end
body = "#{body}#{Email::Receiver.elided_html(elided)}" if elided.present?
if elided.present? && @settings.show_trimmed_content
body = "#{body}#{Email::Receiver.elided_html(elided)}"
end
body
end

View File

@ -1,22 +1,38 @@
# Directory where all emails and mbox files are stored.
data_dir: /shared/import/data
# mbox files
# Regular expression for splitting emails in mbox files.
# Choose one of the following examples that works for you or add your own regular expression.
split_regex: "^From .+@.+"
#split_regex: "^From .+@example.com.+"
# individual emails
#split_regex: ""
# Listserv files
#split_regex: "^========================================================================="
default_trust_level: 1
# When each file contains only one email
#split_regex: ""
# Prefer the HTML part of emails instead of the plain text part.
prefer_html: true
# The trust level of users created by the import script.
default_trust_level: 1
# Create staged users instead of regular users.
staged: true
# You can enable this option if the script should stop after indexing all emails
# instead of executing the import right away. That's useful if you need to make some changes
# to the indexed data in the `index.db` created during the indexing step.
index_only: false
# Only enable this option when the import script fails to group messages
# with the `In-Reply-To` and `References` headers.
group_messages_by_subject: false
# Always show trimmed part of emails. WARNING: This might reveal email addresses.
show_trimmed_content: false
# Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags.
# You can map one or more case-insensitive prefixes to the same tag in Discourse.
# "Tag name in Discourse": "foo"

View File

@ -19,6 +19,7 @@ module ImportScripts::Mbox
attr_reader :group_messages_by_subject
attr_reader :subject_prefix_regex
attr_reader :automatically_remove_list_name_prefix
attr_reader :show_trimmed_content
attr_reader :tags
def initialize(yaml)
@ -31,20 +32,23 @@ module ImportScripts::Mbox
@index_only = yaml['index_only']
@group_messages_by_subject = yaml['group_messages_by_subject']
unless yaml['remove_subject_prefixes'].empty?
if yaml['remove_subject_prefixes'].present?
prefix_regexes = yaml['remove_subject_prefixes'].map { |p| Regexp.new(p) }
@subject_prefix_regex = /^#{Regexp.union(prefix_regexes).source}/i
end
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
@show_trimmed_content = yaml['show_trimmed_content']
@tags = []
yaml['tags'].each do |tag_name, value|
prefixes = Regexp.union(value).source
@tags << {
regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
name: tag_name
}
if yaml['tags'].present?
@tags = []
yaml['tags'].each do |tag_name, value|
prefixes = Regexp.union(value).source
@tags << {
regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
name: tag_name
}
end
end
end
end