From 0a88232e872cb7ab3d1540a87589aa9ddbfe40bd Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Fri, 13 Mar 2020 23:59:14 +0100 Subject: [PATCH] DEV: Improve mbox import script * Better documentation of settings * Add option to exclude trimmed parts of emails (enabled by default) to not reveal email addresses --- script/import_scripts/mbox.rb | 6 +---- script/import_scripts/mbox/importer.rb | 10 ++++--- script/import_scripts/mbox/settings.yml | 26 +++++++++++++++---- .../import_scripts/mbox/support/settings.rb | 20 ++++++++------ 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/script/import_scripts/mbox.rb b/script/import_scripts/mbox.rb index 59d3a3780fd..cee093033fc 100644 --- a/script/import_scripts/mbox.rb +++ b/script/import_scripts/mbox.rb @@ -8,11 +8,7 @@ end module ImportScripts module Mbox - require_relative 'mbox/support/settings' - - @settings = Settings.load(ARGV[0]) - require_relative 'mbox/importer' - Importer.new(@settings).perform + Importer.new(ARGV[0]).perform end end diff --git a/script/import_scripts/mbox/importer.rb b/script/import_scripts/mbox/importer.rb index d4f8ec47945..71b405e7011 100644 --- a/script/import_scripts/mbox/importer.rb +++ b/script/import_scripts/mbox/importer.rb @@ -7,9 +7,8 @@ require_relative 'support/settings' module ImportScripts::Mbox class Importer < ImportScripts::Base - # @param settings [ImportScripts::Mbox::Settings] - def initialize(settings) - @settings = settings + def initialize(settings_filename) + @settings = Settings.load(settings_filename) super() @database = Database.new(@settings.data_dir, @settings.batch_size) @@ -139,7 +138,10 @@ module ImportScripts::Mbox body = receiver.add_attachments(body, user) end - body = "#{body}#{Email::Receiver.elided_html(elided)}" if elided.present? + if elided.present? 
&& @settings.show_trimmed_content + body = "#{body}#{Email::Receiver.elided_html(elided)}" + end + body end diff --git a/script/import_scripts/mbox/settings.yml b/script/import_scripts/mbox/settings.yml index f1188001002..60322ec3058 100644 --- a/script/import_scripts/mbox/settings.yml +++ b/script/import_scripts/mbox/settings.yml @@ -1,22 +1,38 @@ +# Directory where all emails and mbox files are stored. data_dir: /shared/import/data -# mbox files +# Regular expression for splitting emails in mbox files. +# Choose one of the following examples that works for you or add your own regular expression. split_regex: "^From .+@.+" #split_regex: "^From .+@example.com.+" -# individual emails -#split_regex: "" - # Listserv files #split_regex: "^=========================================================================" -default_trust_level: 1 +# When each file contains only one email +#split_regex: "" + +# Prefer the HTML part of emails instead of the plain text part. prefer_html: true + +# The trust level of users created by the import script. +default_trust_level: 1 + +# Create staged users instead of regular users. staged: true + +# You can enable this option if the script should stop after indexing all emails +# instead of executing the import right away. That's useful if you need to make some changes +# to the indexed data in the `index.db` created during the indexing step. index_only: false +# Only enable this option when the import script fails to group messages with +# the `In-Reply-To` and `References` headers. group_messages_by_subject: false +# Always show trimmed part of emails. WARNING: This might reveal email addresses. +show_trimmed_content: false + # Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags. # You can map one or more case-insensitive prefixes to the same tag in Discourse. 
# "Tag name in Discourse": "foo" diff --git a/script/import_scripts/mbox/support/settings.rb b/script/import_scripts/mbox/support/settings.rb index f30c5d56d18..9311dff5e78 100644 --- a/script/import_scripts/mbox/support/settings.rb +++ b/script/import_scripts/mbox/support/settings.rb @@ -19,6 +19,7 @@ module ImportScripts::Mbox attr_reader :group_messages_by_subject attr_reader :subject_prefix_regex attr_reader :automatically_remove_list_name_prefix + attr_reader :show_trimmed_content attr_reader :tags def initialize(yaml) @@ -31,20 +32,23 @@ module ImportScripts::Mbox @index_only = yaml['index_only'] @group_messages_by_subject = yaml['group_messages_by_subject'] - unless yaml['remove_subject_prefixes'].empty? + if yaml['remove_subject_prefixes'].present? prefix_regexes = yaml['remove_subject_prefixes'].map { |p| Regexp.new(p) } @subject_prefix_regex = /^#{Regexp.union(prefix_regexes).source}/i end @automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix'] + @show_trimmed_content = yaml['show_trimmed_content'] - @tags = [] - yaml['tags'].each do |tag_name, value| - prefixes = Regexp.union(value).source - @tags << { - regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i, - name: tag_name - } + if yaml['tags'].present? + @tags = [] + yaml['tags'].each do |tag_name, value| + prefixes = Regexp.union(value).source + @tags << { + regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i, + name: tag_name + } + end end end end