DEV: Improve mbox import script

* Better documentation of settings * Add option to exclude trimmed parts of emails (enabled by default) to avoid revealing email addresses
2024-11-22 14:38:17 +08:00 · 2020-03-13 23:59:14 +01:00 · 2020-03-13 23:59:14 +01:00 · 0a88232e87
commit 0a88232e87
parent 36062f43c8
4 changed files with 40 additions and 22 deletions
--- a/script/import_scripts/mbox.rb
+++ b/script/import_scripts/mbox.rb
@ -8,11 +8,7 @@ end

 module ImportScripts
  module Mbox
-    require_relative 'mbox/support/settings'
-
-    @settings = Settings.load(ARGV[0])
-
    require_relative 'mbox/importer'
-    Importer.new(@settings).perform
+    Importer.new(ARGV[0]).perform
  end
 end
--- a/script/import_scripts/mbox/importer.rb
+++ b/script/import_scripts/mbox/importer.rb
@ -7,9 +7,8 @@ require_relative 'support/settings'

 module ImportScripts::Mbox
  class Importer < ImportScripts::Base
-    # @param settings [ImportScripts::Mbox::Settings]
-    def initialize(settings)
-      @settings = settings
+    def initialize(settings_filename)
+      @settings = Settings.load(settings_filename)
      super()

      @database = Database.new(@settings.data_dir, @settings.batch_size)
@ -139,7 +138,10 @@ module ImportScripts::Mbox
        body = receiver.add_attachments(body, user)
      end

-      body = "#{body}#{Email::Receiver.elided_html(elided)}" if elided.present?
+      if elided.present? && @settings.show_trimmed_content
+        body = "#{body}#{Email::Receiver.elided_html(elided)}"
+      end
+
      body
    end

--- a/script/import_scripts/mbox/settings.yml
+++ b/script/import_scripts/mbox/settings.yml
@ -1,22 +1,38 @@
+# Directory where all emails and mbox files are stored.
 data_dir: /shared/import/data

-# mbox files
+# Regular expression for splitting emails in mbox files.
+# Choose one of the following examples that works for you or add your own regular expression.
 split_regex: "^From .+@.+"
 #split_regex: "^From .+@example.com.+"

-# individual emails
-#split_regex: ""
-
 # Listserv files
 #split_regex: "^========================================================================="

-default_trust_level: 1
+# When each file contains only one email
+#split_regex: ""
+
+# Prefer the HTML part of emails instead of the plain text part.
 prefer_html: true
+
+# The trust level of users created by the import script.
+default_trust_level: 1
+
+# Create staged users instead of regular users.
 staged: true
+
+# You can enable this option if the script should stop after indexing all emails
+# instead of executing the import right away. That's useful if you need to make some changes
+# to the indexed data in the `index.db` created during the indexing step.
 index_only: false

+# Only enable this option when the import script fails to group messages
+# with the `In-Reply-To` and `References` headers.
 group_messages_by_subject: false

+# Always show the trimmed parts of emails. WARNING: This might reveal email addresses.
+show_trimmed_content: false
+
 # Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags.
 # You can map one or more case-insensitive prefixes to the same tag in Discourse.
 # "Tag name in Discourse": "foo"
--- a/script/import_scripts/mbox/support/settings.rb
+++ b/script/import_scripts/mbox/support/settings.rb
@ -19,6 +19,7 @@ module ImportScripts::Mbox
    attr_reader :group_messages_by_subject
    attr_reader :subject_prefix_regex
    attr_reader :automatically_remove_list_name_prefix
+    attr_reader :show_trimmed_content
    attr_reader :tags

    def initialize(yaml)
@ -31,20 +32,23 @@ module ImportScripts::Mbox
      @index_only = yaml['index_only']
      @group_messages_by_subject = yaml['group_messages_by_subject']

-      unless yaml['remove_subject_prefixes'].empty?
+      if yaml['remove_subject_prefixes'].present?
        prefix_regexes = yaml['remove_subject_prefixes'].map { |p| Regexp.new(p) }
        @subject_prefix_regex = /^#{Regexp.union(prefix_regexes).source}/i
      end

      @automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
+      @show_trimmed_content = yaml['show_trimmed_content']

-      @tags = []
-      yaml['tags'].each do |tag_name, value|
-        prefixes = Regexp.union(value).source
-        @tags << {
-          regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
-          name: tag_name
-        }
+      if yaml['tags'].present?
+        @tags = []
+        yaml['tags'].each do |tag_name, value|
+          prefixes = Regexp.union(value).source
+          @tags << {
+            regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
+            name: tag_name
+          }
+        end
      end
    end
  end