From 3e5faffb0de300c7273b0c0e479d2666218a0677 Mon Sep 17 00:00:00 2001 From: Leonardo Mosquera Date: Fri, 29 Apr 2022 13:24:29 -0300 Subject: [PATCH] DEV: mbox importer improvements (#16557) * FIX: support specifying parent_category_id in mbox import metadata * FIX: elide tabs from topic titles * FIX: optionally fix Mailman from: addresses * DEV: optionally elide anything up to the last = in email addresses * Fix Mailman broken from: detection --- script/import_scripts/base.rb | 11 +++++- script/import_scripts/mbox/importer.rb | 3 +- script/import_scripts/mbox/settings.yml | 16 +++++++- .../import_scripts/mbox/support/database.rb | 9 +++-- script/import_scripts/mbox/support/indexer.rb | 39 ++++++++++++++++--- .../import_scripts/mbox/support/settings.rb | 4 ++ 6 files changed, 69 insertions(+), 13 deletions(-) diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb index 3fcfce549a6..6a28124d3fa 100644 --- a/script/import_scripts/base.rb +++ b/script/import_scripts/base.rb @@ -458,7 +458,16 @@ class ImportScripts::Base .where("LOWER(name) = ?", opts[:name].downcase.strip) .first - return existing if existing + if existing + if import_id && existing.custom_fields["import_id"] != import_id + existing.custom_fields["import_id"] = import_id + existing.save! 
+ + add_category(import_id, existing) + end + + return existing + end post_create_action = opts.delete(:post_create_action) diff --git a/script/import_scripts/mbox/importer.rb b/script/import_scripts/mbox/importer.rb index 71b405e7011..b3712481697 100644 --- a/script/import_scripts/mbox/importer.rb +++ b/script/import_scripts/mbox/importer.rb @@ -50,7 +50,8 @@ module ImportScripts::Mbox create_categories(rows) do |row| { id: row['name'], - name: row['name'] + name: row['name'], + parent_category_id: row['parent_category_id'].presence, } end end diff --git a/script/import_scripts/mbox/settings.yml b/script/import_scripts/mbox/settings.yml index 4c0a3acc11a..639ee0fea33 100644 --- a/script/import_scripts/mbox/settings.yml +++ b/script/import_scripts/mbox/settings.yml @@ -48,4 +48,18 @@ tags: # https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages). remove_subject_prefixes: - "Re: " - - "Fwd?: " \ No newline at end of file + - "Fwd?: " + +# For emails processed by mailman with this type of bogus header: +# From: Jane Smith via ListName +# attempt to get the actual sender's address from other available headers like Reply-To or the /^From / line +fix_mailman_via_addresses: false + +# Elide everything up to the last occurrence of = in addresses (BATV, etc.) +# i.e. 
normalize these: +# prvs=00377ab9a2=someone@somewhere.com +# bounces+840901-dbd9-flang-dev=someone@somewhere.com +# blah=blih=bluh=someone@somewhere.com +# into: +# someone@somewhere.com +elide_equals_in_addresses: false diff --git a/script/import_scripts/mbox/support/database.rb b/script/import_scripts/mbox/support/database.rb index 052f0d479a9..9eb0b1f30b1 100644 --- a/script/import_scripts/mbox/support/database.rb +++ b/script/import_scripts/mbox/support/database.rb @@ -30,8 +30,8 @@ module ImportScripts::Mbox def insert_category(category) @db.execute(<<-SQL, category) - INSERT OR REPLACE INTO category (name, description) - VALUES (:name, :description) + INSERT OR REPLACE INTO category (name, description, parent_category_id) + VALUES (:name, :description, :parent_category_id) SQL end @@ -150,7 +150,7 @@ module ImportScripts::Mbox def fetch_categories @db.execute <<-SQL - SELECT name, description + SELECT name, description, parent_category_id FROM category ORDER BY name SQL @@ -218,7 +218,8 @@ module ImportScripts::Mbox @db.execute <<-SQL CREATE TABLE IF NOT EXISTS category ( name TEXT NOT NULL PRIMARY KEY, - description TEXT + description TEXT, + parent_category_id INTEGER ) SQL end diff --git a/script/import_scripts/mbox/support/indexer.rb b/script/import_scripts/mbox/support/indexer.rb index c8164534d77..b381292c47a 100644 --- a/script/import_scripts/mbox/support/indexer.rb +++ b/script/import_scripts/mbox/support/indexer.rb @@ -55,7 +55,8 @@ module ImportScripts::Mbox category = { name: metadata['name'].presence || File.basename(directory), - description: metadata['description'] + description: metadata['description'], + parent_category_id: metadata['parent_category_id'].presence, } @database.insert_category(category) @@ -67,7 +68,28 @@ module ImportScripts::Mbox begin msg_id = receiver.message_id parsed_email = receiver.mail + from_email, from_display_name = receiver.parse_from_field(parsed_email) + + if @settings.fix_mailman_via_addresses + # Detect cases 
like this and attempt to get actual sender from other headers: + # From: Jane Smith via ListName + + if receiver.mail['X-Mailman-Version'] && from_display_name =~ /\bvia \S+$/i + email_from_from_line = opts[:from_line].scan(/From (\S+)/).flatten.first + a = Mail::Address.new(email_from_from_line) + from_email = a.address + from_display_name = a.display_name + # if name is not available there, look for it in Reply-To + if from_display_name.nil? + reply_to = receiver.mail.to_s.scan(/[\n\r]Reply-To: ([^\r\n]+)/).flatten.first + from_display_name = Mail::Address.new(reply_to).display_name + end + end + end + + from_email = from_email.sub(/^(.*)=/, '') if @settings.elide_equals_in_addresses + body, elided, format = receiver.select_body reply_message_ids = extract_reply_message_ids(parsed_email) @@ -125,11 +147,12 @@ module ImportScripts::Mbox puts "indexing #{filename}" if @split_regex.present? - each_mail(filename) do |raw_message, first_line_number, last_line_number| + each_mail(filename) do |raw_message, first_line_number, last_line_number, from_line| opts = { first_line_number: first_line_number, last_line_number: last_line_number, - start_time: monotonic_time + start_time: monotonic_time, + from_line: from_line, } receiver = read_mail_from_string(raw_message) yield receiver, filename, opts if receiver.present? @@ -159,13 +182,17 @@ module ImportScripts::Mbox first_line_number = 1 last_line_number = 0 + from_line = nil + each_line(filename) do |line| if line.scrub =~ @split_regex if last_line_number > 0 - yield raw_message, first_line_number, last_line_number + yield raw_message, first_line_number, last_line_number, from_line raw_message = +'' first_line_number = last_line_number + 1 end + + from_line = line else raw_message << line end @@ -173,7 +200,7 @@ module ImportScripts::Mbox last_line_number += 1 end - yield raw_message, first_line_number, last_line_number if raw_message.present? 
+ yield raw_message, first_line_number, last_line_number, from_line if raw_message.present? end def each_line(filename) @@ -202,7 +229,7 @@ module ImportScripts::Mbox def extract_subject(receiver, list_name) subject = receiver.subject - subject.blank? ? nil : subject.strip + subject.blank? ? nil : subject.strip.gsub(/\t+/, ' ') end def ignored_file?(path, checksums) diff --git a/script/import_scripts/mbox/support/settings.rb b/script/import_scripts/mbox/support/settings.rb index e1fd15b8697..eff1511b67a 100644 --- a/script/import_scripts/mbox/support/settings.rb +++ b/script/import_scripts/mbox/support/settings.rb @@ -21,6 +21,8 @@ module ImportScripts::Mbox attr_reader :automatically_remove_list_name_prefix attr_reader :show_trimmed_content attr_reader :tags + attr_reader :fix_mailman_via_addresses + attr_reader :elide_equals_in_addresses def initialize(yaml) @data_dir = yaml['data_dir'] @@ -39,6 +41,8 @@ module ImportScripts::Mbox @automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix'] @show_trimmed_content = yaml['show_trimmed_content'] + @fix_mailman_via_addresses = yaml['fix_mailman_via_addresses'] + @elide_equals_in_addresses = yaml['elide_equals_in_addresses'] @tags = [] if yaml['tags'].present?