DEV: mbox importer improvements (#16557)

* FIX: support specifying parent_category_id in mbox import metadata
* FIX: elide tabs from topic titles
* FIX: optionally fix Mailman from: addresses
* DEV: optionally elide anything up to the last = in email addresses
Fix Mailman broken from: detection
This commit is contained in:
Leonardo Mosquera 2022-04-29 13:24:29 -03:00 committed by GitHub
parent 0f772bdf5b
commit 3e5faffb0d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 69 additions and 13 deletions

View File

@ -458,7 +458,16 @@ class ImportScripts::Base
.where("LOWER(name) = ?", opts[:name].downcase.strip)
.first
return existing if existing
if existing
if import_id && existing.custom_fields["import_id"] != import_id
existing.custom_fields["import_id"] = import_id
existing.save!
add_category(import_id, existing)
end
return existing
end
post_create_action = opts.delete(:post_create_action)

View File

@ -50,7 +50,8 @@ module ImportScripts::Mbox
create_categories(rows) do |row|
{
id: row['name'],
name: row['name']
name: row['name'],
parent_category_id: row['parent_category_id'].presence,
}
end
end

View File

@ -48,4 +48,18 @@ tags:
# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
remove_subject_prefixes:
- "Re: "
- "Fwd?: "
- "Fwd?: "
# For emails processed by mailman with this type of bogus header:
# From: Jane Smith via ListName <ListName@lists.example.com>
# attempt to get the actual sender's address from other available headers like Reply-To or the /^From / line
fix_mailman_via_addresses: false
# Elide everything up to and including the last occurrence of = in addresses (BATV, etc.)
# i.e. normalize these:
# prvs=00377ab9a2=someone@somewhere.com
# bounces+840901-dbd9-flang-dev=someone@somewhere.com
# blah=blih=bluh=someone@somewhere.com
# into:
# someone@somewhere.com
elide_equals_in_addresses: false

View File

@ -30,8 +30,8 @@ module ImportScripts::Mbox
# Writes one row to the `category` table via INSERT OR REPLACE, handing the
# `category` hash to @db.execute as the bind argument for the :name-style
# placeholders.
# NOTE(review): this text comes from a diff rendering with the +/- markers
# stripped, so the heredoc below holds two INSERT/VALUES pairs. Presumably the
# first pair (without parent_category_id) is the removed version and the second
# is its replacement -- confirm against the real file before running this.
def insert_category(category)
@db.execute(<<-SQL, category)
INSERT OR REPLACE INTO category (name, description)
VALUES (:name, :description)
INSERT OR REPLACE INTO category (name, description, parent_category_id)
VALUES (:name, :description, :parent_category_id)
SQL
end
@ -150,7 +150,7 @@ module ImportScripts::Mbox
def fetch_categories
@db.execute <<-SQL
SELECT name, description
SELECT name, description, parent_category_id
FROM category
ORDER BY name
SQL
@ -218,7 +218,8 @@ module ImportScripts::Mbox
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS category (
name TEXT NOT NULL PRIMARY KEY,
description TEXT
description TEXT,
parent_category_id INTEGER
)
SQL
end

View File

@ -55,7 +55,8 @@ module ImportScripts::Mbox
category = {
name: metadata['name'].presence || File.basename(directory),
description: metadata['description']
description: metadata['description'],
parent_category_id: metadata['parent_category_id'].presence,
}
@database.insert_category(category)
@ -67,7 +68,28 @@ module ImportScripts::Mbox
begin
msg_id = receiver.message_id
parsed_email = receiver.mail
from_email, from_display_name = receiver.parse_from_field(parsed_email)
if @settings.fix_mailman_via_addresses
# Detect cases like this and attempt to get actual sender from other headers:
# From: Jane Smith via ListName <ListName@lists.example.com>
if receiver.mail['X-Mailman-Version'] && from_display_name =~ /\bvia \S+$/i
email_from_from_line = opts[:from_line].scan(/From (\S+)/).flatten.first
a = Mail::Address.new(email_from_from_line)
from_email = a.address
from_display_name = a.display_name
# if name is not available there, look for it in Reply-To
if from_display_name.nil?
reply_to = receiver.mail.to_s.scan(/[\n\r]Reply-To: ([^\r\n]+)/).flatten.first
from_display_name = Mail::Address.new(reply_to).display_name
end
end
end
from_email = from_email.sub(/^(.*)=/, '') if @settings.elide_equals_in_addresses
body, elided, format = receiver.select_body
reply_message_ids = extract_reply_message_ids(parsed_email)
@ -125,11 +147,12 @@ module ImportScripts::Mbox
puts "indexing #{filename}"
if @split_regex.present?
each_mail(filename) do |raw_message, first_line_number, last_line_number|
each_mail(filename) do |raw_message, first_line_number, last_line_number, from_line|
opts = {
first_line_number: first_line_number,
last_line_number: last_line_number,
start_time: monotonic_time
start_time: monotonic_time,
from_line: from_line,
}
receiver = read_mail_from_string(raw_message)
yield receiver, filename, opts if receiver.present?
@ -159,13 +182,17 @@ module ImportScripts::Mbox
first_line_number = 1
last_line_number = 0
from_line = nil
each_line(filename) do |line|
if line.scrub =~ @split_regex
if last_line_number > 0
yield raw_message, first_line_number, last_line_number
yield raw_message, first_line_number, last_line_number, from_line
raw_message = +''
first_line_number = last_line_number + 1
end
from_line = line
else
raw_message << line
end
@ -173,7 +200,7 @@ module ImportScripts::Mbox
last_line_number += 1
end
yield raw_message, first_line_number, last_line_number if raw_message.present?
yield raw_message, first_line_number, last_line_number, from_line if raw_message.present?
end
def each_line(filename)
@ -202,7 +229,7 @@ module ImportScripts::Mbox
# Returns the receiver's subject with surrounding whitespace stripped and each
# run of tab characters replaced by a single space; returns nil when blank.
# `list_name` is accepted but not referenced in the visible body.
def extract_subject(receiver, list_name)
subject = receiver.subject
# NOTE(review): the value of the next line is discarded; presumably it is the
# pre-change line left over from a diff view whose +/- markers were stripped.
subject.blank? ? nil : subject.strip
# Tabs are collapsed so they do not end up in the resulting title.
subject.blank? ? nil : subject.strip.gsub(/\t+/, ' ')
end
def ignored_file?(path, checksums)

View File

@ -21,6 +21,8 @@ module ImportScripts::Mbox
attr_reader :automatically_remove_list_name_prefix
attr_reader :show_trimmed_content
attr_reader :tags
attr_reader :fix_mailman_via_addresses
attr_reader :elide_equals_in_addresses
def initialize(yaml)
@data_dir = yaml['data_dir']
@ -39,6 +41,8 @@ module ImportScripts::Mbox
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
@show_trimmed_content = yaml['show_trimmed_content']
@fix_mailman_via_addresses = yaml['fix_mailman_via_addresses']
@elide_equals_in_addresses = yaml['elide_equals_in_addresses']
@tags = []
if yaml['tags'].present?