mirror of
https://github.com/discourse/discourse.git
synced 2024-11-25 09:42:07 +08:00
DEV: mbox importer improvements (#16557)
* FIX: support specifying parent_category_id in mbox import metadata * FIX: elide tabs from topic titles * FIX: optionally fix Mailman from: addresses * DEV: optionally elide anything up to the last = in email addresses * Fix broken Mailman from: detection
This commit is contained in:
parent
0f772bdf5b
commit
3e5faffb0d
|
@ -458,7 +458,16 @@ class ImportScripts::Base
|
||||||
.where("LOWER(name) = ?", opts[:name].downcase.strip)
|
.where("LOWER(name) = ?", opts[:name].downcase.strip)
|
||||||
.first
|
.first
|
||||||
|
|
||||||
return existing if existing
|
if existing
|
||||||
|
if import_id && existing.custom_fields["import_id"] != import_id
|
||||||
|
existing.custom_fields["import_id"] = import_id
|
||||||
|
existing.save!
|
||||||
|
|
||||||
|
add_category(import_id, existing)
|
||||||
|
end
|
||||||
|
|
||||||
|
return existing
|
||||||
|
end
|
||||||
|
|
||||||
post_create_action = opts.delete(:post_create_action)
|
post_create_action = opts.delete(:post_create_action)
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,8 @@ module ImportScripts::Mbox
|
||||||
create_categories(rows) do |row|
|
create_categories(rows) do |row|
|
||||||
{
|
{
|
||||||
id: row['name'],
|
id: row['name'],
|
||||||
name: row['name']
|
name: row['name'],
|
||||||
|
parent_category_id: row['parent_category_id'].presence,
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -48,4 +48,18 @@ tags:
|
||||||
# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
|
# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
|
||||||
remove_subject_prefixes:
|
remove_subject_prefixes:
|
||||||
- "Re: "
|
- "Re: "
|
||||||
- "Fwd?: "
|
- "Fwd?: "
|
||||||
|
|
||||||
|
# For emails processed by mailman with this type of bogus header:
|
||||||
|
# From: Jane Smith via ListName <ListName@lists.example.com>
|
||||||
|
# attempt to get the actual sender's address from other available headers like Reply-To or the /^From / line
|
||||||
|
fix_mailman_via_addresses: false
|
||||||
|
|
||||||
|
# Elide everything up to the last occurrence of = in addresses (BATV, etc.)
|
||||||
|
# i.e. normalize these:
|
||||||
|
# prvs=00377ab9a2=someone@somewhere.com
|
||||||
|
# bounces+840901-dbd9-flang-dev=someone@somewhere.com
|
||||||
|
# blah=blih=bluh=someone@somewhere.com
|
||||||
|
# into:
|
||||||
|
# someone@somewhere.com
|
||||||
|
elide_equals_in_addresses: false
|
||||||
|
|
|
@ -30,8 +30,8 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
def insert_category(category)
|
def insert_category(category)
|
||||||
@db.execute(<<-SQL, category)
|
@db.execute(<<-SQL, category)
|
||||||
INSERT OR REPLACE INTO category (name, description)
|
INSERT OR REPLACE INTO category (name, description, parent_category_id)
|
||||||
VALUES (:name, :description)
|
VALUES (:name, :description, :parent_category_id)
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -150,7 +150,7 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
def fetch_categories
|
def fetch_categories
|
||||||
@db.execute <<-SQL
|
@db.execute <<-SQL
|
||||||
SELECT name, description
|
SELECT name, description, parent_category_id
|
||||||
FROM category
|
FROM category
|
||||||
ORDER BY name
|
ORDER BY name
|
||||||
SQL
|
SQL
|
||||||
|
@ -218,7 +218,8 @@ module ImportScripts::Mbox
|
||||||
@db.execute <<-SQL
|
@db.execute <<-SQL
|
||||||
CREATE TABLE IF NOT EXISTS category (
|
CREATE TABLE IF NOT EXISTS category (
|
||||||
name TEXT NOT NULL PRIMARY KEY,
|
name TEXT NOT NULL PRIMARY KEY,
|
||||||
description TEXT
|
description TEXT,
|
||||||
|
parent_category_id INTEGER
|
||||||
)
|
)
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
|
@ -55,7 +55,8 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
category = {
|
category = {
|
||||||
name: metadata['name'].presence || File.basename(directory),
|
name: metadata['name'].presence || File.basename(directory),
|
||||||
description: metadata['description']
|
description: metadata['description'],
|
||||||
|
parent_category_id: metadata['parent_category_id'].presence,
|
||||||
}
|
}
|
||||||
|
|
||||||
@database.insert_category(category)
|
@database.insert_category(category)
|
||||||
|
@ -67,7 +68,28 @@ module ImportScripts::Mbox
|
||||||
begin
|
begin
|
||||||
msg_id = receiver.message_id
|
msg_id = receiver.message_id
|
||||||
parsed_email = receiver.mail
|
parsed_email = receiver.mail
|
||||||
|
|
||||||
from_email, from_display_name = receiver.parse_from_field(parsed_email)
|
from_email, from_display_name = receiver.parse_from_field(parsed_email)
|
||||||
|
|
||||||
|
if @settings.fix_mailman_via_addresses
|
||||||
|
# Detect cases like this and attempt to get actual sender from other headers:
|
||||||
|
# From: Jane Smith via ListName <ListName@lists.example.com>
|
||||||
|
|
||||||
|
if receiver.mail['X-Mailman-Version'] && from_display_name =~ /\bvia \S+$/i
|
||||||
|
email_from_from_line = opts[:from_line].scan(/From (\S+)/).flatten.first
|
||||||
|
a = Mail::Address.new(email_from_from_line)
|
||||||
|
from_email = a.address
|
||||||
|
from_display_name = a.display_name
|
||||||
|
# if name is not available there, look for it in Reply-To
|
||||||
|
if from_display_name.nil?
|
||||||
|
reply_to = receiver.mail.to_s.scan(/[\n\r]Reply-To: ([^\r\n]+)/).flatten.first
|
||||||
|
from_display_name = Mail::Address.new(reply_to).display_name
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
from_email = from_email.sub(/^(.*)=/, '') if @settings.elide_equals_in_addresses
|
||||||
|
|
||||||
body, elided, format = receiver.select_body
|
body, elided, format = receiver.select_body
|
||||||
reply_message_ids = extract_reply_message_ids(parsed_email)
|
reply_message_ids = extract_reply_message_ids(parsed_email)
|
||||||
|
|
||||||
|
@ -125,11 +147,12 @@ module ImportScripts::Mbox
|
||||||
puts "indexing #{filename}"
|
puts "indexing #{filename}"
|
||||||
|
|
||||||
if @split_regex.present?
|
if @split_regex.present?
|
||||||
each_mail(filename) do |raw_message, first_line_number, last_line_number|
|
each_mail(filename) do |raw_message, first_line_number, last_line_number, from_line|
|
||||||
opts = {
|
opts = {
|
||||||
first_line_number: first_line_number,
|
first_line_number: first_line_number,
|
||||||
last_line_number: last_line_number,
|
last_line_number: last_line_number,
|
||||||
start_time: monotonic_time
|
start_time: monotonic_time,
|
||||||
|
from_line: from_line,
|
||||||
}
|
}
|
||||||
receiver = read_mail_from_string(raw_message)
|
receiver = read_mail_from_string(raw_message)
|
||||||
yield receiver, filename, opts if receiver.present?
|
yield receiver, filename, opts if receiver.present?
|
||||||
|
@ -159,13 +182,17 @@ module ImportScripts::Mbox
|
||||||
first_line_number = 1
|
first_line_number = 1
|
||||||
last_line_number = 0
|
last_line_number = 0
|
||||||
|
|
||||||
|
from_line = nil
|
||||||
|
|
||||||
each_line(filename) do |line|
|
each_line(filename) do |line|
|
||||||
if line.scrub =~ @split_regex
|
if line.scrub =~ @split_regex
|
||||||
if last_line_number > 0
|
if last_line_number > 0
|
||||||
yield raw_message, first_line_number, last_line_number
|
yield raw_message, first_line_number, last_line_number, from_line
|
||||||
raw_message = +''
|
raw_message = +''
|
||||||
first_line_number = last_line_number + 1
|
first_line_number = last_line_number + 1
|
||||||
end
|
end
|
||||||
|
|
||||||
|
from_line = line
|
||||||
else
|
else
|
||||||
raw_message << line
|
raw_message << line
|
||||||
end
|
end
|
||||||
|
@ -173,7 +200,7 @@ module ImportScripts::Mbox
|
||||||
last_line_number += 1
|
last_line_number += 1
|
||||||
end
|
end
|
||||||
|
|
||||||
yield raw_message, first_line_number, last_line_number if raw_message.present?
|
yield raw_message, first_line_number, last_line_number, from_line if raw_message.present?
|
||||||
end
|
end
|
||||||
|
|
||||||
def each_line(filename)
|
def each_line(filename)
|
||||||
|
@ -202,7 +229,7 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
def extract_subject(receiver, list_name)
|
def extract_subject(receiver, list_name)
|
||||||
subject = receiver.subject
|
subject = receiver.subject
|
||||||
subject.blank? ? nil : subject.strip
|
subject.blank? ? nil : subject.strip.gsub(/\t+/, ' ')
|
||||||
end
|
end
|
||||||
|
|
||||||
def ignored_file?(path, checksums)
|
def ignored_file?(path, checksums)
|
||||||
|
|
|
@ -21,6 +21,8 @@ module ImportScripts::Mbox
|
||||||
attr_reader :automatically_remove_list_name_prefix
|
attr_reader :automatically_remove_list_name_prefix
|
||||||
attr_reader :show_trimmed_content
|
attr_reader :show_trimmed_content
|
||||||
attr_reader :tags
|
attr_reader :tags
|
||||||
|
attr_reader :fix_mailman_via_addresses
|
||||||
|
attr_reader :elide_equals_in_addresses
|
||||||
|
|
||||||
def initialize(yaml)
|
def initialize(yaml)
|
||||||
@data_dir = yaml['data_dir']
|
@data_dir = yaml['data_dir']
|
||||||
|
@ -39,6 +41,8 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
|
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
|
||||||
@show_trimmed_content = yaml['show_trimmed_content']
|
@show_trimmed_content = yaml['show_trimmed_content']
|
||||||
|
@fix_mailman_via_addresses = yaml['fix_mailman_via_addresses']
|
||||||
|
@elide_equals_in_addresses = yaml['elide_equals_in_addresses']
|
||||||
|
|
||||||
@tags = []
|
@tags = []
|
||||||
if yaml['tags'].present?
|
if yaml['tags'].present?
|
||||||
|
|
Loading…
Reference in New Issue
Block a user