discourse/script/import_scripts/mbox/support/indexer.rb
Leonardo Mosquera 3e5faffb0d
DEV: mbox importer improvements (#16557)
* FIX: support specifying parent_category_id in mbox import metadata
* FIX: elide tabs from topic titles
* FIX: optionally fix Mailman from: addresses
* DEV: optionally elide anything up to the last = in email addresses
* Fix Mailman broken from: detection
2022-04-29 13:24:29 -03:00

262 lines
8.2 KiB
Ruby

# frozen_string_literal: true
require_relative 'database'
require 'json'
require 'yaml'
module ImportScripts::Mbox
class Indexer
# Creates an indexer that reads mail files and records them in the index database.
#
# @param database [ImportScripts::Mbox::Database] receives the categories, emails and replies
# @param settings [ImportScripts::Mbox::Settings] import configuration; its +split_regex+
#   is cached in an instance variable because it is consulted for every file and line
def initialize(database, settings)
  @database, @settings = database, settings
  @split_regex = @settings.split_regex
end
# Indexes every category directory directly below the configured data directory
# (in sorted order), then links replies and fills the users from the indexed emails.
#
# @return the result of +@database.fill_users_from_emails+
def execute
  entries = Dir.glob(File.join(@settings.data_dir, '*'))
  category_directories = entries.select { |path| File.directory?(path) }.sort

  category_directories.each do |directory|
    puts "indexing files in #{directory}"
    category = index_category(directory)
    index_emails(directory, category[:name])
  end

  puts '', 'indexing replies and users'

  # The order of the two database calls differs between the grouping modes.
  if @settings.group_messages_by_subject
    @database.sort_emails_by_subject
    @database.update_in_reply_to_by_email_subject
  else
    @database.update_in_reply_to_of_emails
    @database.sort_emails_by_date_and_reply_level
  end

  @database.fill_users_from_emails
end
# Everything below is internal to the indexer. Note that `private` only affects
# the methods that follow; the constants remain reachable from outside the class.
private

# Name of the optional per-directory file that describes the category.
METADATA_FILENAME = 'metadata.yml'

# Extensions of files inside a category directory that are never indexed as mail.
# Frozen so that the shared list cannot be modified by accident at runtime.
IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects', '.yml'].freeze
# Reads the optional metadata file of a category directory and stores the category.
#
# Without a metadata file (or when the file is empty) the category is named
# after the directory and has neither a description nor a parent category.
#
# @param directory [String] path of the directory that holds the category's files
# @return [Hash] the inserted category with the keys :name, :description and :parent_category_id
def index_category(directory)
  metadata_file = File.join(directory, METADATA_FILENAME)

  metadata =
    if File.exist?(metadata_file)
      # workaround for YML files that contain classname in file header
      yaml = File.read(metadata_file).sub(/^--- !.*$/, '---')
      # An empty or comment-only document parses to nil/false instead of a Hash.
      YAML.safe_load(yaml) || {}
    else
      {}
    end

  category = {
    name: metadata['name'].presence || File.basename(directory),
    description: metadata['description'],
    parent_category_id: metadata['parent_category_id'].presence,
  }

  @database.insert_category(category)
  category
end
# Indexes every message found in a category directory.
#
# Each message is parsed, optionally gets its sender corrected (settings
# +fix_mailman_via_addresses+ and +elide_equals_in_addresses+) and is stored
# together with the message IDs it replies to. A message that fails to index is
# reported on STDERR and skipped, so the remaining messages are still processed.
#
# @param directory [String] path of the category directory
# @param category_name [String] name of the category the messages belong to
def index_emails(directory, category_name)
  all_messages(directory, category_name) do |receiver, filename, opts|
    begin
      msg_id = receiver.message_id
      parsed_email = receiver.mail
      from_email, from_display_name = receiver.parse_from_field(parsed_email)

      if @settings.fix_mailman_via_addresses
        from_email, from_display_name =
          resolve_mailman_sender(receiver, opts[:from_line], from_email, from_display_name)
      end

      # NOTE(review): parse_from_field presumably yields nil when there is no usable
      # From header; the safe navigation keeps such a message from being dropped here.
      from_email = from_email&.sub(/^(.*)=/, '') if @settings.elide_equals_in_addresses

      body, elided, format = receiver.select_body
      reply_message_ids = extract_reply_message_ids(parsed_email)

      email = {
        msg_id: msg_id,
        from_email: from_email,
        from_name: from_display_name,
        subject: extract_subject(receiver, category_name),
        email_date: timestamp(parsed_email.date),
        raw_message: receiver.raw_email,
        body: body,
        elided: elided,
        format: format,
        attachment_count: receiver.attachments.count,
        charset: parsed_email.charset&.downcase,
        category: category_name,
        filename: File.basename(filename),
        first_line_number: opts[:first_line_number],
        last_line_number: opts[:last_line_number],
        index_duration: (monotonic_time - opts[:start_time]).round(4)
      }

      # The email and its reply links are written in one transaction.
      @database.transaction do |db|
        db.insert_email(email)
        db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
      end
    rescue StandardError => e
      if opts[:first_line_number] && opts[:last_line_number]
        STDERR.puts "Failed to index message in #{filename} at lines #{opts[:first_line_number]}-#{opts[:last_line_number]}"
      else
        STDERR.puts "Failed to index message in #{filename}"
      end

      STDERR.puts e.message
      STDERR.puts e.backtrace.inspect
    end
  end
end

# Replaces a Mailman "via" sender with the actual sender where that is possible.
#
# Detects cases like this and attempts to get the actual sender from other headers:
#   From: Jane Smith via ListName <ListName@lists.example.com>
#
# @param receiver [Email::Receiver] receiver of the message being indexed
# @param from_line [String, nil] separator line that preceded the message in the mbox
#   file; nil when the file was not split into messages
# @param from_email [String, nil] address parsed from the From header
# @param from_display_name [String, nil] display name parsed from the From header
# @return [Array] email address and display name; the given values are returned
#   unchanged when the message needs no correction or no separator line address exists
def resolve_mailman_sender(receiver, from_line, from_email, from_display_name)
  unchanged = [from_email, from_display_name]
  return unchanged unless receiver.mail['X-Mailman-Version'] && from_display_name =~ /\bvia \S+$/i

  # Without an address on the separator line there is nothing better to use.
  separator_address = from_line && from_line[/From (\S+)/, 1]
  return unchanged if separator_address.nil?

  address = Mail::Address.new(separator_address)
  display_name = address.display_name

  # if name is not available there, look for it in Reply-To
  if display_name.nil?
    reply_to = receiver.mail.to_s.scan(/[\n\r]Reply-To: ([^\r\n]+)/).flatten.first
    display_name = Mail::Address.new(reply_to).display_name
  end

  [address.address, display_name]
end
# Looks up the checksums recorded for the already imported files of a category.
#
# @param category_name [String] name of the category
# @return [Hash] recorded checksum of each imported file, keyed by the file's base name
def imported_file_checksums(category_name)
  checksum_by_name = {}

  @database.fetch_imported_files(category_name).each do |row|
    checksum_by_name[File.basename(row['filename'])] = row['checksum']
  end

  checksum_by_name
end
# Walks the files of a category directory and yields every message found in them.
#
# Files reported by +ignored_file?+ and nested directories are skipped. When a
# split regex is configured each file is treated as a collection of messages,
# otherwise as a single message. After a file has been processed it is recorded
# as fully indexed.
#
# @param directory [String] path of the category directory
# @param category_name [String] name of the category
# @yieldparam receiver [Email::Receiver] receiver created from the raw message
# @yieldparam filename [String] path of the file that contains the message
# @yieldparam opts [Hash] always contains :start_time; for split files also
#   :first_line_number, :last_line_number and :from_line
def all_messages(directory, category_name)
  checksums = imported_file_checksums(category_name)

  Dir.foreach(directory) do |entry|
    filename = File.join(directory, entry)

    # A nested directory cannot be read as a mail file; skip it instead of
    # aborting the whole import with Errno::EISDIR.
    next if File.directory?(filename) || ignored_file?(filename, checksums)

    puts "indexing #{filename}"

    if @split_regex.present?
      each_mail(filename) do |raw_message, first_line_number, last_line_number, from_line|
        opts = {
          first_line_number: first_line_number,
          last_line_number: last_line_number,
          start_time: monotonic_time,
          from_line: from_line,
        }
        receiver = read_mail_from_string(raw_message)
        yield receiver, filename, opts if receiver.present?
      end
    else
      opts = { start_time: monotonic_time }
      receiver = read_mail_from_file(filename)
      yield receiver, filename, opts if receiver.present?
    end

    mark_as_fully_indexed(category_name, filename)
  end
end
# Records a file together with its current checksum as imported for a category.
#
# @param category_name [String] name of the category the file belongs to
# @param filename [String] path of the file; only its base name is stored
# @return the result of +@database.insert_imported_file+
def mark_as_fully_indexed(category_name, filename)
  basename = File.basename(filename)
  checksum = calc_checksum(filename)
  record = { category: category_name, filename: basename, checksum: checksum }

  @database.insert_imported_file(record)
end
# Splits a file into raw messages at every line that matches the split regex.
#
# The matching separator line is not part of the raw message; it is handed to
# the block separately. Text that precedes the first separator line is yielded
# with a nil separator line. A trailing chunk is only yielded when it is present.
#
# @param filename [String] path of the file to split
# @yieldparam raw_message [String] lines of the message without the separator line
# @yieldparam first_line_number [Integer] 1 for the first chunk, otherwise the
#   line number of the separator line that started the message
# @yieldparam last_line_number [Integer] line number of the message's last line
# @yieldparam from_line [String, nil] separator line that started the message
def each_mail(filename)
  buffer = +''
  start_line = 1
  lines_seen = 0
  separator = nil

  each_line(filename) do |line|
    unless line.scrub =~ @split_regex
      buffer << line
      lines_seen += 1
      next
    end

    # A separator line ends the message collected so far (if any line was read).
    if lines_seen > 0
      yield buffer, start_line, lines_seen, separator
      buffer = +''
      start_line = lines_seen + 1
    end

    separator = line
    lines_seen += 1
  end

  yield buffer, start_line, lines_seen, separator if buffer.present?
end
# Yields every line of a file, transparently decompressing files ending in '.gz'.
#
# Both the file and, for compressed files, the gzip reader are closed when the
# iteration ends, including when the block raises or breaks out early.
#
# @param filename [String] path of the plain or gzip-compressed file
# @yieldparam line [String] the next line including its line terminator
def each_line(filename)
  File.open(filename, 'r') do |raw_file|
    if filename.end_with?('.gz')
      # `wrap` closes the reader (and with it the underlying file) after the block,
      # so the zlib stream is released deterministically instead of by the GC.
      Zlib::GzipReader.wrap(raw_file) do |gzip|
        gzip.each_line { |line| yield line }
      end
    else
      raw_file.each_line { |line| yield line }
    end
  end
end
# Treats the whole content of a file as one raw message.
#
# @param filename [String] path of the file to read
# @return whatever +read_mail_from_string+ returns for the file's content
def read_mail_from_file(filename)
  read_mail_from_string(File.read(filename))
end
# Wraps a raw message in an Email::Receiver.
#
# @param raw_message [String, nil] complete raw message
# @return [Email::Receiver, nil] nil when the raw message is blank
def read_mail_from_string(raw_message)
  return nil if raw_message.blank?

  Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false)
end
# Asks Email::Receiver for the message IDs that the given mail replies to.
#
# @param mail [Mail::Message] parsed email
# @return the result of Email::Receiver.extract_reply_message_ids, requested with
#   max_message_id_count: 20 (presumably the upper bound on returned IDs)
def extract_reply_message_ids(mail)
  options = { max_message_id_count: 20 }
  Email::Receiver.extract_reply_message_ids(mail, **options)
end
# Returns the cleaned-up subject of a message.
#
# Surrounding whitespace is removed and each run of tabs becomes a single space.
#
# @param receiver [Email::Receiver] receiver of the message
# @param list_name [String] currently unused
# @return [String, nil] nil when the subject is blank
def extract_subject(receiver, list_name)
  subject = receiver.subject
  return nil if subject.blank?

  subject.strip.gsub(/\t+/, ' ')
end
# Decides whether a file in a category directory must not be indexed.
#
# A file is ignored when it is hidden, is the metadata file, has one of the
# ignored extensions or was already fully indexed with an unchanged checksum.
#
# @param path [String] path of the file
# @param checksums [Hash] checksums of already imported files keyed by base name
# @return [Boolean]
def ignored_file?(path, checksums)
  filename = File.basename(path)

  return true if filename.start_with?('.')
  return true if filename == METADATA_FILENAME
  return true if IGNORED_FILE_EXTENSIONS.include?(File.extname(filename))

  fully_indexed?(path, filename, checksums)
end
# Checks whether a file was imported before and has not changed since.
#
# The file's checksum is only calculated when a checksum was recorded for it.
#
# @param path [String] path of the file
# @param filename [String] base name of the file, used as key into +checksums+
# @param checksums [Hash] recorded checksums keyed by base name
# @return [Boolean]
def fully_indexed?(path, filename, checksums)
  recorded = checksums[filename]
  return false unless recorded.present?

  calc_checksum(path) == recorded
end
# Calculates the SHA-256 checksum of a file's content.
#
# @param filename [String] path of the file
# @return [String] checksum as hexadecimal string
def calc_checksum(filename)
  digest = Digest::SHA256.new
  digest.file(filename).hexdigest
end
# Reads the monotonic clock, which is used here to measure how long indexing takes.
#
# @return [Float] clock value in seconds
def monotonic_time
  Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end
# Converts a date into whole seconds since the Unix epoch.
#
# @param datetime [Object, nil] value accepted by +Time.zone.at+
# @return [Integer, nil] nil when no date was given
def timestamp(datetime)
  return unless datetime

  Time.zone.at(datetime).to_i
end
end
end