mirror of
https://github.com/discourse/discourse.git
synced 2024-11-28 01:25:06 +08:00
Merge pull request #4889 from gschlager/mbox-refactor
Add new (experimental) mbox importer
This commit is contained in:
commit
d9e5969a5c
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -42,6 +42,7 @@ config/discourse.conf
|
|||
/tmp
|
||||
/logfile
|
||||
log/
|
||||
bootsnap-load-path-cache
|
||||
|
||||
# Ignore plugins except for the bundled ones.
|
||||
/plugins/*
|
||||
|
|
14
.rubocop.yml
Normal file
14
.rubocop.yml
Normal file
|
@ -0,0 +1,14 @@
|
|||
AllCops:
|
||||
TargetRubyVersion: 2.3
|
||||
|
||||
Metrics/LineLength:
|
||||
Max: 120
|
||||
|
||||
Metrics/MethodLength:
|
||||
Enabled: false
|
||||
|
||||
Style/Documentation:
|
||||
Enabled: false
|
||||
|
||||
Style/FrozenStringLiteralComment:
|
||||
Enabled: False
|
1
Gemfile
1
Gemfile
|
@ -190,4 +190,5 @@ gem 'sassc', require: false
|
|||
if ENV["IMPORT"] == "1"
|
||||
gem 'mysql2'
|
||||
gem 'redcarpet'
|
||||
gem 'sqlite3', '~> 1.3.13'
|
||||
end
|
||||
|
|
|
@ -28,6 +28,9 @@ module Email
|
|||
class InvalidPostAction < ProcessingError; end
|
||||
|
||||
attr_reader :incoming_email
|
||||
attr_reader :raw_email
|
||||
attr_reader :mail
|
||||
attr_reader :message_id
|
||||
|
||||
def initialize(mail_string)
|
||||
raise EmptyEmailError if mail_string.blank?
|
||||
|
@ -241,7 +244,7 @@ module Email
|
|||
|
||||
def try_to_encode(string, encoding)
|
||||
encoded = string.encode("UTF-8", encoding)
|
||||
encoded.present? && encoded.valid_encoding? ? encoded : nil
|
||||
!encoded.nil? && encoded.valid_encoding? ? encoded : nil
|
||||
rescue Encoding::InvalidByteSequenceError,
|
||||
Encoding::UndefinedConversionError,
|
||||
Encoding::ConverterNotFoundError
|
||||
|
@ -622,10 +625,7 @@ module Email
|
|||
|
||||
# only add elided part in messages
|
||||
if options[:elided].present? && (SiteSetting.always_show_trimmed_content || is_private_message)
|
||||
options[:raw] << "\n\n" << "<details class='elided'>" << "\n"
|
||||
options[:raw] << "<summary title='#{I18n.t('emails.incoming.show_trimmed_content')}'>···</summary>" << "\n"
|
||||
options[:raw] << options[:elided] << "\n"
|
||||
options[:raw] << "</details>" << "\n"
|
||||
options[:raw] << Email::Receiver.elided_html(options[:elided])
|
||||
end
|
||||
|
||||
user = options.delete(:user)
|
||||
|
@ -643,6 +643,14 @@ module Email
|
|||
result.post
|
||||
end
|
||||
|
||||
def self.elided_html(elided)
|
||||
html = "\n\n" << "<details class='elided'>" << "\n"
|
||||
html << "<summary title='#{I18n.t('emails.incoming.show_trimmed_content')}'>···</summary>" << "\n"
|
||||
html << elided << "\n"
|
||||
html << "</details>" << "\n"
|
||||
html
|
||||
end
|
||||
|
||||
def add_other_addresses(topic, sender)
|
||||
%i(to cc bcc).each do |d|
|
||||
if @mail[d] && @mail[d].address_list && @mail[d].address_list.addresses
|
||||
|
|
17
script/import_scripts/mbox-experimental.rb
Normal file
17
script/import_scripts/mbox-experimental.rb
Normal file
|
@ -0,0 +1,17 @@
|
|||
if ARGV.length != 1 || !File.exists?(ARGV[0])
|
||||
STDERR.puts '', 'Usage of mbox importer:', 'bundle exec ruby mbox-experimental.rb <path/to/settings.yml>'
|
||||
STDERR.puts '', "Use the settings file from #{File.expand_path('mbox/settings.yml', File.dirname(__FILE__))} as an example."
|
||||
exit 1
|
||||
end
|
||||
|
||||
module ImportScripts
|
||||
module Mbox
|
||||
require_relative 'mbox/support/settings'
|
||||
|
||||
@settings = Settings.load(ARGV[0])
|
||||
|
||||
require_relative 'mbox/importer'
|
||||
Importer.new(@settings).perform
|
||||
end
|
||||
end
|
||||
|
161
script/import_scripts/mbox/importer.rb
Normal file
161
script/import_scripts/mbox/importer.rb
Normal file
|
@ -0,0 +1,161 @@
|
|||
require_relative '../base'
|
||||
require_relative 'support/database'
|
||||
require_relative 'support/indexer'
|
||||
require_relative 'support/settings'
|
||||
|
||||
module ImportScripts::Mbox
|
||||
class Importer < ImportScripts::Base
|
||||
# @param settings [ImportScripts::Mbox::Settings]
|
||||
def initialize(settings)
|
||||
@settings = settings
|
||||
super()
|
||||
|
||||
@database = Database.new(@settings.data_dir, @settings.batch_size)
|
||||
end
|
||||
|
||||
def change_site_settings
|
||||
super
|
||||
|
||||
SiteSetting.enable_staged_users = true
|
||||
end
|
||||
|
||||
protected
|
||||
|
||||
def execute
|
||||
index_messages
|
||||
import_categories
|
||||
import_users
|
||||
import_posts
|
||||
end
|
||||
|
||||
def index_messages
|
||||
puts '', 'creating index'
|
||||
indexer = Indexer.new(@database, @settings)
|
||||
indexer.execute
|
||||
end
|
||||
|
||||
def import_categories
|
||||
puts '', 'creating categories'
|
||||
rows = @database.fetch_categories
|
||||
|
||||
create_categories(rows) do |row|
|
||||
{
|
||||
id: row['name'],
|
||||
name: row['name']
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
def import_users
|
||||
puts '', 'creating users'
|
||||
total_count = @database.count_users
|
||||
last_email = ''
|
||||
|
||||
batches do |offset|
|
||||
rows, last_email = @database.fetch_users(last_email)
|
||||
break if rows.empty?
|
||||
|
||||
next if all_records_exist?(:users, rows.map { |row| row['email'] })
|
||||
|
||||
create_users(rows, total: total_count, offset: offset) do |row|
|
||||
{
|
||||
id: row['email'],
|
||||
email: row['email'],
|
||||
name: row['name'],
|
||||
trust_level: @settings.trust_level,
|
||||
staged: true,
|
||||
created_at: to_time(row['date_of_first_message'])
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def batches
|
||||
super(@settings.batch_size)
|
||||
end
|
||||
|
||||
def import_posts
|
||||
puts '', 'creating topics and posts'
|
||||
total_count = @database.count_messages
|
||||
last_row_id = 0
|
||||
|
||||
batches do |offset|
|
||||
rows, last_row_id = @database.fetch_messages(last_row_id)
|
||||
break if rows.empty?
|
||||
|
||||
next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] })
|
||||
|
||||
create_posts(rows, total: total_count, offset: offset) do |row|
|
||||
if row['in_reply_to'].blank?
|
||||
map_first_post(row)
|
||||
else
|
||||
map_reply(row)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def map_post(row)
|
||||
user_id = user_id_from_imported_user_id(row['from_email']) || Discourse::SYSTEM_USER_ID
|
||||
body = row['body'] || ''
|
||||
body << map_attachments(row['raw_message'], user_id) if row['attachment_count'].positive?
|
||||
body << Email::Receiver.elided_html(row['elided']) if row['elided'].present?
|
||||
|
||||
{
|
||||
id: row['msg_id'],
|
||||
user_id: user_id,
|
||||
created_at: to_time(row['email_date']),
|
||||
raw: body,
|
||||
raw_email: row['raw_message'],
|
||||
via_email: true,
|
||||
# cook_method: Post.cook_methods[:email] # this is slowing down the import by factor 4
|
||||
}
|
||||
end
|
||||
|
||||
def map_first_post(row)
|
||||
mapped = map_post(row)
|
||||
mapped[:category] = category_id_from_imported_category_id(row['category'])
|
||||
mapped[:title] = row['subject'].strip[0...255]
|
||||
mapped
|
||||
end
|
||||
|
||||
def map_reply(row)
|
||||
parent = @lookup.topic_lookup_from_imported_post_id(row['in_reply_to'])
|
||||
|
||||
if parent.blank?
|
||||
puts "Parent message #{row['in_reply_to']} doesn't exist. Skipping #{row['msg_id']}: #{row['subject'][0..40]}"
|
||||
return nil
|
||||
end
|
||||
|
||||
mapped = map_post(row)
|
||||
mapped[:topic_id] = parent[:topic_id]
|
||||
mapped
|
||||
end
|
||||
|
||||
def map_attachments(raw_message, user_id)
|
||||
receiver = Email::Receiver.new(raw_message)
|
||||
attachment_markdown = ''
|
||||
|
||||
receiver.attachments.each do |attachment|
|
||||
tmp = Tempfile.new(['discourse-email-attachment', File.extname(attachment.filename)])
|
||||
|
||||
begin
|
||||
File.open(tmp.path, 'w+b') { |f| f.write attachment.body.decoded }
|
||||
upload = UploadCreator.new(tmp, attachment.filename).create_for(user_id)
|
||||
|
||||
if upload && upload.errors.empty?
|
||||
attachment_markdown << "\n\n#{receiver.attachment_markdown(upload)}\n\n"
|
||||
end
|
||||
ensure
|
||||
tmp.try(:close!)
|
||||
end
|
||||
end
|
||||
|
||||
attachment_markdown
|
||||
end
|
||||
|
||||
def to_time(datetime)
|
||||
Time.zone.at(DateTime.iso8601(datetime)) if datetime
|
||||
end
|
||||
end
|
||||
end
|
9
script/import_scripts/mbox/settings.yml
Normal file
9
script/import_scripts/mbox/settings.yml
Normal file
|
@ -0,0 +1,9 @@
|
|||
# PostgreSQL mailing lists
|
||||
#data_dir: /data/import/postgres
|
||||
#split_regex: "^From .*@postgresql.org.*"
|
||||
|
||||
# ruby-talk mailing list
|
||||
data_dir: /data/import/ruby-talk/news/gmane/comp/lang/ruby
|
||||
split_regex: ""
|
||||
|
||||
default_trust_level: 1
|
261
script/import_scripts/mbox/support/database.rb
Normal file
261
script/import_scripts/mbox/support/database.rb
Normal file
|
@ -0,0 +1,261 @@
|
|||
require 'sqlite3'
|
||||
|
||||
module ImportScripts::Mbox
|
||||
class Database
|
||||
SCHEMA_VERSION = 1
|
||||
|
||||
def initialize(directory, batch_size)
|
||||
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
|
||||
@batch_size = batch_size
|
||||
|
||||
configure_database
|
||||
upgrade_schema_version
|
||||
create_table_for_categories
|
||||
create_table_for_imported_files
|
||||
create_table_for_emails
|
||||
create_table_for_replies
|
||||
create_table_for_users
|
||||
end
|
||||
|
||||
def insert_category(category)
|
||||
@db.execute(<<-SQL, category)
|
||||
INSERT OR REPLACE INTO category (name, description)
|
||||
VALUES (:name, :description)
|
||||
SQL
|
||||
end
|
||||
|
||||
def insert_imported_file(imported_file)
|
||||
@db.execute(<<-SQL, imported_file)
|
||||
INSERT OR REPLACE INTO imported_file (category, filename, checksum)
|
||||
VALUES (:category, :filename, :checksum)
|
||||
SQL
|
||||
end
|
||||
|
||||
def insert_email(email)
|
||||
@db.execute(<<-SQL, email)
|
||||
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
|
||||
email_date, raw_message, body, elided, attachment_count, charset,
|
||||
category, filename, first_line_number, last_line_number)
|
||||
VALUES (:msg_id, :from_email, :from_name, :subject,
|
||||
:email_date, :raw_message, :body, :elided, :attachment_count, :charset,
|
||||
:category, :filename, :first_line_number, :last_line_number)
|
||||
SQL
|
||||
end
|
||||
|
||||
def insert_replies(msg_id, reply_message_ids)
|
||||
sql = <<-SQL
|
||||
INSERT OR REPLACE INTO reply (msg_id, in_reply_to)
|
||||
VALUES (:msg_id, :in_reply_to)
|
||||
SQL
|
||||
|
||||
@db.prepare(sql) do |stmt|
|
||||
reply_message_ids.each do |in_reply_to|
|
||||
stmt.execute(msg_id, in_reply_to)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def update_in_reply_to_of_emails
|
||||
@db.execute <<-SQL
|
||||
UPDATE email
|
||||
SET in_reply_to = (
|
||||
SELECT e.msg_id
|
||||
FROM reply r
|
||||
JOIN email e ON (r.in_reply_to = e.msg_id)
|
||||
WHERE r.msg_id = email.msg_id
|
||||
ORDER BY e.email_date DESC
|
||||
LIMIT 1
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def sort_emails
|
||||
@db.execute 'DELETE FROM email_order'
|
||||
|
||||
@db.execute <<-SQL
|
||||
WITH RECURSIVE
|
||||
messages(msg_id, level, email_date) AS (
|
||||
SELECT msg_id, 0 AS level, email_date
|
||||
FROM email
|
||||
WHERE in_reply_to IS NULL
|
||||
UNION ALL
|
||||
SELECT e.msg_id, m.level + 1, e.email_date
|
||||
FROM email e
|
||||
JOIN messages m ON e.in_reply_to = m.msg_id
|
||||
ORDER BY level, email_date, msg_id
|
||||
)
|
||||
INSERT INTO email_order (msg_id)
|
||||
SELECT msg_id
|
||||
FROM messages
|
||||
SQL
|
||||
end
|
||||
|
||||
def fill_users_from_emails
|
||||
@db.execute 'DELETE FROM user'
|
||||
|
||||
@db.execute <<-SQL
|
||||
INSERT INTO user (email, name, date_of_first_message)
|
||||
SELECT from_email, MIN(from_name) AS from_name, MIN(email_date)
|
||||
FROM email
|
||||
WHERE from_email IS NOT NULL
|
||||
GROUP BY from_email
|
||||
ORDER BY from_email
|
||||
SQL
|
||||
end
|
||||
|
||||
def fetch_imported_files(category)
|
||||
@db.execute(<<-SQL, category)
|
||||
SELECT filename, checksum
|
||||
FROM imported_file
|
||||
WHERE category = :category
|
||||
SQL
|
||||
end
|
||||
|
||||
def fetch_categories
|
||||
@db.execute <<-SQL
|
||||
SELECT name, description
|
||||
FROM category
|
||||
ORDER BY name
|
||||
SQL
|
||||
end
|
||||
|
||||
def count_users
|
||||
@db.get_first_value <<-SQL
|
||||
SELECT COUNT(*)
|
||||
FROM user
|
||||
SQL
|
||||
end
|
||||
|
||||
def fetch_users(last_email)
|
||||
rows = @db.execute(<<-SQL, last_email)
|
||||
SELECT email, name, date_of_first_message
|
||||
FROM user
|
||||
WHERE email > :last_email
|
||||
LIMIT #{@batch_size}
|
||||
SQL
|
||||
|
||||
add_last_column_value(rows, 'email')
|
||||
end
|
||||
|
||||
def count_messages
|
||||
@db.get_first_value <<-SQL
|
||||
SELECT COUNT(*)
|
||||
FROM email
|
||||
WHERE email_date IS NOT NULL
|
||||
SQL
|
||||
end
|
||||
|
||||
def fetch_messages(last_row_id)
|
||||
rows = @db.execute(<<-SQL, last_row_id)
|
||||
SELECT o.ROWID, e.msg_id, from_email, subject, email_date, in_reply_to,
|
||||
raw_message, body, elided, attachment_count, category
|
||||
FROM email e
|
||||
JOIN email_order o USING (msg_id)
|
||||
WHERE email_date IS NOT NULL AND
|
||||
o.ROWID > :last_row_id
|
||||
ORDER BY o.ROWID
|
||||
LIMIT #{@batch_size}
|
||||
SQL
|
||||
|
||||
add_last_column_value(rows, 'rowid')
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def configure_database
|
||||
@db.execute 'PRAGMA journal_mode = TRUNCATE'
|
||||
end
|
||||
|
||||
def upgrade_schema_version
|
||||
# current_version = query("PRAGMA user_version").last[0]
|
||||
@db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
|
||||
end
|
||||
|
||||
def create_table_for_categories
|
||||
@db.execute <<-SQL
|
||||
CREATE TABLE IF NOT EXISTS category (
|
||||
name TEXT NOT NULL PRIMARY KEY,
|
||||
description TEXT
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def create_table_for_imported_files
|
||||
@db.execute <<-SQL
|
||||
CREATE TABLE IF NOT EXISTS imported_file (
|
||||
category TEXT NOT NULL,
|
||||
filename TEXT NOT NULL,
|
||||
checksum TEXT NOT NULL,
|
||||
PRIMARY KEY (category, filename),
|
||||
FOREIGN KEY(category) REFERENCES category(name)
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def create_table_for_emails
|
||||
@db.execute <<-SQL
|
||||
CREATE TABLE IF NOT EXISTS email (
|
||||
msg_id TEXT NOT NULL PRIMARY KEY,
|
||||
from_email TEXT,
|
||||
from_name TEXT,
|
||||
subject TEXT,
|
||||
in_reply_to TEXT,
|
||||
email_date DATETIME,
|
||||
raw_message TEXT,
|
||||
body TEXT,
|
||||
elided TEXT,
|
||||
attachment_count INTEGER NOT NULL DEFAULT 0,
|
||||
charset TEXT,
|
||||
category TEXT NOT NULL,
|
||||
filename TEXT NOT NULL,
|
||||
first_line_number INTEGER,
|
||||
last_line_number INTEGER,
|
||||
FOREIGN KEY(category) REFERENCES category(name)
|
||||
)
|
||||
SQL
|
||||
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
|
||||
|
||||
@db.execute <<-SQL
|
||||
CREATE TABLE IF NOT EXISTS email_order (
|
||||
msg_id TEXT NOT NULL PRIMARY KEY
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def create_table_for_replies
|
||||
@db.execute <<-SQL
|
||||
CREATE TABLE IF NOT EXISTS reply (
|
||||
msg_id TEXT NOT NULL,
|
||||
in_reply_to TEXT NOT NULL,
|
||||
PRIMARY KEY (msg_id, in_reply_to),
|
||||
FOREIGN KEY(msg_id) REFERENCES email(msg_id)
|
||||
)
|
||||
SQL
|
||||
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS reply_by_in_reply_to ON reply (in_reply_to)'
|
||||
end
|
||||
|
||||
def create_table_for_users
|
||||
@db.execute <<-SQL
|
||||
CREATE TABLE IF NOT EXISTS user (
|
||||
email TEXT NOT NULL PRIMARY KEY,
|
||||
name TEXT,
|
||||
date_of_first_message DATETIME NOT NULL
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def add_last_column_value(rows, *last_columns)
|
||||
return rows if last_columns.empty?
|
||||
|
||||
result = [rows]
|
||||
last_row = rows.last
|
||||
|
||||
last_columns.each { |column| result.push(last_row ? last_row[column] : nil) }
|
||||
result
|
||||
end
|
||||
end
|
||||
end
|
226
script/import_scripts/mbox/support/indexer.rb
Normal file
226
script/import_scripts/mbox/support/indexer.rb
Normal file
|
@ -0,0 +1,226 @@
|
|||
require_relative 'database'
|
||||
require 'json'
|
||||
require 'yaml'
|
||||
|
||||
module ImportScripts::Mbox
|
||||
class Indexer
|
||||
# @param database [ImportScripts::Mbox::Database]
|
||||
# @param settings [ImportScripts::Mbox::Settings]
|
||||
def initialize(database, settings)
|
||||
@database = database
|
||||
@root_directory = settings.data_dir
|
||||
@split_regex = settings.split_regex
|
||||
end
|
||||
|
||||
def execute
|
||||
directories = Dir.glob(File.join(@root_directory, '*'))
|
||||
directories.select! { |f| File.directory?(f) }
|
||||
directories.sort!
|
||||
|
||||
directories.each do |directory|
|
||||
puts "indexing files in #{directory}"
|
||||
category = index_category(directory)
|
||||
index_emails(directory, category[:name])
|
||||
end
|
||||
|
||||
puts '', 'indexing replies and users'
|
||||
@database.update_in_reply_to_of_emails
|
||||
@database.sort_emails
|
||||
@database.fill_users_from_emails
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
METADATA_FILENAME = 'metadata.yml'.freeze
|
||||
|
||||
def index_category(directory)
|
||||
metadata_file = File.join(directory, METADATA_FILENAME)
|
||||
|
||||
if File.exist?(metadata_file)
|
||||
# workaround for YML files that contain classname in file header
|
||||
yaml = File.read(metadata_file).sub(/^--- !.*$/, '---')
|
||||
metadata = YAML.load(yaml)
|
||||
else
|
||||
metadata = {}
|
||||
end
|
||||
|
||||
category = {
|
||||
name: metadata['name'].presence || File.basename(directory),
|
||||
description: metadata['description']
|
||||
}
|
||||
|
||||
@database.insert_category(category)
|
||||
category
|
||||
end
|
||||
|
||||
def index_emails(directory, category_name)
|
||||
all_messages(directory, category_name) do |receiver, filename, first_line_number, last_line_number|
|
||||
msg_id = receiver.message_id
|
||||
parsed_email = receiver.mail
|
||||
from_email, from_display_name = receiver.parse_from_field(parsed_email)
|
||||
body, elided = receiver.select_body
|
||||
reply_message_ids = extract_reply_message_ids(parsed_email)
|
||||
|
||||
email = {
|
||||
msg_id: msg_id,
|
||||
from_email: from_email,
|
||||
from_name: from_display_name,
|
||||
subject: extract_subject(receiver, category_name),
|
||||
email_date: parsed_email.date&.to_s,
|
||||
raw_message: receiver.raw_email,
|
||||
body: body,
|
||||
elided: elided,
|
||||
attachment_count: receiver.attachments.count,
|
||||
charset: parsed_email.charset&.downcase,
|
||||
category: category_name,
|
||||
filename: File.basename(filename),
|
||||
first_line_number: first_line_number,
|
||||
last_line_number: last_line_number
|
||||
}
|
||||
|
||||
@database.insert_email(email)
|
||||
@database.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
|
||||
end
|
||||
end
|
||||
|
||||
def imported_file_checksums(category_name)
|
||||
rows = @database.fetch_imported_files(category_name)
|
||||
rows.each_with_object({}) do |row, hash|
|
||||
hash[row['filename']] = row['checksum']
|
||||
end
|
||||
end
|
||||
|
||||
def all_messages(directory, category_name)
|
||||
checksums = imported_file_checksums(category_name)
|
||||
|
||||
Dir.foreach(directory) do |filename|
|
||||
filename = File.join(directory, filename)
|
||||
next if ignored_file?(filename, checksums)
|
||||
|
||||
puts "indexing #{filename}"
|
||||
|
||||
if @split_regex.present?
|
||||
each_mail(filename) do |raw_message, first_line_number, last_line_number|
|
||||
yield read_mail_from_string(raw_message), filename, first_line_number, last_line_number
|
||||
end
|
||||
else
|
||||
yield read_mail_from_file(filename), filename
|
||||
end
|
||||
|
||||
mark_as_fully_indexed(category_name, filename)
|
||||
end
|
||||
end
|
||||
|
||||
def mark_as_fully_indexed(category_name, filename)
|
||||
imported_file = {
|
||||
category: category_name,
|
||||
filename: filename,
|
||||
checksum: calc_checksum(filename)
|
||||
}
|
||||
|
||||
@database.insert_imported_file(imported_file)
|
||||
end
|
||||
|
||||
def each_mail(filename)
|
||||
raw_message = ''
|
||||
first_line_number = 1
|
||||
last_line_number = 0
|
||||
|
||||
each_line(filename) do |line|
|
||||
line = line.scrub
|
||||
|
||||
if line =~ @split_regex && last_line_number.positive?
|
||||
yield raw_message, first_line_number, last_line_number
|
||||
raw_message = ''
|
||||
first_line_number = last_line_number + 1
|
||||
else
|
||||
raw_message << line
|
||||
end
|
||||
|
||||
last_line_number += 1
|
||||
end
|
||||
|
||||
yield raw_message, first_line_number, last_line_number if raw_message.present?
|
||||
end
|
||||
|
||||
def each_line(filename)
|
||||
raw_file = File.open(filename, 'r')
|
||||
text_file = filename.end_with?('.gz') ? Zlib::GzipReader.new(raw_file) : raw_file
|
||||
|
||||
text_file.each_line do |line|
|
||||
yield line
|
||||
end
|
||||
ensure
|
||||
raw_file.close if raw_file
|
||||
end
|
||||
|
||||
def read_mail_from_file(filename)
|
||||
raw_message = File.read(filename)
|
||||
read_mail_from_string(raw_message)
|
||||
end
|
||||
|
||||
def read_mail_from_string(raw_message)
|
||||
Email::Receiver.new(raw_message)
|
||||
end
|
||||
|
||||
def extract_reply_message_ids(mail)
|
||||
message_ids = [mail.in_reply_to, Email::Receiver.extract_references(mail.references)]
|
||||
message_ids.flatten!
|
||||
message_ids.select!(&:present?)
|
||||
message_ids.uniq!
|
||||
message_ids.first(20)
|
||||
end
|
||||
|
||||
def extract_subject(receiver, list_name)
|
||||
subject = receiver.subject
|
||||
return nil if subject.blank?
|
||||
|
||||
# TODO: make the list name (or maybe multiple names) configurable
|
||||
# Strip mailing list name from subject
|
||||
subject = subject.gsub(/\[#{Regexp.escape(list_name)}\]/, '').strip
|
||||
|
||||
clean_subject(subject)
|
||||
end
|
||||
|
||||
# TODO: refactor and move prefixes to settings
|
||||
def clean_subject(subject)
|
||||
original_length = subject.length
|
||||
|
||||
# Strip Reply prefix from title (Standard and localized)
|
||||
subject = subject.gsub(/^Re: */i, '')
|
||||
subject = subject.gsub(/^R: */i, '') #Italian
|
||||
subject = subject.gsub(/^RIF: */i, '') #Italian
|
||||
|
||||
# Strip Forward prefix from title (Standard and localized)
|
||||
subject = subject.gsub(/^Fwd: */i, '')
|
||||
subject = subject.gsub(/^I: */i, '') #Italian
|
||||
|
||||
subject.strip
|
||||
|
||||
# In case of mixed localized prefixes there could be many of them
|
||||
# if the mail client didn't strip the localized ones
|
||||
if original_length > subject.length
|
||||
clean_subject(subject)
|
||||
else
|
||||
subject
|
||||
end
|
||||
end
|
||||
|
||||
def ignored_file?(filename, checksums)
|
||||
File.directory?(filename) || metadata_file?(filename) || fully_indexed?(filename, checksums)
|
||||
end
|
||||
|
||||
def metadata_file?(filename)
|
||||
File.basename(filename) == METADATA_FILENAME
|
||||
end
|
||||
|
||||
def fully_indexed?(filename, checksums)
|
||||
checksum = checksums[filename]
|
||||
checksum.present? && calc_checksum(filename) == checksum
|
||||
end
|
||||
|
||||
def calc_checksum(filename)
|
||||
Digest::SHA256.file(filename).hexdigest
|
||||
end
|
||||
end
|
||||
end
|
22
script/import_scripts/mbox/support/settings.rb
Normal file
22
script/import_scripts/mbox/support/settings.rb
Normal file
|
@ -0,0 +1,22 @@
|
|||
require 'yaml'
|
||||
|
||||
module ImportScripts::Mbox
|
||||
class Settings
|
||||
def self.load(filename)
|
||||
yaml = YAML.load_file(filename)
|
||||
Settings.new(yaml)
|
||||
end
|
||||
|
||||
attr_reader :data_dir
|
||||
attr_reader :split_regex
|
||||
attr_reader :batch_size
|
||||
attr_reader :trust_level
|
||||
|
||||
def initialize(yaml)
|
||||
@data_dir = yaml['data_dir']
|
||||
@split_regex = Regexp.new(yaml['split_regex']) unless yaml['split_regex'].empty?
|
||||
@batch_size = 1000 # no need to make this actually configurable at the moment
|
||||
@trust_level = yaml['default_trust_level']
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue
Block a user