discourse/script/import_scripts/mbox/support/database.rb
Gerhard Schlager 0e752db411 DEV: Improve mbox import script
* Customizable email subject prefixes to remove "Re" and "Fwd" as well as localized prefixes.
* Configuration option for prefixes like [FOO] or (BAR) which can be replaced with tags during import.
* Bugfix: Import script might have skipped some users due to missing ORDER BY.
2020-03-09 10:26:45 +01:00

308 lines
8.2 KiB
Ruby

# frozen_string_literal: true
require 'sqlite3'
module ImportScripts::Mbox
class Database
SCHEMA_VERSION = 2
def initialize(directory, batch_size)
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
@batch_size = batch_size
configure_database
upgrade_schema_version
create_table_for_categories
create_table_for_imported_files
create_table_for_emails
create_table_for_replies
create_table_for_users
end
def transaction
@db.transaction
yield self
@db.commit
rescue
@db.rollback
end
def insert_category(category)
@db.execute(<<-SQL, category)
INSERT OR REPLACE INTO category (name, description)
VALUES (:name, :description)
SQL
end
def insert_imported_file(imported_file)
@db.execute(<<-SQL, imported_file)
INSERT OR REPLACE INTO imported_file (category, filename, checksum)
VALUES (:category, :filename, :checksum)
SQL
end
def insert_email(email)
@db.execute(<<-SQL, email)
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
email_date, raw_message, body, elided, format, attachment_count, charset,
category, filename, first_line_number, last_line_number, index_duration)
VALUES (:msg_id, :from_email, :from_name, :subject,
:email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
:category, :filename, :first_line_number, :last_line_number, :index_duration)
SQL
end
def insert_replies(msg_id, reply_message_ids)
sql = <<-SQL
INSERT OR REPLACE INTO reply (msg_id, in_reply_to)
VALUES (:msg_id, :in_reply_to)
SQL
@db.prepare(sql) do |stmt|
reply_message_ids.each do |in_reply_to|
stmt.execute(msg_id, in_reply_to)
end
end
end
def update_in_reply_to_of_emails
@db.execute <<-SQL
UPDATE email
SET in_reply_to = (
SELECT e.msg_id
FROM reply r
JOIN email e ON (r.in_reply_to = e.msg_id)
WHERE r.msg_id = email.msg_id
ORDER BY e.email_date DESC
LIMIT 1
)
SQL
end
def update_in_reply_to_by_email_subject
@db.execute <<-SQL
UPDATE email
SET in_reply_to = NULLIF((
SELECT e.msg_id
FROM email e
JOIN email_order o ON (e.msg_id = o.msg_id)
WHERE e.subject = email.subject
ORDER BY o.ROWID
LIMIT 1
), msg_id)
SQL
end
def sort_emails_by_date_and_reply_level
@db.execute 'DELETE FROM email_order'
@db.execute <<-SQL
WITH RECURSIVE
messages(msg_id, level, email_date, in_reply_to) AS (
SELECT msg_id, 0 AS level, email_date, in_reply_to
FROM email
WHERE in_reply_to IS NULL
UNION ALL
SELECT e.msg_id, m.level + 1, e.email_date, e.in_reply_to
FROM email e
JOIN messages m ON e.in_reply_to = m.msg_id
)
INSERT INTO email_order (msg_id)
SELECT c.msg_id
FROM messages c
LEFT OUTER JOIN messages p ON (c.in_reply_to = p.msg_id)
ORDER BY MAX(c.email_date, p.email_date), c.level, c.email_date, c.msg_id
SQL
end
def sort_emails_by_subject
@db.execute 'DELETE FROM email_order'
@db.execute <<-SQL
INSERT INTO email_order (msg_id)
SELECT msg_id
FROM email
ORDER BY subject, filename, ROWID
SQL
end
def fill_users_from_emails
@db.execute 'DELETE FROM user'
@db.execute <<-SQL
INSERT INTO user (email, name, date_of_first_message)
SELECT from_email, MIN(from_name) AS from_name, MIN(email_date)
FROM email
WHERE from_email IS NOT NULL AND email_date IS NOT NULL
GROUP BY from_email
ORDER BY from_email
SQL
end
def fetch_imported_files(category)
@db.execute(<<-SQL, category)
SELECT filename, checksum
FROM imported_file
WHERE category = :category
SQL
end
def fetch_categories
@db.execute <<-SQL
SELECT name, description
FROM category
ORDER BY name
SQL
end
def count_users
@db.get_first_value <<-SQL
SELECT COUNT(*)
FROM user
SQL
end
def fetch_users(last_email)
rows = @db.execute(<<-SQL, last_email)
SELECT email, name, date_of_first_message
FROM user
WHERE email > :last_email
ORDER BY email
LIMIT #{@batch_size}
SQL
add_last_column_value(rows, 'email')
end
def count_messages
@db.get_first_value <<-SQL
SELECT COUNT(*)
FROM email
SQL
end
def fetch_messages(last_row_id)
rows = @db.execute(<<-SQL, last_row_id)
SELECT o.ROWID, e.msg_id, from_email, subject, email_date, in_reply_to,
raw_message, body, elided, format, attachment_count, category
FROM email e
JOIN email_order o USING (msg_id)
WHERE o.ROWID > :last_row_id
ORDER BY o.ROWID
LIMIT #{@batch_size}
SQL
add_last_column_value(rows, 'rowid')
end
private
def configure_database
@db.execute 'PRAGMA journal_mode = OFF'
@db.execute 'PRAGMA locking_mode = EXCLUSIVE'
end
def upgrade_schema_version
current_version = @db.get_first_value("PRAGMA user_version")
case current_version
when 1
@db.execute "ALTER TABLE email ADD COLUMN index_duration REAL"
end
@db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
end
def create_table_for_categories
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS category (
name TEXT NOT NULL PRIMARY KEY,
description TEXT
)
SQL
end
def create_table_for_imported_files
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS imported_file (
category TEXT NOT NULL,
filename TEXT NOT NULL,
checksum TEXT NOT NULL,
PRIMARY KEY (category, filename),
FOREIGN KEY(category) REFERENCES category(name)
)
SQL
end
def create_table_for_emails
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS email (
msg_id TEXT NOT NULL PRIMARY KEY,
from_email TEXT,
from_name TEXT,
subject TEXT,
in_reply_to TEXT,
email_date DATETIME,
raw_message TEXT,
body TEXT,
elided TEXT,
format INTEGER,
attachment_count INTEGER NOT NULL DEFAULT 0,
charset TEXT,
category TEXT NOT NULL,
filename TEXT NOT NULL,
first_line_number INTEGER,
last_line_number INTEGER,
index_duration REAL,
FOREIGN KEY(category) REFERENCES category(name)
)
SQL
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_subject ON email (subject)'
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS email_order (
msg_id TEXT NOT NULL PRIMARY KEY
)
SQL
end
def create_table_for_replies
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS reply (
msg_id TEXT NOT NULL,
in_reply_to TEXT NOT NULL,
PRIMARY KEY (msg_id, in_reply_to),
FOREIGN KEY(msg_id) REFERENCES email(msg_id)
)
SQL
@db.execute 'CREATE INDEX IF NOT EXISTS reply_by_in_reply_to ON reply (in_reply_to)'
end
def create_table_for_users
@db.execute <<-SQL
CREATE TABLE IF NOT EXISTS user (
email TEXT NOT NULL PRIMARY KEY,
name TEXT,
date_of_first_message DATETIME NOT NULL
)
SQL
end
def add_last_column_value(rows, *last_columns)
return rows if last_columns.empty?
result = [rows]
last_row = rows.last
last_columns.each { |column| result.push(last_row ? last_row[column] : nil) }
result
end
end
end