mirror of
https://github.com/discourse/discourse.git
synced 2024-12-15 22:28:34 +08:00
ac743dab10
* emails weren't sorted in correct order * better default regex for splitting mbox files * output Message-ID if email is skipped because it doesn't have a Date
305 lines
8.2 KiB
Ruby
305 lines
8.2 KiB
Ruby
require 'sqlite3'
|
|
|
|
module ImportScripts::Mbox
|
|
class Database
|
|
SCHEMA_VERSION = 2
|
|
|
|
def initialize(directory, batch_size)
|
|
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
|
|
@batch_size = batch_size
|
|
|
|
configure_database
|
|
upgrade_schema_version
|
|
create_table_for_categories
|
|
create_table_for_imported_files
|
|
create_table_for_emails
|
|
create_table_for_replies
|
|
create_table_for_users
|
|
end
|
|
|
|
def transaction
|
|
@db.transaction
|
|
yield self
|
|
@db.commit
|
|
|
|
rescue
|
|
@db.rollback
|
|
end
|
|
|
|
def insert_category(category)
|
|
@db.execute(<<-SQL, category)
|
|
INSERT OR REPLACE INTO category (name, description)
|
|
VALUES (:name, :description)
|
|
SQL
|
|
end
|
|
|
|
def insert_imported_file(imported_file)
|
|
@db.execute(<<-SQL, imported_file)
|
|
INSERT OR REPLACE INTO imported_file (category, filename, checksum)
|
|
VALUES (:category, :filename, :checksum)
|
|
SQL
|
|
end
|
|
|
|
def insert_email(email)
|
|
@db.execute(<<-SQL, email)
|
|
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
|
|
email_date, raw_message, body, elided, format, attachment_count, charset,
|
|
category, filename, first_line_number, last_line_number, index_duration)
|
|
VALUES (:msg_id, :from_email, :from_name, :subject,
|
|
:email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
|
|
:category, :filename, :first_line_number, :last_line_number, :index_duration)
|
|
SQL
|
|
end
|
|
|
|
def insert_replies(msg_id, reply_message_ids)
|
|
sql = <<-SQL
|
|
INSERT OR REPLACE INTO reply (msg_id, in_reply_to)
|
|
VALUES (:msg_id, :in_reply_to)
|
|
SQL
|
|
|
|
@db.prepare(sql) do |stmt|
|
|
reply_message_ids.each do |in_reply_to|
|
|
stmt.execute(msg_id, in_reply_to)
|
|
end
|
|
end
|
|
end
|
|
|
|
def update_in_reply_to_of_emails
|
|
@db.execute <<-SQL
|
|
UPDATE email
|
|
SET in_reply_to = (
|
|
SELECT e.msg_id
|
|
FROM reply r
|
|
JOIN email e ON (r.in_reply_to = e.msg_id)
|
|
WHERE r.msg_id = email.msg_id
|
|
ORDER BY e.email_date DESC
|
|
LIMIT 1
|
|
)
|
|
SQL
|
|
end
|
|
|
|
def update_in_reply_to_by_email_subject
|
|
@db.execute <<-SQL
|
|
UPDATE email
|
|
SET in_reply_to = NULLIF((
|
|
SELECT e.msg_id
|
|
FROM email e
|
|
JOIN email_order o ON (e.msg_id = o.msg_id)
|
|
WHERE e.subject = email.subject
|
|
ORDER BY o.ROWID
|
|
LIMIT 1
|
|
), msg_id)
|
|
SQL
|
|
end
|
|
|
|
def sort_emails_by_date_and_reply_level
|
|
@db.execute 'DELETE FROM email_order'
|
|
|
|
@db.execute <<-SQL
|
|
WITH RECURSIVE
|
|
messages(msg_id, level, email_date, in_reply_to) AS (
|
|
SELECT msg_id, 0 AS level, email_date, in_reply_to
|
|
FROM email
|
|
WHERE in_reply_to IS NULL
|
|
UNION ALL
|
|
SELECT e.msg_id, m.level + 1, e.email_date, e.in_reply_to
|
|
FROM email e
|
|
JOIN messages m ON e.in_reply_to = m.msg_id
|
|
)
|
|
INSERT INTO email_order (msg_id)
|
|
SELECT c.msg_id
|
|
FROM messages c
|
|
LEFT OUTER JOIN messages p ON (c.in_reply_to = p.msg_id)
|
|
ORDER BY MAX(c.email_date, p.email_date), c.level, c.email_date, c.msg_id
|
|
SQL
|
|
end
|
|
|
|
def sort_emails_by_subject
|
|
@db.execute 'DELETE FROM email_order'
|
|
|
|
@db.execute <<-SQL
|
|
INSERT INTO email_order (msg_id)
|
|
SELECT msg_id
|
|
FROM email
|
|
ORDER BY subject, filename, ROWID
|
|
SQL
|
|
end
|
|
|
|
def fill_users_from_emails
|
|
@db.execute 'DELETE FROM user'
|
|
|
|
@db.execute <<-SQL
|
|
INSERT INTO user (email, name, date_of_first_message)
|
|
SELECT from_email, MIN(from_name) AS from_name, MIN(email_date)
|
|
FROM email
|
|
WHERE from_email IS NOT NULL AND email_date IS NOT NULL
|
|
GROUP BY from_email
|
|
ORDER BY from_email
|
|
SQL
|
|
end
|
|
|
|
def fetch_imported_files(category)
|
|
@db.execute(<<-SQL, category)
|
|
SELECT filename, checksum
|
|
FROM imported_file
|
|
WHERE category = :category
|
|
SQL
|
|
end
|
|
|
|
def fetch_categories
|
|
@db.execute <<-SQL
|
|
SELECT name, description
|
|
FROM category
|
|
ORDER BY name
|
|
SQL
|
|
end
|
|
|
|
def count_users
|
|
@db.get_first_value <<-SQL
|
|
SELECT COUNT(*)
|
|
FROM user
|
|
SQL
|
|
end
|
|
|
|
def fetch_users(last_email)
|
|
rows = @db.execute(<<-SQL, last_email)
|
|
SELECT email, name, date_of_first_message
|
|
FROM user
|
|
WHERE email > :last_email
|
|
LIMIT #{@batch_size}
|
|
SQL
|
|
|
|
add_last_column_value(rows, 'email')
|
|
end
|
|
|
|
def count_messages
|
|
@db.get_first_value <<-SQL
|
|
SELECT COUNT(*)
|
|
FROM email
|
|
SQL
|
|
end
|
|
|
|
def fetch_messages(last_row_id)
|
|
rows = @db.execute(<<-SQL, last_row_id)
|
|
SELECT o.ROWID, e.msg_id, from_email, subject, email_date, in_reply_to,
|
|
raw_message, body, elided, format, attachment_count, category
|
|
FROM email e
|
|
JOIN email_order o USING (msg_id)
|
|
WHERE o.ROWID > :last_row_id
|
|
ORDER BY o.ROWID
|
|
LIMIT #{@batch_size}
|
|
SQL
|
|
|
|
add_last_column_value(rows, 'rowid')
|
|
end
|
|
|
|
private
|
|
|
|
def configure_database
|
|
@db.execute 'PRAGMA journal_mode = OFF'
|
|
@db.execute 'PRAGMA locking_mode = EXCLUSIVE'
|
|
end
|
|
|
|
def upgrade_schema_version
|
|
current_version = @db.get_first_value("PRAGMA user_version")
|
|
|
|
case current_version
|
|
when 1
|
|
@db.execute "ALTER TABLE email ADD COLUMN index_duration REAL"
|
|
end
|
|
|
|
@db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
|
|
end
|
|
|
|
def create_table_for_categories
|
|
@db.execute <<-SQL
|
|
CREATE TABLE IF NOT EXISTS category (
|
|
name TEXT NOT NULL PRIMARY KEY,
|
|
description TEXT
|
|
)
|
|
SQL
|
|
end
|
|
|
|
def create_table_for_imported_files
|
|
@db.execute <<-SQL
|
|
CREATE TABLE IF NOT EXISTS imported_file (
|
|
category TEXT NOT NULL,
|
|
filename TEXT NOT NULL,
|
|
checksum TEXT NOT NULL,
|
|
PRIMARY KEY (category, filename),
|
|
FOREIGN KEY(category) REFERENCES category(name)
|
|
)
|
|
SQL
|
|
end
|
|
|
|
def create_table_for_emails
|
|
@db.execute <<-SQL
|
|
CREATE TABLE IF NOT EXISTS email (
|
|
msg_id TEXT NOT NULL PRIMARY KEY,
|
|
from_email TEXT,
|
|
from_name TEXT,
|
|
subject TEXT,
|
|
in_reply_to TEXT,
|
|
email_date DATETIME,
|
|
raw_message TEXT,
|
|
body TEXT,
|
|
elided TEXT,
|
|
format INTEGER,
|
|
attachment_count INTEGER NOT NULL DEFAULT 0,
|
|
charset TEXT,
|
|
category TEXT NOT NULL,
|
|
filename TEXT NOT NULL,
|
|
first_line_number INTEGER,
|
|
last_line_number INTEGER,
|
|
index_duration REAL,
|
|
FOREIGN KEY(category) REFERENCES category(name)
|
|
)
|
|
SQL
|
|
|
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
|
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_subject ON email (subject)'
|
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
|
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
|
|
|
|
@db.execute <<-SQL
|
|
CREATE TABLE IF NOT EXISTS email_order (
|
|
msg_id TEXT NOT NULL PRIMARY KEY
|
|
)
|
|
SQL
|
|
end
|
|
|
|
def create_table_for_replies
|
|
@db.execute <<-SQL
|
|
CREATE TABLE IF NOT EXISTS reply (
|
|
msg_id TEXT NOT NULL,
|
|
in_reply_to TEXT NOT NULL,
|
|
PRIMARY KEY (msg_id, in_reply_to),
|
|
FOREIGN KEY(msg_id) REFERENCES email(msg_id)
|
|
)
|
|
SQL
|
|
|
|
@db.execute 'CREATE INDEX IF NOT EXISTS reply_by_in_reply_to ON reply (in_reply_to)'
|
|
end
|
|
|
|
def create_table_for_users
|
|
@db.execute <<-SQL
|
|
CREATE TABLE IF NOT EXISTS user (
|
|
email TEXT NOT NULL PRIMARY KEY,
|
|
name TEXT,
|
|
date_of_first_message DATETIME NOT NULL
|
|
)
|
|
SQL
|
|
end
|
|
|
|
def add_last_column_value(rows, *last_columns)
|
|
return rows if last_columns.empty?
|
|
|
|
result = [rows]
|
|
last_row = rows.last
|
|
|
|
last_columns.each { |column| result.push(last_row ? last_row[column] : nil) }
|
|
result
|
|
end
|
|
end
|
|
end
|