mbox: Improve error checking and reporting

This commit is contained in:
Jay Pfaffman 2016-10-07 09:38:54 -07:00
parent 3d02dc28be
commit 1886f021e2

View File

@ -1,9 +1,6 @@
require 'sqlite3' require 'sqlite3'
require File.expand_path(File.dirname(__FILE__) + "/base.rb") require File.expand_path(File.dirname(__FILE__) + "/base.rb")
# TODO: ignore ~ emacs backup files
# DONE: sort filenames before processing
# Paste these lines into your shell before running this: # Paste these lines into your shell before running this:
=begin =begin
@ -41,6 +38,13 @@ class ImportScripts::Mbox < ImportScripts::Base
# ex: "jobs-folder" => "jobs" # ex: "jobs-folder" => "jobs"
} }
# Fail fast when the configured import directory does not exist.
# Kernel#abort prints the message to standard error and terminates with a
# non-zero exit status, so calling shells and scripts can detect the failure
# (a bare `exit` would report success).
abort "Cannot find import directory #{MBOX_DIR}. Giving up." unless File.directory?(MBOX_DIR)
validates_format_of :email, :with => /\A([^@\s]+)@((?:[-a-z0-9]+\.)+[a-z]{2,})\Z/i, :on => :create
def execute def execute
import_categories import_categories
create_email_indices create_email_indices
@ -49,6 +53,7 @@ class ImportScripts::Mbox < ImportScripts::Base
import_users import_users
create_forum_topics create_forum_topics
import_replies import_replies
# replace_email_addresses # uncomment to replace all email addresses with @username
end end
def import_categories def import_categories
@ -91,16 +96,21 @@ class ImportScripts::Mbox < ImportScripts::Base
files.sort! files.sort!
files.each_with_index do |f, idx| files.each_with_index do |f, idx|
print_warning "\nProcessing: #{f}"
start_time = Time.now
if SPLIT_AT.present? if SPLIT_AT.present?
msg = "" msg = ""
message_count = 0
each_line(f) do |line| each_line(f) do |line|
line = line.scrub line = line.scrub
if line =~ SPLIT_AT if line =~ SPLIT_AT
p message_count += 1
if !msg.empty? if !msg.empty?
mail = Mail.read_from_string(msg) mail = Mail.read_from_string(msg)
yield mail, f yield mail, f
print_status(idx, files.size) print_status(idx, files.size, start_time)
msg = "" msg = ""
end end
end end
@ -110,14 +120,14 @@ class ImportScripts::Mbox < ImportScripts::Base
if !msg.empty? if !msg.empty?
mail = Mail.read_from_string(msg) mail = Mail.read_from_string(msg)
yield mail, f yield mail, f
print_status(idx, files.size) print_status(idx, files.size, start_time)
msg = "" msg = ""
end end
else else
raw = File.read(f) raw = File.read(f)
mail = Mail.read_from_string(raw) mail = Mail.read_from_string(raw)
yield mail, f yield mail, f
print_status(idx, files.size) print_status(idx, files.size, start_time)
end end
end end
@ -133,7 +143,9 @@ class ImportScripts::Mbox < ImportScripts::Base
titles = {} titles = {}
rows.each do |row| rows.each do |row|
msg_ids[row[0]] = true msg_ids[row[0]] = true
titles[row[1]] = row[0] if titles[row[1]].nil?
titles[row[1]] = row[0]
end
end end
# First, any replies where the parent doesn't exist should have that field cleared # First, any replies where the parent doesn't exist should have that field cleared
@ -171,12 +183,18 @@ class ImportScripts::Mbox < ImportScripts::Base
if mail.from.present? if mail.from.present?
from_email = mail.from.dup from_email = mail.from.dup
if from_email.kind_of?(Array) if from_email.kind_of?(Array)
from_email = from_email.first.dup if from_email[0].nil?
print_warning "Cannot find email address (ignoring)!\n#{mail}"
else
from_email = from_email.first.dup
# Undo " at " obfuscation of the @ sign.
from_email.gsub!(/ at /, '@')
# Same for the bracketed form " [at] ". The brackets must be escaped:
# unescaped, [at] is a character class matching a single "a" or "t".
from_email.gsub!(/ \[at\] /, '@')
# Strip a trailing real name given in parentheses. TODO: read it into the name.
from_email.gsub!(/ \(.*$/, '')
# Remove any spaces that remain in the address.
from_email.gsub!(/ /, '')
end
end end
p end
from_email.gsub!(/ at /, '@')
from_email.gsub!(/ \(.*$/, '')
end
display_names = from.try(:display_names) display_names = from.try(:display_names)
if display_names.present? if display_names.present?
@ -191,6 +209,10 @@ class ImportScripts::Mbox < ImportScripts::Base
[from_email, from_name] [from_email, from_name]
end end
# Writes a warning to standard error, followed by a newline.
#
# message - the object to report; it is converted to its String form
#           before being printed.
def print_warning(message)
  $stderr.puts(message.to_s)
end
def create_email_indices def create_email_indices
db = open_db db = open_db
db.execute "DROP TABLE IF EXISTS emails" db.execute "DROP TABLE IF EXISTS emails"
@ -228,7 +250,11 @@ class ImportScripts::Mbox < ImportScripts::Base
email_date = mail['date'].to_s email_date = mail['date'].to_s
email_date = DateTime.parse(email_date).to_s unless email_date.blank? email_date = DateTime.parse(email_date).to_s unless email_date.blank?
db.execute "INSERT OR IGNORE INTO emails (msg_id, if from_email.kind_of?(String)
unless from_email.match(/\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i)
print_warning "Ignoring bad email address #{from_email} in #{msg_id}"
else
db.execute "INSERT OR IGNORE INTO emails (msg_id,
from_email, from_email,
from_name, from_name,
title, title,
@ -237,7 +263,9 @@ class ImportScripts::Mbox < ImportScripts::Base
message, message,
category) category)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)", VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
[msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category] [msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
end
end
end end
ensure ensure
db.close db.close
@ -285,8 +313,8 @@ class ImportScripts::Mbox < ImportScripts::Base
end end
def clean_raw(input) def clean_raw(input)
raw = input.dup raw = input.dup
raw.scrub!
raw.gsub!(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '') raw.gsub!(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '')
raw raw
@ -411,9 +439,12 @@ class ImportScripts::Mbox < ImportScripts::Base
post_count = replies.size post_count = replies.size
puts "Replies: #{post_count}"
batches(BATCH_SIZE) do |offset| batches(BATCH_SIZE) do |offset|
posts = replies[offset..offset+BATCH_SIZE-1] posts = replies[offset..offset+BATCH_SIZE-1]
break if posts.nil? break if posts.nil?
break if posts.count < 1
next if all_records_exist? :posts, posts.map {|p| p[0]} next if all_records_exist? :posts, posts.map {|p| p[0]}