mirror of
https://github.com/discourse/discourse.git
synced 2024-11-22 14:38:17 +08:00
Importer for mbox format
This commit is contained in:
parent
410ae47dd7
commit
33a5fce124
|
@ -155,6 +155,8 @@ module Email
|
|||
else
|
||||
object.body.to_s
|
||||
end
|
||||
rescue
|
||||
nil
|
||||
end
|
||||
|
||||
REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To', 'Cc', 'Bcc', 'Date']
|
||||
|
|
166
script/import_scripts/mbox.rb
Executable file
166
script/import_scripts/mbox.rb
Executable file
|
@ -0,0 +1,166 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
||||
|
||||
class ImportScripts::Mbox < ImportScripts::Base
|
||||
# CHANGE THESE BEFORE RUNNING THE IMPORTER
|
||||
|
||||
BATCH_SIZE = 1000
|
||||
CATEGORY_ID = 6
|
||||
MBOX_DIR = "/tmp/mbox-input"
|
||||
USER_INDEX_PATH = "#{MBOX_DIR}/user-index.json"
|
||||
TOPIC_INDEX_PATH = "#{MBOX_DIR}/topic-index.json"
|
||||
REPLY_INDEX_PATH = "#{MBOX_DIR}/replies-index.json"
|
||||
|
||||
def execute
|
||||
create_indices
|
||||
import_users
|
||||
create_forum_topics
|
||||
import_replies
|
||||
end
|
||||
|
||||
def all_messages
|
||||
|
||||
files = Dir["#{MBOX_DIR}/*/*"]
|
||||
|
||||
files.each_with_index do |f, idx|
|
||||
raw = File.read(f)
|
||||
mail = Mail.read_from_string(raw)
|
||||
yield mail, f
|
||||
print_status(idx, files.size)
|
||||
end
|
||||
end
|
||||
|
||||
def create_indices
|
||||
return if File.exist?(USER_INDEX_PATH) && File.exist?(TOPIC_INDEX_PATH) && File.exist?(REPLY_INDEX_PATH)
|
||||
puts "", "creating indices"
|
||||
users = {}
|
||||
|
||||
topics = []
|
||||
|
||||
topic_lookup = {}
|
||||
replies = []
|
||||
|
||||
all_messages do |mail, filename|
|
||||
users[mail.from.first] = mail[:from].display_names.first
|
||||
|
||||
msg_id = mail['Message-ID'].to_s
|
||||
reply_to = mail['In-Reply-To'].to_s
|
||||
|
||||
if reply_to.present?
|
||||
topic = topic_lookup[reply_to] || reply_to
|
||||
topic_lookup[msg_id] = topic
|
||||
replies << {id: msg_id, topic: topic, file: filename}
|
||||
else
|
||||
topics << {id: msg_id, file: filename}
|
||||
end
|
||||
end
|
||||
|
||||
File.write(USER_INDEX_PATH, {users: users}.to_json)
|
||||
File.write(TOPIC_INDEX_PATH, {topics: topics}.to_json)
|
||||
File.write(REPLY_INDEX_PATH, {replies: replies}.to_json)
|
||||
end
|
||||
|
||||
def import_users
|
||||
puts "", "importing users"
|
||||
|
||||
all_users = ::JSON.parse(File.read(USER_INDEX_PATH))['users']
|
||||
user_keys = all_users.keys
|
||||
total_count = user_keys.size
|
||||
|
||||
batches(BATCH_SIZE) do |offset|
|
||||
users = user_keys[offset..offset+BATCH_SIZE-1]
|
||||
break if users.nil?
|
||||
|
||||
create_users(users, total: total_count, offset: offset) do |email|
|
||||
{
|
||||
id: email,
|
||||
email: email,
|
||||
name: all_users[email]
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def parse_email(msg)
|
||||
receiver = Email::Receiver.new(msg, skip_sanity_check: true)
|
||||
mail = Mail.read_from_string(msg)
|
||||
mail.body
|
||||
|
||||
selected = receiver.select_body(mail)
|
||||
selected.force_encoding(selected.encoding).encode("UTF-8")
|
||||
end
|
||||
|
||||
def create_forum_topics
|
||||
puts "", "creating forum topics"
|
||||
|
||||
all_topics = ::JSON.parse(File.read(TOPIC_INDEX_PATH))['topics']
|
||||
topic_count = all_topics.size
|
||||
|
||||
batches(BATCH_SIZE) do |offset|
|
||||
topics = all_topics[offset..offset+BATCH_SIZE-1]
|
||||
break if topics.nil?
|
||||
|
||||
create_posts(topics, total: topic_count, offset: offset) do |t|
|
||||
raw_email = File.read(t['file'])
|
||||
receiver = Email::Receiver.new(raw_email, skip_sanity_check: true)
|
||||
mail = Mail.read_from_string(raw_email)
|
||||
mail.body
|
||||
|
||||
selected = receiver.select_body(mail)
|
||||
next unless selected
|
||||
|
||||
raw = selected.force_encoding(selected.encoding).encode("UTF-8")
|
||||
|
||||
title = mail.subject.gsub(/\[[^\]]+\]+/, '').strip
|
||||
|
||||
{ id: t['id'],
|
||||
title: title,
|
||||
user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
|
||||
created_at: mail.date,
|
||||
category: CATEGORY_ID,
|
||||
raw: raw,
|
||||
cook_method: Post.cook_methods[:email] }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def import_replies
|
||||
puts "", "creating topic replies"
|
||||
|
||||
all_topics = ::JSON.parse(File.read(TOPIC_INDEX_PATH))['topics']
|
||||
topic_count = all_topics.size
|
||||
|
||||
replies = ::JSON.parse(File.read(REPLY_INDEX_PATH))['replies']
|
||||
post_count = replies.size
|
||||
|
||||
batches(BATCH_SIZE) do |offset|
|
||||
posts = replies[offset..offset+BATCH_SIZE-1]
|
||||
break if posts.nil?
|
||||
|
||||
create_posts(posts, total: post_count, offset: offset) do |p|
|
||||
parent_id = p['topic']
|
||||
id = p['id']
|
||||
|
||||
topic = topic_lookup_from_imported_post_id(parent_id)
|
||||
topic_id = topic[:topic_id] if topic
|
||||
next unless topic_id
|
||||
|
||||
raw_email = File.read(p['file'])
|
||||
receiver = Email::Receiver.new(raw_email, skip_sanity_check: true)
|
||||
mail = Mail.read_from_string(raw_email)
|
||||
mail.body
|
||||
|
||||
selected = receiver.select_body(mail)
|
||||
raw = selected.force_encoding(selected.encoding).encode("UTF-8")
|
||||
|
||||
{ id: id,
|
||||
topic_id: topic_id,
|
||||
user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
|
||||
created_at: mail.date,
|
||||
raw: raw,
|
||||
cook_method: Post.cook_methods[:email] }
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
ImportScripts::Mbox.new.perform
|
|
@ -1,7 +1,7 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
||||
require 'pg'
|
||||
|
||||
class ImportScripts::MyAskBot < ImportScripts::Base
|
||||
class ImportScripts::Nabble < ImportScripts::Base
|
||||
# CHANGE THESE BEFORE RUNNING THE IMPORTER
|
||||
|
||||
BATCH_SIZE = 1000
|
||||
|
@ -148,4 +148,4 @@ class ImportScripts::MyAskBot < ImportScripts::Base
|
|||
end
|
||||
end
|
||||
|
||||
ImportScripts::MyAskBot.new.perform
|
||||
ImportScripts::Nabble.new.perform
|
||||
|
|
Loading…
Reference in New Issue
Block a user