require File.expand_path(File.dirname(__FILE__) + "/base.rb")

class ImportScripts::Mbox < ImportScripts::Base
  # CHANGE THESE BEFORE RUNNING THE IMPORTER
  BATCH_SIZE = 1000
  CATEGORY_ID = 6
  MBOX_DIR = "/tmp/mbox-input"

  USER_INDEX_PATH = "#{MBOX_DIR}/user-index.json"
  TOPIC_INDEX_PATH = "#{MBOX_DIR}/topic-index.json"
  REPLY_INDEX_PATH = "#{MBOX_DIR}/replies-index.json"
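
  # Typical invocation from a Discourse checkout (the path and environment
  # below are assumptions; adjust them for your setup):
  #   RAILS_ENV=production bundle exec ruby script/import_scripts/mbox.rb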

  def execute
    create_indices
    import_users
    create_forum_topics
    import_replies
  end
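
  # Yields each message under MBOX_DIR (assumed to be stored one message per
  # file, one directory level deep, i.e. MBOX_DIR/*/*) as a parsed Mail object
  # together with the path it was read from.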
  def all_messages
    files = Dir["#{MBOX_DIR}/*/*"]

    files.each_with_index do |f, idx|
      raw = File.read(f)
      mail = Mail.read_from_string(raw)
      yield mail, f
      print_status(idx, files.size)
    end
  end
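
  # Scans MBOX_DIR once and writes three JSON index files (users, topics,
  # replies) so later runs can skip the scan; it returns early if all three
  # already exist. Messages carrying an In-Reply-To header are indexed as
  # replies and the rest as topics; two fix-up passes then hoist orphaned
  # replies into topics and fold topics with duplicate titles back into
  # replies. An illustrative topic entry (field values are made up):
  #   { id: "<message-id@example.com>", file: "/tmp/mbox-input/list/0001",
  #     title: "Some subject", date: 1437691060 }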
  def create_indices
    return if File.exist?(USER_INDEX_PATH) && File.exist?(TOPIC_INDEX_PATH) && File.exist?(REPLY_INDEX_PATH)

    puts "", "creating indices"

    users = {}
    topics = []
    topic_lookup = {}
    topic_titles = {}
    replies = []

    all_messages do |mail, filename|
      users[mail.from.first] = mail[:from].display_names.first

      msg_id = mail['Message-ID'].to_s
      reply_to = mail['In-Reply-To'].to_s
      title = clean_title(mail['Subject'].to_s)
      date = Time.parse(mail['date'].to_s).to_i

      if reply_to.present?
        topic = topic_lookup[reply_to] || reply_to
        topic_lookup[msg_id] = topic

        replies << { id: msg_id, topic: topic, file: filename, title: title, date: date }
      else
        topics << { id: msg_id, file: filename, title: title, date: date }
        topic_titles[title] ||= msg_id
      end
    end

    replies.sort! { |a, b| a[:date] <=> b[:date] }
    topics.sort! { |a, b| a[:date] <=> b[:date] }

    # Replies without parents should be hoisted to topics
    to_hoist = []
    replies.each do |r|
      to_hoist << r if !topic_lookup[r[:topic]]
    end

    to_hoist.each do |h|
      replies.delete(h)
      topics << { id: h[:id], file: h[:file], title: h[:title], date: h[:date] }
      topic_titles[h[:title]] ||= h[:id]
    end

    # Topics with duplicate titles should be replies
    to_group = []
    topics.each do |t|
      first = topic_titles[t[:title]]
      to_group << t if first && first != t[:id]
    end

    to_group.each do |t|
      topics.delete(t)
      replies << { id: t[:id], topic: topic_titles[t[:title]], file: t[:file], title: t[:title], date: t[:date] }
    end

    replies.sort! { |a, b| a[:date] <=> b[:date] }
    topics.sort! { |a, b| a[:date] <=> b[:date] }

    File.write(USER_INDEX_PATH, { users: users }.to_json)
    File.write(TOPIC_INDEX_PATH, { topics: topics }.to_json)
    File.write(REPLY_INDEX_PATH, { replies: replies }.to_json)
  end
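
  # clean_title strips a leading "Re:" from subjects; clean_raw removes the
  # unsubscribe footer that Google Groups appends to list mail.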
  def clean_title(title)
    title.gsub(/^Re: */i, '')
  end

  def clean_raw(raw)
    raw.gsub(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups\.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '')
  end
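
  # Creates Discourse users from the user index in batches of BATCH_SIZE,
  # keyed by email address. Batches whose users already exist are skipped via
  # all_records_exist?.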
  def import_users
    puts "", "importing users"

    all_users = ::JSON.parse(File.read(USER_INDEX_PATH))['users']
    user_keys = all_users.keys
    total_count = user_keys.size

    batches(BATCH_SIZE) do |offset|
      users = user_keys[offset..offset + BATCH_SIZE - 1]
      break if users.nil?

      next if all_records_exist? :users, users

      create_users(users, total: total_count, offset: offset) do |email|
        {
          id: email,
          email: email,
          name: all_users[email]
        }
      end
    end
  end
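
  # Helper that pulls a message body out of a raw email via
  # Email::Receiver#select_body and re-encodes it as UTF-8. It is not called
  # elsewhere in this script; create_forum_topics and import_replies inline
  # the same steps.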
  def parse_email(msg)
    receiver = Email::Receiver.new(msg, skip_sanity_check: true)
    mail = Mail.read_from_string(msg)
    mail.body

    selected = receiver.select_body(mail)
    selected.force_encoding(selected.encoding).encode("UTF-8")
  end
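
  # Creates a Discourse topic for each entry in the topic index: the body is
  # extracted with Email::Receiver#select_body, bracketed list tags are
  # stripped from the subject, and the post is cooked with the :email cook
  # method under CATEGORY_ID.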
  def create_forum_topics
    puts "", "creating forum topics"

    all_topics = ::JSON.parse(File.read(TOPIC_INDEX_PATH))['topics']
    topic_count = all_topics.size

    batches(BATCH_SIZE) do |offset|
      topics = all_topics[offset..offset + BATCH_SIZE - 1]
      break if topics.nil?

      next if all_records_exist? :posts, topics.map { |t| t['id'] }

      create_posts(topics, total: topic_count, offset: offset) do |t|
        raw_email = File.read(t['file'])
        receiver = Email::Receiver.new(raw_email, skip_sanity_check: true)
        mail = Mail.read_from_string(raw_email)
        mail.body

        selected = receiver.select_body(mail)
        next unless selected

        raw = selected.force_encoding(selected.encoding).encode("UTF-8")

        title = mail.subject.gsub(/\[[^\]]+\]+/, '').strip

        { id: t['id'],
          title: clean_title(title),
          user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
          created_at: mail.date,
          category: CATEGORY_ID,
          raw: clean_raw(raw),
          cook_method: Post.cook_methods[:email] }
      end
    end
  end
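
  # Creates reply posts from the reply index, resolving each reply's parent
  # via topic_lookup_from_imported_post_id and skipping replies whose parent
  # topic was not imported.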
  def import_replies
    puts "", "creating topic replies"

    replies = ::JSON.parse(File.read(REPLY_INDEX_PATH))['replies']
    post_count = replies.size

    batches(BATCH_SIZE) do |offset|
      posts = replies[offset..offset + BATCH_SIZE - 1]
      break if posts.nil?

      next if all_records_exist? :posts, posts.map { |p| p['id'] }

      create_posts(posts, total: post_count, offset: offset) do |p|
        parent_id = p['topic']
        id = p['id']

        topic = topic_lookup_from_imported_post_id(parent_id)
        topic_id = topic[:topic_id] if topic
        next unless topic_id

        raw_email = File.read(p['file'])
        receiver = Email::Receiver.new(raw_email, skip_sanity_check: true)
        mail = Mail.read_from_string(raw_email)
        mail.body

        selected = receiver.select_body(mail)
        raw = selected.force_encoding(selected.encoding).encode("UTF-8")

        { id: id,
          topic_id: topic_id,
          user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
          created_at: mail.date,
          raw: clean_raw(raw),
          cook_method: Post.cook_methods[:email] }
      end
    end
  end
end

ImportScripts::Mbox.new.perform