discourse/script/import_scripts/drupal_qa.rb
Kane York 821124f3fd FEATURE: Skip batches if all records exist
Update all import scripts to take advantage of all_records_exist?
2015-09-21 17:13:02 -07:00

241 lines
7.1 KiB
Ruby

require "mysql2"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require File.expand_path(File.dirname(__FILE__) + "/drupal.rb")
class ImportScripts::DrupalQA < ImportScripts::Drupal
def categories_query
result = @client.query("SELECT n.nid, GROUP_CONCAT(ti.tid) AS tids
FROM node AS n
INNER JOIN taxonomy_index AS ti ON ti.nid = n.nid
WHERE n.type = 'question'
AND n.status = 1
GROUP BY n.nid")
categories = {}
result.each do |r|
tids = r['tids']
if tids.present?
tids = tids.split(',')
categories[tids[0].to_i] = true
end
end
@client.query("SELECT tid, name, description FROM taxonomy_term_data WHERE tid IN (#{categories.keys.join(',')})")
end
def create_forum_topics
puts '', "creating forum topics"
total_count = @client.query("
SELECT COUNT(*) count
FROM node n
WHERE n.type = 'question'
AND n.status = 1;").first['count']
batch_size = 1000
batches(batch_size) do |offset|
results = @client.query("
SELECT n.nid,
n.title,
GROUP_CONCAT(t.tid) AS tid,
n.uid,
n.created,
f.body_value AS body
FROM node AS n
LEFT OUTER JOIN taxonomy_index AS t on t.nid = n.nid
INNER JOIN field_data_body AS f ON f.entity_id = n.nid
WHERE n.type = 'question'
AND n.status = 1
GROUP BY n.nid, n.title, n.uid, n.created, f.body_value
LIMIT #{batch_size}
OFFSET #{offset}
", cache_rows: false);
break if results.size < 1
next if all_records_exist? :posts, results.map {|p| "nid:#{p['nid']}"}
create_posts(results, total: total_count, offset: offset) do |row|
{
id: "nid:#{row['nid']}",
user_id: user_id_from_imported_user_id(row['uid']) || -1,
category: category_id_from_imported_category_id((row['tid'] || '').split(',')[0]),
raw: row['body'],
created_at: Time.zone.at(row['created']),
pinned_at: nil,
title: row['title'].try(:strip)
}
end
end
end
def create_direct_replies
puts '', "creating replies in topics"
total_count = @client.query("
SELECT COUNT(*) count
FROM node n
WHERE n.type = 'answer'
AND n.status = 1;").first['count']
batch_size = 1000
batches(batch_size) do |offset|
results = @client.query("
SELECT n.nid AS cid,
q.field_answer_question_nid AS nid,
n.uid,
n.created,
f.body_value AS body
FROM node AS n
INNER JOIN field_data_field_answer_question AS q ON q.entity_id = n.nid
INNER JOIN field_data_body AS f ON f.entity_id = n.nid
WHERE n.status = 1
AND n.type = 'answer'
LIMIT #{batch_size}
OFFSET #{offset}
", cache_rows: false)
break if results.size < 1
next if all_records_exist? :posts, results.map {|p| "cid:#{p['cid']}"}
create_posts(results, total: total_count, offset: offset) do |row|
topic_mapping = topic_lookup_from_imported_post_id("nid:#{row['nid']}")
if topic_mapping && topic_id = topic_mapping[:topic_id]
h = {
id: "cid:#{row['cid']}",
topic_id: topic_id,
user_id: user_id_from_imported_user_id(row['uid']) || -1,
raw: row['body'],
created_at: Time.zone.at(row['created']),
}
h
else
puts "No topic found for answer #{row['cid']}"
nil
end
end
end
end
def create_nested_replies
puts '', "creating nested replies to posts in topics"
total_count = @client.query("
SELECT COUNT(c.cid) count
FROM node n
INNER JOIN comment AS c ON n.nid = c.nid
WHERE n.type = 'question'
AND n.status = 1;").first['count']
batch_size = 1000
batches(batch_size) do |offset|
# WARNING: If there are more than 1000000 this might have to be revisited
results = @client.query("
SELECT (c.cid + 1000000) as cid,
c.nid,
c.uid,
c.created,
cb.comment_body_value AS body
FROM node AS n
INNER JOIN comment AS c ON c.nid = n.nid
INNER JOIN field_data_comment_body AS cb ON cb.entity_id = c.cid
WHERE n.status = 1
AND n.type = 'question'
LIMIT #{batch_size}
OFFSET #{offset}
", cache_rows: false)
break if results.size < 1
next if all_records_exist? :posts, results.map {|p| "cid:#{p['cid']}"}
create_posts(results, total: total_count, offset: offset) do |row|
topic_mapping = topic_lookup_from_imported_post_id("nid:#{row['nid']}")
if topic_mapping && topic_id = topic_mapping[:topic_id]
h = {
id: "cid:#{row['cid']}",
topic_id: topic_id,
user_id: user_id_from_imported_user_id(row['uid']) || -1,
raw: row['body'],
created_at: Time.zone.at(row['created']),
}
h
else
puts "No topic found for comment #{row['cid']}"
nil
end
end
end
puts '', "creating nested replies to answers in topics"
total_count = @client.query("
SELECT COUNT(c.cid) count
FROM node n
INNER JOIN comment AS c ON n.nid = c.nid
WHERE n.type = 'answer'
AND n.status = 1;").first['count']
batch_size = 1000
batches(batch_size) do |offset|
# WARNING: If there are more than 1000000 this might have to be revisited
results = @client.query("
SELECT (c.cid + 1000000) as cid,
q.field_answer_question_nid AS nid,
c.uid,
c.created,
cb.comment_body_value AS body
FROM node AS n
INNER JOIN field_data_field_answer_question AS q ON q.entity_id = n.nid
INNER JOIN comment AS c ON c.nid = n.nid
INNER JOIN field_data_comment_body AS cb ON cb.entity_id = c.cid
WHERE n.status = 1
AND n.type = 'answer'
LIMIT #{batch_size}
OFFSET #{offset}
", cache_rows: false)
break if results.size < 1
next if all_records_exist? :posts, results.map {|p| "cid:#{p['cid']}"}
create_posts(results, total: total_count, offset: offset) do |row|
topic_mapping = topic_lookup_from_imported_post_id("nid:#{row['nid']}")
if topic_mapping && topic_id = topic_mapping[:topic_id]
h = {
id: "cid:#{row['cid']}",
topic_id: topic_id,
user_id: user_id_from_imported_user_id(row['uid']) || -1,
raw: row['body'],
created_at: Time.zone.at(row['created']),
}
h
else
puts "No topic found for comment #{row['cid']}"
nil
end
end
end
end
def create_replies
create_direct_replies
create_nested_replies
end
end
if __FILE__==$0
ImportScripts::DrupalQA.new.perform
end