discourse/lib/tasks/import.rake
Sam 5f64fd0a21 DEV: remove exec_sql and replace with mini_sql
Introduce new patterns for direct sql that are safe and fast.

MiniSql is not prone to memory bloat that can happen with direct PG usage.
It also has an extremely fast materializer and very a convenient API

- DB.exec(sql, *params) => runs sql returns row count
- DB.query(sql, *params) => runs sql returns usable objects (not a hash)
- DB.query_hash(sql, *params) => runs sql returns an array of hashes
- DB.query_single(sql, *params) => runs sql and returns a flat one dimensional array
- DB.build(sql) => returns a sql builder

See more at: https://github.com/discourse/mini_sql
2018-06-19 16:13:36 +10:00

508 lines
15 KiB
Ruby

# Use http://tatiyants.com/pev/#/plans/new if you want to optimize a query
task "import:ensure_consistency" => :environment do
log "Starting..."
insert_post_timings
insert_post_replies
insert_topic_users
insert_topic_views
insert_user_actions
insert_user_options
insert_user_stats
insert_user_visits
insert_draft_sequences
update_user_stats
update_posts
update_topics
update_categories
update_users
update_groups
update_tag_stats
log "Done!"
end
MS_SPEND_CREATING_POST ||= 5000
def insert_post_timings
log "Inserting post timings..."
DB.exec <<-SQL
INSERT INTO post_timings (topic_id, post_number, user_id, msecs)
SELECT topic_id, post_number, user_id, #{MS_SPEND_CREATING_POST}
FROM posts
WHERE user_id > 0
ON CONFLICT DO NOTHING
SQL
end
def insert_post_replies
log "Inserting post replies..."
DB.exec <<-SQL
INSERT INTO post_replies (post_id, reply_id, created_at, updated_at)
SELECT p2.id, p.id, p.created_at, p.created_at
FROM posts p
INNER JOIN posts p2 ON p2.post_number = p.reply_to_post_number AND p2.topic_id = p.topic_id
ON CONFLICT DO NOTHING
SQL
end
def insert_topic_users
log "Inserting topic users..."
DB.exec <<-SQL
INSERT INTO topic_users (user_id, topic_id, posted, last_read_post_number, highest_seen_post_number, first_visited_at, last_visited_at, total_msecs_viewed)
SELECT user_id, topic_id, 't' , MAX(post_number), MAX(post_number), MIN(created_at), MAX(created_at), COUNT(id) * #{MS_SPEND_CREATING_POST}
FROM posts
WHERE user_id > 0
GROUP BY user_id, topic_id
ON CONFLICT DO NOTHING
SQL
end
def insert_topic_views
log "Inserting topic views..."
DB.exec <<-SQL
WITH X AS (
SELECT topic_id, user_id, DATE(p.created_at) posted_at
FROM posts p
JOIN users u ON u.id = p.user_id
WHERE user_id > 0
GROUP BY topic_id, user_id, DATE(p.created_at)
)
INSERT INTO topic_views (topic_id, user_id, viewed_at, ip_address)
SELECT X.topic_id, X.user_id, X.posted_at, ip_address
FROM X
JOIN users u ON u.id = X.user_id
WHERE ip_address IS NOT NULL
ON CONFLICT DO NOTHING
SQL
end
def insert_user_actions
log "Inserting user actions for NEW_TOPIC = 4..."
DB.exec <<-SQL
INSERT INTO user_actions (action_type, user_id, target_topic_id, target_post_id, acting_user_id, created_at, updated_at)
SELECT 4, p.user_id, topic_id, p.id, p.user_id, p.created_at, p.created_at
FROM posts p
JOIN topics t ON t.id = p.topic_id
WHERE post_number = 1
AND archetype <> 'private_message'
AND p.deleted_at IS NULL
AND t.deleted_at IS NULL
ON CONFLICT DO NOTHING
SQL
log "Inserting user actions for REPLY = 5..."
DB.exec <<-SQL
INSERT INTO user_actions (action_type, user_id, target_topic_id, target_post_id, acting_user_id, created_at, updated_at)
SELECT 5, p.user_id, topic_id, p.id, p.user_id, p.created_at, p.created_at
FROM posts p
JOIN topics t ON t.id = p.topic_id
WHERE post_number > 1
AND archetype <> 'private_message'
AND p.deleted_at IS NULL
AND t.deleted_at IS NULL
ON CONFLICT DO NOTHING
SQL
log "Inserting user actions for RESPONSE = 6..."
DB.exec <<-SQL
INSERT INTO user_actions (action_type, user_id, target_topic_id, target_post_id, acting_user_id, created_at, updated_at)
SELECT 6, p.user_id, p.topic_id, p.id, p2.user_id, p.created_at, p.created_at
FROM posts p
JOIN topics t ON t.id = p.topic_id
INNER JOIN posts p2 ON p2.post_number = p.reply_to_post_number
WHERE p.post_number > 1
AND archetype <> 'private_message'
AND p.deleted_at IS NULL
AND t.deleted_at IS NULL
AND p2.topic_id = p.topic_id
AND p2.user_id <> p.user_id
ON CONFLICT DO NOTHING
SQL
# TODO:
# - NEW_PRIVATE_MESSAGE
# - GOT_PRIVATE_MESSAGE
end
def insert_user_options
log "Inserting user options..."
DB.exec <<-SQL
INSERT INTO user_options (
user_id,
email_always,
mailing_list_mode,
mailing_list_mode_frequency,
email_direct,
email_private_messages,
email_previous_replies,
email_in_reply_to,
email_digests,
digest_after_minutes,
include_tl0_in_digests,
automatically_unpin_topics,
enable_quoting,
external_links_in_new_tab,
dynamic_favicon,
disable_jump_reply,
new_topic_duration_minutes,
auto_track_topics_after_msecs,
notification_level_when_replying,
like_notification_frequency
)
SELECT u.id
, #{SiteSetting.default_email_always}
, #{SiteSetting.default_email_mailing_list_mode}
, #{SiteSetting.default_email_mailing_list_mode_frequency}
, #{SiteSetting.default_email_direct}
, #{SiteSetting.default_email_personal_messages}
, #{SiteSetting.default_email_previous_replies}
, #{SiteSetting.default_email_in_reply_to}
, #{SiteSetting.default_email_digest_frequency.to_i > 0}
, #{SiteSetting.default_email_digest_frequency}
, #{SiteSetting.default_include_tl0_in_digests}
, #{SiteSetting.default_topics_automatic_unpin}
, #{SiteSetting.default_other_enable_quoting}
, #{SiteSetting.default_other_external_links_in_new_tab}
, #{SiteSetting.default_other_dynamic_favicon}
, #{SiteSetting.default_other_disable_jump_reply}
, #{SiteSetting.default_other_new_topic_duration_minutes}
, #{SiteSetting.default_other_auto_track_topics_after_msecs}
, #{SiteSetting.default_other_notification_level_when_replying}
, #{SiteSetting.default_other_like_notification_frequency}
FROM users u
LEFT JOIN user_options uo ON uo.user_id = u.id
WHERE uo.user_id IS NULL
SQL
end
def insert_user_stats
log "Inserting user stats..."
DB.exec <<-SQL
INSERT INTO user_stats (user_id, new_since)
SELECT id, created_at
FROM users
ON CONFLICT DO NOTHING
SQL
end
def insert_user_visits
log "Inserting user visits..."
DB.exec <<-SQL
INSERT INTO user_visits (user_id, visited_at, posts_read)
SELECT user_id, DATE(created_at), COUNT(*)
FROM posts
WHERE user_id > 0
GROUP BY user_id, DATE(created_at)
ON CONFLICT DO NOTHING
SQL
end
def insert_draft_sequences
log "Inserting draft sequences..."
DB.exec <<-SQL
INSERT INTO draft_sequences (user_id, draft_key, sequence)
SELECT user_id, CONCAT('#{Draft::EXISTING_TOPIC}', id), 1
FROM topics
WHERE user_id > 0
AND archetype = 'regular'
ON CONFLICT DO NOTHING
SQL
end
def update_user_stats
log "Updating user stats..."
DB.exec <<-SQL
WITH X AS (
SELECT p.user_id
, COUNT(p.id) posts
, COUNT(DISTINCT p.topic_id) topics
, MIN(p.created_at) min_created_at
, COALESCE(COUNT(DISTINCT DATE(p.created_at)), 0) days
, COALESCE((
SELECT COUNT(*)
FROM topics
WHERE id IN (
SELECT topic_id
FROM posts p2
JOIN topics t2 ON t2.id = p2.topic_id
WHERE p2.deleted_at IS NULL
AND p2.post_type = 1
AND NOT COALESCE(p2.hidden, 't')
AND p2.user_id <> t2.user_id
AND p2.user_id = p.user_id
)
), 0) topic_replies
FROM posts p
JOIN topics t ON t.id = p.topic_id
WHERE p.deleted_at IS NULL
AND NOT COALESCE(p.hidden, 't')
AND p.post_type = 1
AND t.deleted_at IS NULL
AND COALESCE(t.visible, 't')
AND t.archetype <> 'private_message'
AND p.user_id > 0
GROUP BY p.user_id
)
UPDATE user_stats
SET post_count = X.posts
, posts_read_count = X.posts
, time_read = X.posts * 5
, topic_count = X.topics
, topics_entered = X.topics
, first_post_created_at = X.min_created_at
, days_visited = X.days
, topic_reply_count = X.topic_replies
FROM X
WHERE user_stats.user_id = X.user_id
AND (post_count <> X.posts
OR posts_read_count <> X.posts
OR time_read <> X.posts * 5
OR topic_count <> X.topics
OR topics_entered <> X.topics
OR COALESCE(first_post_created_at, '1970-01-01') <> X.min_created_at
OR days_visited <> X.days
OR topic_reply_count <> X.topic_replies)
SQL
end
def update_posts
log "Updating posts..."
DB.exec <<-SQL
WITH Y AS (
SELECT post_id, COUNT(*) replies FROM post_replies GROUP BY post_id
)
UPDATE posts
SET reply_count = Y.replies
FROM Y
WHERE posts.id = Y.post_id
AND reply_count <> Y.replies
SQL
# -- TODO: ensure this is how this works!
# WITH X AS (
# SELECT pr.post_id, p.user_id
# FROM post_replies pr
# JOIN posts p ON p.id = pr.reply_id
# )
# UPDATE posts
# SET reply_to_user_id = X.user_id
# FROM X
# WHERE id = X.post_id
# AND COALESCE(reply_to_user_id, -9999) <> X.user_id
end
def update_topics
log "Updating topics..."
DB.exec <<-SQL
WITH X AS (
SELECT topic_id
, COUNT(*) posts
, MAX(created_at) last_post_date
, COALESCE(SUM(word_count), 0) words
, COALESCE(SUM(reply_count), 0) replies
, ( SELECT user_id
FROM posts
WHERE NOT hidden
AND deleted_at IS NULL
AND topic_id = p.topic_id
ORDER BY post_number DESC
LIMIT 1) last_poster
FROM posts p
WHERE NOT hidden
AND deleted_at IS NULL
GROUP BY topic_id
)
UPDATE topics
SET posts_count = X.posts
, last_posted_at = X.last_post_date
, bumped_at = X.last_post_date
, word_count = X.words
, reply_count = X.replies
, last_post_user_id = X.last_poster
FROM X
WHERE id = X.topic_id
AND (posts_count <> X.posts
OR COALESCE(last_posted_at, '1970-01-01') <> X.last_post_date
OR bumped_at <> X.last_post_date
OR COALESCE(word_count, -1) <> X.words
OR COALESCE(reply_count, -1) <> X.replies
OR COALESCE(last_post_user_id, -9999) <> X.last_poster)
SQL
end
def update_categories
log "Updating categories..."
DB.exec <<-SQL
WITH X AS (
SELECT category_id
, MAX(p.id) post_id
, MAX(t.id) topic_id
, COUNT(p.id) posts
, COUNT(DISTINCT p.topic_id) topics
FROM posts p
JOIN topics t ON t.id = p.topic_id
WHERE p.deleted_at IS NULL
AND t.deleted_at IS NULL
AND NOT p.hidden
AND t.visible
GROUP BY category_id
)
UPDATE categories
SET latest_post_id = X.post_id
, latest_topic_id = X.topic_id
, post_count = X.posts
, topic_count = X.topics
FROM X
WHERE id = X.category_id
AND (COALESCE(latest_post_id, -1) <> X.post_id
OR COALESCE(latest_topic_id, -1) <> X.topic_id
OR post_count <> X.posts
OR topic_count <> X.topics)
SQL
end
def update_users
log "Updating users..."
DB.exec <<-SQL
WITH X AS (
SELECT user_id
, MIN(created_at) min_created_at
, MAX(created_at) max_created_at
FROM posts
WHERE deleted_at IS NULL
GROUP BY user_id
)
UPDATE users
SET first_seen_at = X.min_created_at
, last_seen_at = X.max_created_at
, last_posted_at = X.max_created_at
FROM X
WHERE id = X.user_id
AND (COALESCE(first_seen_at, '1970-01-01') <> X.min_created_at
OR COALESCE(last_seen_at, '1970-01-01') <> X.max_created_at
OR COALESCE(last_posted_at, '1970-01-01') <> X.max_created_at)
SQL
end
def update_groups
log "Updating groups..."
DB.exec <<-SQL
WITH X AS (
SELECT group_id, COUNT(*) count
FROM group_users
GROUP BY group_id
)
UPDATE groups
SET user_count = X.count
FROM X
WHERE id = X.group_id
AND user_count <> X.count
SQL
end
def update_tag_stats
Tag.ensure_consistency!
end
def log(message)
puts "[#{DateTime.now.strftime("%Y-%m-%d %H:%M:%S")}] #{message}"
end
task "import:create_phpbb_permalinks" => :environment do
log 'Creating Permalinks...'
# /[^\/]+\/.*-t(\d+).html/
SiteSetting.permalink_normalizations = '/[^\/]+\/.*-t(\d+).html/thread/\1'
Topic.listable_topics.find_each do |topic|
tcf = topic.custom_fields
if tcf && tcf["import_id"]
Permalink.create(url: "thread/#{tcf["import_id"]}", topic_id: topic.id) rescue nil
end
end
log "Done!"
end
task "import:remap_old_phpbb_permalinks" => :environment do
log 'Remapping Permalinks...'
i = 0
Post.where("raw LIKE ?", "%discussions.example.com%").each do |p|
begin
new_raw = p.raw.dup
# \((https?:\/\/discussions\.example\.com\/\S*-t\d+.html)\)
new_raw.gsub!(/\((https?:\/\/discussions\.example\.com\/\S*-t\d+.html)\)/) do
normalized_url = Permalink.normalize_url($1)
permalink = Permalink.find_by_url(normalized_url) rescue nil
if permalink && permalink.target_url
"(#{permalink.target_url})"
else
"(#{$1})"
end
end
if new_raw != p.raw
p.revise(Discourse.system_user, { raw: new_raw }, bypass_bump: true, skip_revision: true)
putc "."
i += 1
end
rescue
# skip
end
end
log "Done! #{i} posts remapped."
end
task "import:create_vbulletin_permalinks" => :environment do
log 'Creating Permalinks...'
# /showthread.php\?t=(\d+).*/
SiteSetting.permalink_normalizations = '/showthread.php\?t=(\d+).*/showthread.php?t=\1'
Topic.listable_topics.find_each do |topic|
tcf = topic.custom_fields
if tcf && tcf["import_id"]
Permalink.create(url: "showthread.php?t=#{tcf["import_id"]}", topic_id: topic.id) rescue nil
end
end
Category.find_each do |cat|
ccf = cat.custom_fields
if ccf && ccf["import_id"]
Permalink.create(url: "forumdisplay.php?f=#{ccf["import_id"]}", category_id: cat.id) rescue nil
end
end
log "Done!"
end
desc 'Import existing exported file'
task 'import:file', [:file_name] => [:environment] do |_, args|
require "import_export/import_export"
ImportExport.import(args[:file_name])
puts "", "Done", ""
end