DEV: Import script should insert more data into user_stats table (#26551)

This SQL tries to insert as much data as possible into the `user_stats` table by either calculating or by approximating stats based on existing. It also fixes an error in the calculation of `reply_count`which mistakenly contained all posts, not just replies.

This change also disables some steps in the `import:ensure_consistency` rake task by setting the `SKIP_USER_STATS` env variable. Otherwise, the rake task will overwrite the calculated data in the `user_stats` table with inaccurate data. I'm not changing or removing the logic from the rake task yet because other bulk import scripts seem to depend on it.
This commit is contained in:
Gerhard Schlager 2024-04-11 14:05:21 +02:00 committed by GitHub
parent 4f2b44ba9f
commit 4d045bfc61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 81 additions and 24 deletions

View File

@ -12,12 +12,12 @@ task "import:ensure_consistency" => :environment do
insert_user_actions insert_user_actions
insert_user_options insert_user_options
insert_user_profiles insert_user_profiles
insert_user_stats insert_user_stats unless ENV["SKIP_USER_STATS"]
insert_user_visits insert_user_visits
insert_draft_sequences insert_draft_sequences
insert_automatic_group_users insert_automatic_group_users
update_user_stats update_user_stats unless ENV["SKIP_USER_STATS"]
update_posts update_posts
update_topics update_topics
update_categories update_categories

View File

@ -31,6 +31,7 @@ class BulkImport::Generic < BulkImport::Base
# Now that the migration is complete, do some more work: # Now that the migration is complete, do some more work:
ENV["SKIP_USER_STATS"] = "1"
Discourse::Application.load_tasks Discourse::Application.load_tasks
puts "running 'import:ensure_consistency' rake task." puts "running 'import:ensure_consistency' rake task."
@ -1258,35 +1259,91 @@ class BulkImport::Generic < BulkImport::Base
DB.exec(<<~SQL) DB.exec(<<~SQL)
WITH WITH
post_stats AS ( visible_posts AS (
SELECT p.user_id, COUNT(p.id) AS post_count, MIN(p.created_at) AS first_post_created_at, SELECT p.id, p.post_number, p.user_id, p.created_at, p.like_count, p.topic_id
SUM(like_count) AS likes_received FROM posts p
FROM posts p JOIN topics t ON p.topic_id = t.id
GROUP BY p.user_id WHERE t.archetype = 'regular'
), AND t.deleted_at IS NULL
AND t.visible
AND p.deleted_at IS NULL
AND p.post_type = 1 /* regular_post_type */
AND NOT p.hidden
),
topic_stats AS ( topic_stats AS (
SELECT t.user_id, COUNT(t.id) AS topic_count FROM topics t GROUP BY t.user_id SELECT t.user_id, COUNT(t.id) AS topic_count
), FROM topics t
WHERE t.archetype = 'regular'
AND t.deleted_at IS NULL
AND t.visible
GROUP BY t.user_id
),
post_stats AS (
SELECT p.user_id, MIN(p.created_at) AS first_post_created_at, SUM(p.like_count) AS likes_received
FROM visible_posts p
GROUP BY p.user_id
),
reply_stats AS (
SELECT p.user_id, COUNT(p.id) AS reply_count
FROM visible_posts p
WHERE p.post_number > 1
GROUP BY p.user_id
),
like_stats AS ( like_stats AS (
SELECT pa.user_id, COUNT(*) AS likes_given SELECT pa.user_id, COUNT(*) AS likes_given
FROM post_actions pa FROM post_actions pa
WHERE pa.post_action_type_id = 2 JOIN visible_posts p ON pa.post_id = p.id
GROUP BY pa.user_id WHERE pa.post_action_type_id = 2 /* like */
) AND pa.deleted_at IS NULL
GROUP BY pa.user_id
),
badge_stats AS (
SELECT ub.user_id, COUNT(DISTINCT ub.badge_id) AS distinct_badge_count
FROM user_badges ub
JOIN badges b ON ub.badge_id = b.id AND b.enabled
GROUP BY ub.user_id
),
post_action_stats AS ( -- posts created by user and likes given by user
SELECT p.user_id, p.id AS post_id, p.created_at::DATE, p.topic_id, p.post_number
FROM visible_posts p
UNION
SELECT pa.user_id, pa.post_id, pa.created_at::DATE, p.topic_id, p.post_number
FROM post_actions pa
JOIN visible_posts p ON pa.post_id = p.id
WHERE pa.post_action_type_id = 2
),
topic_reading_stats AS (
SELECT user_id, COUNT(DISTINCT topic_id) AS topics_entered,
COUNT(DISTINCT created_at) AS days_visited
FROM post_action_stats
GROUP BY user_id
),
posts_reading_stats AS (
SELECT user_id, SUM(max_post_number) AS posts_read_count
FROM (
SELECT user_id, MAX(post_number) AS max_post_number
FROM post_action_stats
GROUP BY user_id, topic_id
) x
GROUP BY user_id
)
INSERT INSERT
INTO user_stats (user_id, new_since, post_count, topic_count, first_post_created_at, likes_received, likes_given) INTO user_stats (user_id, new_since, post_count, topic_count, first_post_created_at, likes_received,
SELECT u.id, u.created_at AS new_since, COALESCE(ps.post_count, 0) AS post_count, likes_given, distinct_badge_count, days_visited, topics_entered, posts_read_count, time_read)
SELECT u.id AS user_id, u.created_at AS new_since, COALESCE(rs.reply_count, 0) AS reply_count,
COALESCE(ts.topic_count, 0) AS topic_count, ps.first_post_created_at, COALESCE(ts.topic_count, 0) AS topic_count, ps.first_post_created_at,
COALESCE(ps.likes_received, 0) AS likes_received, COALESCE(ls.likes_given, 0) AS likes_given COALESCE(ps.likes_received, 0) AS likes_received, COALESCE(ls.likes_given, 0) AS likes_given,
COALESCE(bs.distinct_badge_count, 0) AS distinct_badge_count, COALESCE(trs.days_visited, 1) AS days_visited,
COALESCE(trs.topics_entered, 0) AS topics_entered, COALESCE(prs.posts_read_count, 0) AS posts_read_count,
COALESCE(prs.posts_read_count, 0) * 30 AS time_read -- assume 30 seconds / post
FROM users u FROM users u
LEFT JOIN post_stats ps ON u.id = ps.user_id
LEFT JOIN topic_stats ts ON u.id = ts.user_id LEFT JOIN topic_stats ts ON u.id = ts.user_id
LEFT JOIN post_stats ps ON u.id = ps.user_id
LEFT JOIN reply_stats rs ON u.id = rs.user_id
LEFT JOIN like_stats ls ON u.id = ls.user_id LEFT JOIN like_stats ls ON u.id = ls.user_id
WHERE NOT EXISTS ( LEFT JOIN badge_stats bs ON u.id = bs.user_id
SELECT 1 LEFT JOIN topic_reading_stats trs ON u.id = trs.user_id
FROM user_stats us LEFT JOIN posts_reading_stats prs ON u.id = prs.user_id
WHERE us.user_id = u.id
)
ON CONFLICT DO NOTHING ON CONFLICT DO NOTHING
SQL SQL