DEV: Import script should insert more data into user_stats table (#26551)

This SQL tries to insert as much data as possible into the `user_stats` table by either calculating or by approximating stats based on existing. It also fixes an error in the calculation of `reply_count`which mistakenly contained all posts, not just replies.

This change also disables some steps in the `import:ensure_consistency` rake task by setting the `SKIP_USER_STATS` env variable. Otherwise, the rake task will overwrite the calculated data in the `user_stats` table with inaccurate data. I'm not changing or removing the logic from the rake task yet because other bulk import scripts seem to depend on it.
This commit is contained in:
Gerhard Schlager 2024-04-11 14:05:21 +02:00 committed by GitHub
parent 4f2b44ba9f
commit 4d045bfc61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 81 additions and 24 deletions

View File

@ -12,12 +12,12 @@ task "import:ensure_consistency" => :environment do
insert_user_actions
insert_user_options
insert_user_profiles
insert_user_stats
insert_user_stats unless ENV["SKIP_USER_STATS"]
insert_user_visits
insert_draft_sequences
insert_automatic_group_users
update_user_stats
update_user_stats unless ENV["SKIP_USER_STATS"]
update_posts
update_topics
update_categories

View File

@ -31,6 +31,7 @@ class BulkImport::Generic < BulkImport::Base
# Now that the migration is complete, do some more work:
ENV["SKIP_USER_STATS"] = "1"
Discourse::Application.load_tasks
puts "running 'import:ensure_consistency' rake task."
@ -1258,35 +1259,91 @@ class BulkImport::Generic < BulkImport::Base
DB.exec(<<~SQL)
WITH
post_stats AS (
SELECT p.user_id, COUNT(p.id) AS post_count, MIN(p.created_at) AS first_post_created_at,
SUM(like_count) AS likes_received
FROM posts p
GROUP BY p.user_id
),
visible_posts AS (
SELECT p.id, p.post_number, p.user_id, p.created_at, p.like_count, p.topic_id
FROM posts p
JOIN topics t ON p.topic_id = t.id
WHERE t.archetype = 'regular'
AND t.deleted_at IS NULL
AND t.visible
AND p.deleted_at IS NULL
AND p.post_type = 1 /* regular_post_type */
AND NOT p.hidden
),
topic_stats AS (
SELECT t.user_id, COUNT(t.id) AS topic_count FROM topics t GROUP BY t.user_id
),
SELECT t.user_id, COUNT(t.id) AS topic_count
FROM topics t
WHERE t.archetype = 'regular'
AND t.deleted_at IS NULL
AND t.visible
GROUP BY t.user_id
),
post_stats AS (
SELECT p.user_id, MIN(p.created_at) AS first_post_created_at, SUM(p.like_count) AS likes_received
FROM visible_posts p
GROUP BY p.user_id
),
reply_stats AS (
SELECT p.user_id, COUNT(p.id) AS reply_count
FROM visible_posts p
WHERE p.post_number > 1
GROUP BY p.user_id
),
like_stats AS (
SELECT pa.user_id, COUNT(*) AS likes_given
FROM post_actions pa
WHERE pa.post_action_type_id = 2
GROUP BY pa.user_id
)
SELECT pa.user_id, COUNT(*) AS likes_given
FROM post_actions pa
JOIN visible_posts p ON pa.post_id = p.id
WHERE pa.post_action_type_id = 2 /* like */
AND pa.deleted_at IS NULL
GROUP BY pa.user_id
),
badge_stats AS (
SELECT ub.user_id, COUNT(DISTINCT ub.badge_id) AS distinct_badge_count
FROM user_badges ub
JOIN badges b ON ub.badge_id = b.id AND b.enabled
GROUP BY ub.user_id
),
post_action_stats AS ( -- posts created by user and likes given by user
SELECT p.user_id, p.id AS post_id, p.created_at::DATE, p.topic_id, p.post_number
FROM visible_posts p
UNION
SELECT pa.user_id, pa.post_id, pa.created_at::DATE, p.topic_id, p.post_number
FROM post_actions pa
JOIN visible_posts p ON pa.post_id = p.id
WHERE pa.post_action_type_id = 2
),
topic_reading_stats AS (
SELECT user_id, COUNT(DISTINCT topic_id) AS topics_entered,
COUNT(DISTINCT created_at) AS days_visited
FROM post_action_stats
GROUP BY user_id
),
posts_reading_stats AS (
SELECT user_id, SUM(max_post_number) AS posts_read_count
FROM (
SELECT user_id, MAX(post_number) AS max_post_number
FROM post_action_stats
GROUP BY user_id, topic_id
) x
GROUP BY user_id
)
INSERT
INTO user_stats (user_id, new_since, post_count, topic_count, first_post_created_at, likes_received, likes_given)
SELECT u.id, u.created_at AS new_since, COALESCE(ps.post_count, 0) AS post_count,
INTO user_stats (user_id, new_since, post_count, topic_count, first_post_created_at, likes_received,
likes_given, distinct_badge_count, days_visited, topics_entered, posts_read_count, time_read)
SELECT u.id AS user_id, u.created_at AS new_since, COALESCE(rs.reply_count, 0) AS reply_count,
COALESCE(ts.topic_count, 0) AS topic_count, ps.first_post_created_at,
COALESCE(ps.likes_received, 0) AS likes_received, COALESCE(ls.likes_given, 0) AS likes_given
COALESCE(ps.likes_received, 0) AS likes_received, COALESCE(ls.likes_given, 0) AS likes_given,
COALESCE(bs.distinct_badge_count, 0) AS distinct_badge_count, COALESCE(trs.days_visited, 1) AS days_visited,
COALESCE(trs.topics_entered, 0) AS topics_entered, COALESCE(prs.posts_read_count, 0) AS posts_read_count,
COALESCE(prs.posts_read_count, 0) * 30 AS time_read -- assume 30 seconds / post
FROM users u
LEFT JOIN post_stats ps ON u.id = ps.user_id
LEFT JOIN topic_stats ts ON u.id = ts.user_id
LEFT JOIN post_stats ps ON u.id = ps.user_id
LEFT JOIN reply_stats rs ON u.id = rs.user_id
LEFT JOIN like_stats ls ON u.id = ls.user_id
WHERE NOT EXISTS (
SELECT 1
FROM user_stats us
WHERE us.user_id = u.id
)
LEFT JOIN badge_stats bs ON u.id = bs.user_id
LEFT JOIN topic_reading_stats trs ON u.id = trs.user_id
LEFT JOIN posts_reading_stats prs ON u.id = prs.user_id
ON CONFLICT DO NOTHING
SQL