mirror of
https://github.com/discourse/discourse.git
synced 2025-01-18 18:42:45 +08:00
PERF: optimise backfilling of topic_id (#13545)
Relying on large offsets can have uneven performance on huge table, new implementation recovers more cleanly and avoids double updates
This commit is contained in:
parent
a6b9289477
commit
14a0247301
|
@ -5,27 +5,22 @@ class BackfillEmailLogTopicId < ActiveRecord::Migration[6.1]
|
|||
BATCH_SIZE = 30_000
|
||||
|
||||
def up
|
||||
offset = 0
|
||||
email_log_count = DB.query_single("SELECT COUNT(*) FROM email_logs").first
|
||||
|
||||
loop do
|
||||
DB.exec(<<~SQL, offset: offset, batch_size: BATCH_SIZE)
|
||||
count = DB.exec(<<~SQL, batch_size: BATCH_SIZE)
|
||||
WITH cte AS (
|
||||
SELECT post_id
|
||||
FROM email_logs
|
||||
ORDER BY id
|
||||
SELECT l1.id, p1.topic_id
|
||||
FROM email_logs l1
|
||||
INNER JOIN posts p1 ON p1.id = l1.post_id
|
||||
WHERE l1.topic_id IS NULL AND p1.topic_id IS NOT NULL
|
||||
LIMIT :batch_size
|
||||
OFFSET :offset
|
||||
)
|
||||
UPDATE email_logs
|
||||
SET topic_id = posts.topic_id
|
||||
SET topic_id = cte.topic_id
|
||||
FROM cte
|
||||
INNER JOIN posts ON posts.id = cte.post_id
|
||||
WHERE email_logs.post_id = cte.post_id
|
||||
WHERE email_logs.id = cte.id
|
||||
SQL
|
||||
|
||||
offset += BATCH_SIZE
|
||||
break if offset > (email_log_count + BATCH_SIZE * 2)
|
||||
break if count == 0
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user