DEV: make sure we don't load all data into memory when exporting chat messages (#22276)

This commit makes sure we don't load all data into memory when doing CSV exports. 
The most important change here made to the recently introduced export of chat 
messages (3ea31f4). We were loading all data into memory in the first version, with 
this commit it's not the case anymore.

Speaking of old exports. Some of them already use find_each, and it worked as 
expected, without loading all data into memory. And it will proceed working as 
expected after this commit.

In general, I made sure this change didn't break other CSV exports, first manually, and 
then by writing system specs for them. Sadly, I haven't managed yet to make those 
specs stable, they work fine locally, but flaky in GitHub actions, so I've disabled them 
for now.

I'll be making more changes to the CSV exports code soon, those system specs will be 
very helpful. I'll be running them locally, and I hope I'll manage to make them stable 
while doing that work.
This commit is contained in:
Andrei Prigorshnev 2023-07-12 18:52:18 +04:00 committed by GitHub
parent b7404373cf
commit fbe0e4c78c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 501 additions and 37 deletions

View File

@ -86,25 +86,9 @@ module Jobs
filename = entities[0][:filename] # use first entity as a name for this export filename = entities[0][:filename] # use first entity as a name for this export
user_export = UserExport.create(file_name: filename, user_id: @current_user.id) user_export = UserExport.create(file_name: filename, user_id: @current_user.id)
filename = "#{filename}-#{user_export.id}" filename = "#{filename}-#{user_export.id}"
dirname = "#{UserExport.base_directory}/#{filename}"
# ensure directory exists zip_filename = write_to_csv_and_zip(filename, entities)
FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)
# Generate a compressed CSV file
begin
entities.each do |entity|
CSV.open("#{dirname}/#{entity[:filename]}.csv", "w") do |csv|
csv << get_header(entity[:name]) if entity[:name] != "report"
public_send(entity[:method]).each { |d| csv << d }
end
end
zip_filename = Compression::Zip.new.compress(UserExport.base_directory, filename)
ensure
FileUtils.rm_rf(dirname)
end
# create upload # create upload
upload = nil upload = nil
@ -476,5 +460,22 @@ module Jobs
post post
end end
def write_to_csv_and_zip(filename, entities)
dirname = "#{UserExport.base_directory}/#{filename}"
FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)
begin
entities.each do |entity|
CSV.open("#{dirname}/#{entity[:filename]}.csv", "w") do |csv|
csv << get_header(entity[:name]) if entity[:name] != "report"
public_send(entity[:method]) { |d| csv << d }
end
end
Compression::Zip.new.compress(UserExport.base_directory, filename)
ensure
FileUtils.rm_rf(dirname)
end
end
end end
end end

View File

@ -8,26 +8,29 @@ module Chat
Chat::Message Chat::Message
.unscoped .unscoped
.where(created_at: 6.months.ago..Time.current) .where(created_at: 6.months.ago..Time.current)
.joins(:chat_channel) .includes(:chat_channel)
.joins(:user) .includes(:user)
.joins("INNER JOIN users last_editors ON chat_messages.last_editor_id = last_editors.id") .includes(:last_editor)
.order(:created_at)
.limit(LIMIT) .limit(LIMIT)
.pluck( .find_each do |chat_message|
"chat_messages.id", yield(
"chat_channels.id", [
"chat_channels.name", chat_message.id,
"users.id", chat_message.chat_channel.id,
"users.username", chat_message.chat_channel.name,
"chat_messages.message", chat_message.user.id,
"chat_messages.cooked", chat_message.user.username,
"chat_messages.created_at", chat_message.message,
"chat_messages.updated_at", chat_message.cooked,
"chat_messages.deleted_at", chat_message.created_at,
"chat_messages.in_reply_to_id", chat_message.updated_at,
"last_editors.id", chat_message.deleted_at,
"last_editors.username", chat_message.in_reply_to&.id,
) chat_message.last_editor&.id,
chat_message.last_editor&.username,
]
)
end
end end
def get_header(entity) def get_header(entity)

View File

@ -22,7 +22,8 @@ describe Chat::MessagesExporter do
it "exports messages" do it "exports messages" do
exporter = Class.new.extend(Chat::MessagesExporter) exporter = Class.new.extend(Chat::MessagesExporter)
result = exporter.chat_message_export.to_a result = []
exporter.chat_message_export { |data_row| result << data_row }
expect(result.length).to be(7) expect(result.length).to be(7)
assert_exported_message(result[0], public_channel_message_1) assert_exported_message(result[0], public_channel_message_1)

View File

@ -0,0 +1,63 @@
# frozen_string_literal: true
RSpec.describe "Chat CSV exports", type: :system do
fab!(:admin) { Fabricate(:admin) }
let(:csv_export_pm_page) { PageObjects::Pages::CSVExportPM.new }
before do
Jobs.run_immediately!
sign_in(admin)
chat_system_bootstrap
end
xit "exports chat messages" do
message = Fabricate(:chat_message)
visit "/admin/plugins/chat"
click_button "Create export"
visit "/u/#{admin.username}/messages"
click_link "[Chat Message] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.first).to eq(
%w[
id
chat_channel_id
chat_channel_name
user_id
username
message
cooked
created_at
updated_at
deleted_at
in_reply_to_id
last_editor_id
last_editor_username
],
)
time_format = "%Y-%m-%d %H:%M:%S UTC"
expect(exported_data.second).to eq(
[
message.id.to_s,
message.chat_channel.id.to_s,
message.chat_channel.name,
message.user.id.to_s,
message.user.username,
message.message,
message.cooked,
message.created_at.strftime(time_format),
message.updated_at.strftime(time_format),
nil,
nil,
message.last_editor.id.to_s,
message.last_editor.username,
],
)
ensure
csv_export_pm_page.clear_downloads
end
end

View File

@ -77,6 +77,7 @@ end
# in spec/support/ and its subdirectories. # in spec/support/ and its subdirectories.
Dir[Rails.root.join("spec/support/**/*.rb")].each { |f| require f } Dir[Rails.root.join("spec/support/**/*.rb")].each { |f| require f }
Dir[Rails.root.join("spec/system/helpers/**/*.rb")].each { |f| require f }
Dir[Rails.root.join("spec/system/page_objects/**/base.rb")].each { |f| require f } Dir[Rails.root.join("spec/system/page_objects/**/base.rb")].each { |f| require f }
Dir[Rails.root.join("spec/system/page_objects/**/*.rb")].each { |f| require f } Dir[Rails.root.join("spec/system/page_objects/**/*.rb")].each { |f| require f }
@ -286,6 +287,7 @@ RSpec.configure do |config|
.tap do |options| .tap do |options|
apply_base_chrome_options(options) apply_base_chrome_options(options)
options.add_argument("--window-size=1400,1400") options.add_argument("--window-size=1400,1400")
options.add_preference("download.default_directory", Downloads::FOLDER)
end end
Capybara.register_driver :selenium_chrome do |app| Capybara.register_driver :selenium_chrome do |app|
@ -348,6 +350,7 @@ RSpec.configure do |config|
config.after(:suite) do config.after(:suite) do
FileUtils.remove_dir(concurrency_safe_tmp_dir, true) if SpecSecureRandom.value FileUtils.remove_dir(concurrency_safe_tmp_dir, true) if SpecSecureRandom.value
Downloads.clear
end end
config.around :each do |example| config.around :each do |example|

View File

@ -0,0 +1,314 @@
# frozen_string_literal: true
RSpec.describe "CSV Exports", type: :system do
fab!(:admin) { Fabricate(:admin) }
let(:csv_export_pm_page) { PageObjects::Pages::CSVExportPM.new }
time_format = "%Y-%m-%d %H:%M:%S UTC"
before do
Jobs.run_immediately!
sign_in(admin)
end
context "with user list" do
fab!(:group1) { Fabricate(:group) }
fab!(:group2) { Fabricate(:group) }
fab!(:user) do
Fabricate(
:user,
title: "dr",
last_seen_at: Time.now,
last_posted_at: Time.now,
last_emailed_at: Time.now,
approved: true,
suspended_at: Time.now,
suspended_till: Time.now,
silenced_till: Time.now,
admin: true,
moderator: true,
staged: true,
group_ids: [group1.id, group2.id],
)
end
let(:second_email) { "second_email@discourse.org" }
let(:third_email) { "third_email@discourse.org" }
before do
user.user_emails.create!(email: second_email)
user.user_emails.create!(email: third_email)
user.user_stat.topics_entered = 111
user.user_stat.posts_read_count = 112
user.user_stat.time_read = 113
user.user_stat.topic_count = 114
user.user_stat.post_count = 115
user.user_stat.likes_given = 116
user.user_stat.likes_received = 117
user.user_stat.save!
user.user_profile.location = "Tbilisi"
user.user_profile.website = "https://www.discourse.org"
user.user_profile.views = 5
user.user_profile.save!
end
xit "exports data" do
visit "admin/users/list/active"
click_button "Export"
visit "/u/#{admin.username}/messages"
click_link "[User List] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.length).to be(5)
expect(exported_data.first).to eq(
%w[
id
name
username
email
title
created_at
last_seen_at
last_posted_at
last_emailed_at
trust_level
approved
suspended_at
suspended_till
silenced_till
active
admin
moderator
ip_address
staged
secondary_emails
topics_entered
posts_read_count
time_read
topic_count
post_count
likes_given
likes_received
location
website
views
group_names
],
)
expect(exported_data.last).to eq(
[
user.id.to_s,
user.name,
user.username,
user.email,
user.title,
user.created_at.strftime(time_format),
user.last_seen_at.strftime(time_format),
user.last_posted_at.strftime(time_format),
user.last_emailed_at.strftime(time_format),
user.trust_level.to_s,
user.approved.to_s,
user.suspended_at.strftime(time_format),
user.suspended_till.strftime(time_format),
user.silenced_till.strftime(time_format),
user.active.to_s,
user.admin.to_s,
user.moderator.to_s,
user.ip_address.to_s,
user.staged.to_s,
"#{second_email};#{third_email}",
user.user_stat.topics_entered.to_s,
user.user_stat.posts_read_count.to_s,
user.user_stat.time_read.to_s,
user.user_stat.topic_count.to_s,
user.user_stat.post_count.to_s,
user.user_stat.likes_given.to_s,
user.user_stat.likes_received.to_s,
user.user_profile.location,
user.user_profile.website,
user.user_profile.views.to_s,
"#{group1.name};#{group2.name}",
],
)
ensure
csv_export_pm_page.clear_downloads
end
end
context "with stuff actions log" do
fab!(:user_history) do
Fabricate(
:user_history,
acting_user: admin,
action: UserHistory.actions[:change_site_setting],
subject: "default_trust_level",
details: "details",
context: "context",
)
end
xit "exports data" do
visit "admin/logs/staff_action_logs"
click_button "Export"
visit "/u/#{admin.username}/messages"
click_link "[Staff Action] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.first).to eq(%w[staff_user action subject created_at details context])
expect(exported_data.last).to eq(
[
user_history.acting_user.username,
"change_site_setting",
user_history.subject,
user_history.created_at.strftime(time_format),
user_history.details,
user_history.context,
],
)
ensure
csv_export_pm_page.clear_downloads
end
end
context "with reports" do
before do
freeze_time # otherwise the test can fail when ran in midnight
Fabricate(:bookmark)
end
xit "exports the Bookmarks report" do
visit "admin/reports/bookmarks"
click_button "Export"
visit "/u/#{admin.username}/messages"
click_link "[Bookmarks] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.length).to be(2)
expect(exported_data.first).to eq(%w[Day Count])
expect(exported_data.second).to eq([Time.now.strftime("%Y-%m-%d"), "1"])
ensure
csv_export_pm_page.clear_downloads
end
end
context "with screened emails" do
fab!(:screened_email) do
Fabricate(
:screened_email,
action_type: ScreenedEmail.actions[:do_nothing],
match_count: 5,
last_match_at: Time.now,
created_at: Time.now,
ip_address: "94.99.101.228",
)
end
xit "exports data" do
visit "admin/logs/screened_emails"
click_button "Export"
visit "/u/#{admin.username}/messages"
click_link "[Screened Email] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.length).to be(2)
expect(exported_data.first).to eq(
%w[email action match_count last_match_at created_at ip_address],
)
expect(exported_data.second).to eq(
[
screened_email.email,
"do_nothing",
screened_email.match_count.to_s,
screened_email.last_match_at.strftime(time_format),
screened_email.created_at.strftime(time_format),
screened_email.ip_address.to_s,
],
)
ensure
csv_export_pm_page.clear_downloads
end
end
context "with screened ips" do
fab!(:screened_ip) do
Fabricate(
:screened_ip_address,
action_type: ScreenedIpAddress.actions[:do_nothing],
match_count: 5,
ip_address: "99.232.23.124",
last_match_at: Time.now,
created_at: Time.now,
)
end
xit "exports data" do
visit "admin/logs/screened_ip_addresses"
click_button "Export"
visit "/u/#{admin.username}/messages"
click_link "[Screened Ip] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.first).to eq(%w[ip_address action match_count last_match_at created_at])
expect(exported_data.second).to eq(
[
screened_ip.ip_address.to_s,
"do_nothing",
screened_ip.match_count.to_s,
screened_ip.last_match_at.strftime(time_format),
screened_ip.created_at.strftime(time_format),
],
)
ensure
csv_export_pm_page.clear_downloads
end
end
context "with screened urls" do
fab!(:screened_url) do
Fabricate(
:screened_url,
action_type: ScreenedUrl.actions[:do_nothing],
match_count: 5,
domain: "https://discourse.org",
last_match_at: Time.now,
created_at: Time.now,
)
end
xit "exports data" do
visit "admin/logs/screened_urls"
click_button "Export"
visit "/u/#{admin.username}/messages"
click_link "[Screened Url] Data export complete"
expect(csv_export_pm_page).to have_download_link
exported_data = csv_export_pm_page.download_and_extract
expect(exported_data.length).to be(2)
expect(exported_data.first).to eq(%w[domain action match_count last_match_at created_at])
expect(exported_data.second).to eq(
[
screened_url.domain,
"do nothing",
screened_url.match_count.to_s,
screened_url.last_match_at.strftime(time_format),
screened_url.created_at.strftime(time_format),
],
)
ensure
csv_export_pm_page.clear_downloads
end
end
end

View File

@ -0,0 +1,30 @@
# frozen_string_literal: true
class Downloads
FOLDER = Rails.root.join("tmp/downloads")
TIMEOUT = 10
def self.wait_for_download
Timeout.timeout(TIMEOUT) { sleep 0.1 until downloaded? }
end
def self.clear
FileUtils.rm_rf(FOLDER)
end
private
def self.downloaded?
!downloading? && downloads.any?
end
def self.downloading?
downloads.grep(/\.crdownload$/).any?
end
def self.downloads
Dir[FOLDER.join("*")]
end
private_class_method :downloaded?, :downloading?, :downloads
end

View File

@ -0,0 +1,49 @@
# frozen_string_literal: true
module PageObjects
module Pages
class CSVExportPM < PageObjects::Pages::Base
def initialize
super
@downloaded_files = []
end
def download_and_extract
click_link ".zip"
Downloads.wait_for_download
zip_name = find("a.attachment").text
zip_path = File.join(Downloads::FOLDER, zip_name)
@downloaded_files << zip_path
csv_path = unzip(zip_path).first
@downloaded_files << csv_path
CSV.read(csv_path)
end
def clear_downloads
@downloaded_files.each { |file| FileUtils.rm(file) }
@downloaded_files = []
end
def has_download_link?
find(:link, ".zip").present?
end
private
def unzip(file)
paths = []
Zip::File.open(file) do |zip_file|
zip_file.each do |f|
path = File.join(Downloads::FOLDER, f.name)
zip_file.extract(f, path)
paths << path
end
end
paths
end
end
end
end