discourse/config/unicorn.conf.rb
Alan Guo Xiang Tan c1f25cdf5b
FIX: Unicorn master and Sidekiq reopening logs at the same time (#29137)
In our production environment, we have been seeing Sidekiq processes
getting stuck randomly when a USR1 signal is sent to the Unicorn master
process. We have not been able to identify the root cause of why the
Sidekiq process gets stuck. However, we noticed that when the Unicorn
master process receives a USR1 signal, it reopens its own logs first
before sending a USR1 signal telling the Unicorn worker processes to
reopen their logs. We figured that we should do the same for the Sidekiq
process when a USR1 signal is received.

In this commit, we introduce an arbitrary delay of 1 second before the
Sidekiq process reopens its log files, so as to allow enough time for
the Unicorn master to finish reopening its logs first.

We also do not tell the Sidekiq process to reopen its logs if the `DISCOURSE_LOG_SIDEKIQ`
env variable is not present, because there are no logs to reopen.
2024-10-10 08:01:40 +08:00
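
Note: this file only contains the sending side of that handshake (the
`Signal.trap("USR1")` block below, which relays the signal to the Sidekiq
demons as USR2). As a minimal sketch of the receiving side — hypothetical,
not the actual Discourse code; the real handler lives in the Demon::Sidekiq
process and may differ:

    # Hypothetical sketch: the Sidekiq demon traps the relayed signal and
    # waits briefly so the Unicorn master finishes reopening its own logs
    # before Sidekiq touches the same files.
    Signal.trap("USR2") do
      Thread.new do
        sleep 1 # arbitrary 1 second delay, per the commit message above
        Unicorn::Util.reopen_logs # unicorn helper that reopens rotated logs
      end
    end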


# frozen_string_literal: true

# See http://unicorn.bogomips.org/Unicorn/Configurator.html

discourse_path = File.expand_path(File.expand_path(File.dirname(__FILE__)) + "/../")

enable_logstash_logger = ENV["ENABLE_LOGSTASH_LOGGER"] == "1"
unicorn_stderr_path = "#{discourse_path}/log/unicorn.stderr.log"
unicorn_stdout_path = "#{discourse_path}/log/unicorn.stdout.log"

if enable_logstash_logger
  require_relative "../lib/discourse_logstash_logger"
  require_relative "../lib/unicorn_logstash_patch"
  FileUtils.touch(unicorn_stderr_path) if !File.exist?(unicorn_stderr_path)
  logger DiscourseLogstashLogger.logger(
           logdev: unicorn_stderr_path,
           type: :unicorn,
           customize_event: lambda { |event| event["@timestamp"] = ::Time.now.utc },
         )
else
  logger Logger.new(STDOUT)
end
# tune down if not enough ram
worker_processes (ENV["UNICORN_WORKERS"] || 3).to_i

working_directory discourse_path

# listen "#{discourse_path}/tmp/sockets/unicorn.sock"
# stree-ignore
listen ENV["UNICORN_LISTENER"] || "#{(ENV["UNICORN_BIND_ALL"] ? "" : "127.0.0.1:")}#{(ENV["UNICORN_PORT"] || 3000).to_i}"

FileUtils.mkdir_p("#{discourse_path}/tmp/pids") if !File.exist?("#{discourse_path}/tmp/pids")

# feel free to point this anywhere accessible on the filesystem
pid(ENV["UNICORN_PID_PATH"] || "#{discourse_path}/tmp/pids/unicorn.pid")

if ENV["RAILS_ENV"] == "production"
  # By default, the Unicorn logger will write to stderr.
  # Additionally, some applications/frameworks log to stderr or stdout,
  # so prevent them from going to /dev/null when daemonized here:
  stderr_path unicorn_stderr_path
  stdout_path unicorn_stdout_path

  # nuke workers after 30 seconds instead of 60 seconds (the default)
  timeout 30
else
  # we want a longer timeout in dev cause first request can be really slow
  timeout(ENV["UNICORN_TIMEOUT"] && ENV["UNICORN_TIMEOUT"].to_i || 60)
end
# important for Ruby 2.0
preload_app true

# Enable this flag to have unicorn test client connections by writing the
# beginning of the HTTP headers before calling the application. This
# prevents calling the application for connections that have disconnected
# while queued. This is only guaranteed to detect clients on the same
# host unicorn runs on, and unlikely to detect disconnects even on a
# fast LAN.
check_client_connection false
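
# before_fork runs in the master once for every worker about to be forked;
# the `initialized` flag below ensures the one-time master setup (demons,
# monkey patches, supervisor watchdog) happens only on the first fork.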
initialized = false
before_fork do |server, worker|
  unless initialized
    Discourse.preload_rails!
    Discourse.before_fork

    initialized = true

    supervisor = ENV["UNICORN_SUPERVISOR_PID"].to_i
    if supervisor > 0
      # Watchdog thread: if the supervising process disappears, terminate
      # this Unicorn master by sending ourselves a TERM.
      Thread.new do
        while true
          unless File.exist?("/proc/#{supervisor}")
            server.logger.error "Kill self supervisor is gone"
            Process.kill "TERM", Process.pid
          end
          sleep 2
        end
      end
    end

    sidekiqs = ENV["UNICORN_SIDEKIQS"].to_i
    if sidekiqs > 0
      server.logger.info "starting #{sidekiqs} supervised sidekiqs"

      require "demon/sidekiq"
      Demon::Sidekiq.after_fork { DiscourseEvent.trigger(:sidekiq_fork_started) }
      Demon::Sidekiq.start(sidekiqs, logger: server.logger)

      if Discourse.enable_sidekiq_logging?
        # Trap USR1, so we can re-issue to sidekiq workers
        # but chain the default unicorn implementation as well
        old_handler =
          Signal.trap("USR1") do
            old_handler.call

            # We have seen Sidekiq processes getting stuck in production sporadically when log rotation happens.
            # The cause is currently unknown, but we suspect that it is related to the Unicorn master process and
            # Sidekiq demon processes reopening logs at the same time, as we noticed that Unicorn worker processes
            # only reopen logs after the Unicorn master process is done. To work around the problem, we are adding
            # an arbitrary delay of 1 second to Sidekiq's log reopening procedure. The 1 second delay should be
            # more than enough for the Unicorn master process to finish reopening logs.
            Demon::Sidekiq.kill("USR2")
          end
      end
    end
if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true"
server.logger.info "starting up EmailSync demon"
Demon::EmailSync.start(1, logger: server.logger)
end
DiscoursePluginRegistry.demon_processes.each do |demon_class|
server.logger.info "starting #{demon_class.prefix} demon"
demon_class.start(1, logger: server.logger)
end
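
    # Reopen Unicorn's HttpServer inside the master to (1) have it send USR2
    # to workers that are about to hit the request timeout so they can dump a
    # backtrace, and (2) supervise the demon processes from the idle loop.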
    class ::Unicorn::HttpServer
      alias master_sleep_orig master_sleep

      # Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
      def murder_lazy_workers
        next_sleep = @timeout - 1
        now = time_now.to_i
        @workers.dup.each_pair do |wpid, worker|
          tick = worker.tick
          0 == tick and next # skip workers that haven't processed any clients
          diff = now - tick
          tmp = @timeout - diff

          # START MONKEY PATCH
          if tmp < 2
            logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
                           "(#{diff}s), sending USR2 to dump thread backtraces"
            kill_worker(:USR2, wpid)
          end
          # END MONKEY PATCH

          if tmp >= 0
            next_sleep > tmp and next_sleep = tmp
            next
          end

          next_sleep = 0
          logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
                         "(#{diff}s > #{@timeout}s), killing"
          kill_worker(:KILL, wpid) # take no prisoners for timeout violations
        end

        next_sleep <= 0 ? 1 : next_sleep
      end
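
      # Largest resident set size, in bytes, across all running Sidekiq
      # processes (ps reports RSS in KiB, hence the * 1024).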
      def max_sidekiq_rss
        rss =
          `ps -eo rss,args | grep sidekiq | grep -v grep | awk '{print $1}'`.split("\n")
            .map(&:to_i)
            .max

        rss ||= 0

        rss * 1024
      end

      # Memory ceiling for Sidekiq: UNICORN_SIDEKIQ_MAX_RSS megabytes, never
      # lower than 500 MB.
      def max_allowed_sidekiq_rss
        [ENV["UNICORN_SIDEKIQ_MAX_RSS"].to_i, 500].max.megabytes
      end
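
      # Last resort after a graceful restart attempt: SIGKILL any Sidekiq
      # process whose memory use still exceeds the allowed ceiling.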
      def force_kill_rogue_sidekiq
        info = `ps -eo pid,rss,args | grep sidekiq | grep -v grep | awk '{print $1,$2}'`

        info
          .split("\n")
          .each do |row|
            pid, mem = row.split(" ").map(&:to_i)
            if pid > 0 && (mem * 1024) > max_allowed_sidekiq_rss
              Rails.logger.warn "Detected rogue Sidekiq pid #{pid} mem #{mem * 1024}, killing"
              begin
                Process.kill("KILL", pid)
              rescue StandardError
                nil
              end
            end
          end
      end
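
      # Every 30 minutes, restart the Sidekiq demons if they are over the
      # memory ceiling or have stopped reporting heartbeats.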
      def check_sidekiq_heartbeat
        @sidekiq_heartbeat_interval ||= 30.minutes
        @sidekiq_next_heartbeat_check ||= Time.now.to_i + @sidekiq_heartbeat_interval

        if @sidekiq_next_heartbeat_check < Time.now.to_i
          last_heartbeat = Jobs::RunHeartbeat.last_heartbeat
          restart = false

          sidekiq_rss = max_sidekiq_rss
          if sidekiq_rss > max_allowed_sidekiq_rss
            Rails.logger.warn(
              "Sidekiq is consuming too much memory (using: %0.2fM) for '%s', restarting" %
                [(sidekiq_rss.to_f / 1.megabyte), ENV["DISCOURSE_HOSTNAME"]],
            )
            restart = true
          end

          if last_heartbeat < Time.now.to_i - @sidekiq_heartbeat_interval
            Rails.logger.warn "Sidekiq heartbeat test failed, restarting"
            restart = true
          end
          @sidekiq_next_heartbeat_check = Time.now.to_i + @sidekiq_heartbeat_interval

          if restart
            Demon::Sidekiq.restart
            sleep 10
            force_kill_rogue_sidekiq
          end

          Discourse.redis.close
        end
      end
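
      # Largest resident set size, in bytes, across the EmailSync demon
      # processes (0 when none are running).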
      def max_email_sync_rss
        return 0 if Demon::EmailSync.demons.empty?

        email_sync_pids = Demon::EmailSync.demons.map { |uid, demon| demon.pid }
        return 0 if email_sync_pids.empty?

        rss =
          `ps -eo pid,rss,args | grep '#{email_sync_pids.join("|")}' | grep -v grep | awk '{print $2}'`.split(
            "\n",
          )
            .map(&:to_i)
            .max

        (rss || 0) * 1024
      end

      # Memory ceiling for EmailSync: UNICORN_EMAIL_SYNC_MAX_RSS megabytes,
      # never lower than 500 MB.
      def max_allowed_email_sync_rss
        [ENV["UNICORN_EMAIL_SYNC_MAX_RSS"].to_i, 500].max.megabytes
      end
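
      # Restart the EmailSync demon if it stops writing heartbeats to Redis
      # or exceeds its memory ceiling.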
      def check_email_sync_heartbeat
        # Skip first check to let process warm up
        @email_sync_next_heartbeat_check ||= (Time.now + Demon::EmailSync::HEARTBEAT_INTERVAL).to_i

        return if @email_sync_next_heartbeat_check > Time.now.to_i
        @email_sync_next_heartbeat_check = (Time.now + Demon::EmailSync::HEARTBEAT_INTERVAL).to_i

        restart = false

        # Restart process if it does not respond anymore
        last_heartbeat_ago =
          Time.now.to_i - Discourse.redis.get(Demon::EmailSync::HEARTBEAT_KEY).to_i
        if last_heartbeat_ago > Demon::EmailSync::HEARTBEAT_INTERVAL.to_i
          Rails.logger.warn(
            "EmailSync heartbeat test failed (last heartbeat was #{last_heartbeat_ago}s ago), restarting",
          )
          restart = true
        end

        # Restart process if memory usage is too high
        email_sync_rss = max_email_sync_rss
        if email_sync_rss > max_allowed_email_sync_rss
          Rails.logger.warn(
            "EmailSync is consuming too much memory (using: %0.2fM) for '%s', restarting" %
              [(email_sync_rss.to_f / 1.megabyte), ENV["DISCOURSE_HOSTNAME"]],
          )
          restart = true
        end

        Demon::EmailSync.restart if restart
      end
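
      # Runs in the master's idle loop between forks: keep the demon
      # processes alive and healthy, then defer to the stock sleep.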
      def master_sleep(sec)
        sidekiqs = ENV["UNICORN_SIDEKIQS"].to_i
        if sidekiqs > 0
          Demon::Sidekiq.ensure_running
          check_sidekiq_heartbeat
        end

        if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true"
          Demon::EmailSync.ensure_running
          check_email_sync_heartbeat
        end

        DiscoursePluginRegistry.demon_processes.each { |demon_class| demon_class.ensure_running }

        master_sleep_orig(sec)
      end
    end
  end

  Discourse.redis.close

  # Throttle the master from forking too quickly by sleeping. Due
  # to the implementation of standard Unix signal handlers, this
  # helps (but does not completely) prevent identical, repeated signals
  # from being lost when the receiving process is busy.
  sleep 1 if !Rails.env.development?
end
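
# after_fork runs in every freshly forked web worker. The USR2 trap pairs
# with the murder_lazy_workers patch above: the master sends USR2 just before
# a worker would be killed for running too long, so the worker can log a
# backtrace of its main thread first.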
after_fork do |server, worker|
  DiscourseEvent.trigger(:web_fork_started)
  Discourse.after_fork
  SignalTrapLogger.instance.after_fork

  Signal.trap("USR2") do
    message = <<~MSG
      Unicorn worker received USR2 signal indicating it is about to timeout, dumping backtrace for main thread
      #{Thread.current.backtrace&.join("\n")}
    MSG

    SignalTrapLogger.instance.log(Rails.logger, message, level: :warn)
  end
end