2f65f36cae
In 6cafe59c76, we added a monkey patch to
`Unicorn::HttpServer#murder_lazy_workers` to log a message and send a
`USR2` signal to the Unicorn worker process when the Unicorn worker
process is 2 seconds away from being timed out by the Unicorn master
process. However, we ended up logging multiple messages and sending
multiple `USR2` signals during the 2 seconds before the Unicorn worker
process hit the timeout.

To overcome this problem, we now set an instance variable on the
`Unicorn::Worker` instance and use it to ensure that the message is
only logged once and the `USR2` signal to the Unicorn worker process is
only sent once as well.
319 lines
11 KiB
Ruby
# frozen_string_literal: true

# See http://unicorn.bogomips.org/Unicorn/Configurator.html
discourse_path = File.expand_path(File.expand_path(File.dirname(__FILE__)) + "/../")

enable_logstash_logger = ENV["ENABLE_LOGSTASH_LOGGER"] == "1"
unicorn_stderr_path = "#{discourse_path}/log/unicorn.stderr.log"
unicorn_stdout_path = "#{discourse_path}/log/unicorn.stdout.log"

if enable_logstash_logger
  require_relative "../lib/discourse_logstash_logger"
  require_relative "../lib/unicorn_logstash_patch"
  FileUtils.touch(unicorn_stderr_path) if !File.exist?(unicorn_stderr_path)
  logger DiscourseLogstashLogger.logger(
    logdev: unicorn_stderr_path,
    type: :unicorn,
    customize_event: lambda { |event| event["@timestamp"] = ::Time.now.utc },
  )
else
  logger Logger.new(STDOUT)
end

# tune down if not enough ram
worker_processes (ENV["UNICORN_WORKERS"] || 3).to_i

working_directory discourse_path

# listen "#{discourse_path}/tmp/sockets/unicorn.sock"

# stree-ignore
listen ENV["UNICORN_LISTENER"] || "#{(ENV["UNICORN_BIND_ALL"] ? "" : "127.0.0.1:")}#{(ENV["UNICORN_PORT"] || 3000).to_i}"

FileUtils.mkdir_p("#{discourse_path}/tmp/pids") if !File.exist?("#{discourse_path}/tmp/pids")

# feel free to point this anywhere accessible on the filesystem
pid(ENV["UNICORN_PID_PATH"] || "#{discourse_path}/tmp/pids/unicorn.pid")

if ENV["RAILS_ENV"] == "production"
  # By default, the Unicorn logger will write to stderr.
  # Additionally, some applications/frameworks log to stderr or stdout,
  # so prevent them from going to /dev/null when daemonized here:
  stderr_path unicorn_stderr_path
  stdout_path unicorn_stdout_path

  # nuke workers after 30 seconds instead of 60 seconds (the default)
  timeout 30
else
  # we want a longer timeout in dev cause first request can be really slow
  timeout(ENV["UNICORN_TIMEOUT"] && ENV["UNICORN_TIMEOUT"].to_i || 60)
end

# important for Ruby 2.0
preload_app true

# Enable this flag to have unicorn test client connections by writing the
# beginning of the HTTP headers before calling the application. This
# prevents calling the application for connections that have disconnected
# while queued. This is only guaranteed to detect clients on the same
# host unicorn runs on, and unlikely to detect disconnects even on a
# fast LAN.
check_client_connection false

initialized = false
before_fork do |server, worker|
  unless initialized
    Discourse.preload_rails!
    Discourse.before_fork

    initialized = true

    supervisor = ENV["UNICORN_SUPERVISOR_PID"].to_i
    if supervisor > 0
      Thread.new do
        while true
          unless File.exist?("/proc/#{supervisor}")
            server.logger.error "Kill self supervisor is gone"
            Process.kill "TERM", Process.pid
          end
          sleep 2
        end
      end
    end

    sidekiqs = ENV["UNICORN_SIDEKIQS"].to_i
    if sidekiqs > 0
      server.logger.info "starting #{sidekiqs} supervised sidekiqs"

      require "demon/sidekiq"
      Demon::Sidekiq.after_fork { DiscourseEvent.trigger(:sidekiq_fork_started) }
      Demon::Sidekiq.start(sidekiqs, logger: server.logger)

      if Discourse.enable_sidekiq_logging?
        # Trap USR1, so we can re-issue to sidekiq workers
        # but chain the default unicorn implementation as well
        old_handler =
          Signal.trap("USR1") do
            old_handler.call

            # We have seen Sidekiq processes getting stuck in production sporadically when log rotation happens.
            # The cause is currently unknown but we suspect that it is related to the Unicorn master process and
            # Sidekiq demon processes reopening logs at the same time as we noticed that Unicorn worker processes only
            # reopen logs after the Unicorn master process is done. To work around the problem, we are adding an arbitrary
            # delay of 1 second to Sidekiq's log reopening procedure. The 1 second delay should be
            # more than enough for the Unicorn master process to finish reopening logs.
            Demon::Sidekiq.kill("USR2")
          end
      end
    end

    if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true"
      server.logger.info "starting up EmailSync demon"
      Demon::EmailSync.start(1, logger: server.logger)
    end

    DiscoursePluginRegistry.demon_processes.each do |demon_class|
      server.logger.info "starting #{demon_class.prefix} demon"
      demon_class.start(1, logger: server.logger)
    end

    class ::Unicorn::HttpServer
      alias master_sleep_orig master_sleep

      # Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
      def murder_lazy_workers
        next_sleep = @timeout - 1
        now = time_now.to_i
        @workers.dup.each_pair do |wpid, worker|
          tick = worker.tick
          0 == tick and next # skip workers that haven't processed any clients
          diff = now - tick
          tmp = @timeout - diff

          # START MONKEY PATCH
          if tmp < 2 && !worker.instance_variable_get(:@timing_out_logged)
            logger.error do
              "worker=#{worker.nr} PID:#{wpid} running too long (#{diff}s), sending USR2 to dump thread backtraces"
            end

            worker.instance_variable_set(:@timing_out_logged, true)
            kill_worker(:USR2, wpid)
          end
          # END MONKEY PATCH

          if tmp >= 0
            next_sleep > tmp and next_sleep = tmp
            next
          end
          next_sleep = 0
          logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
                         "(#{diff}s > #{@timeout}s), killing"

          kill_worker(:KILL, wpid) # take no prisoners for timeout violations
        end
        next_sleep <= 0 ? 1 : next_sleep
      end

      def max_sidekiq_rss
        rss =
          `ps -eo rss,args | grep sidekiq | grep -v grep | awk '{print $1}'`.split("\n")
            .map(&:to_i)
            .max

        rss ||= 0

        rss * 1024
      end

      def max_allowed_sidekiq_rss
        [ENV["UNICORN_SIDEKIQ_MAX_RSS"].to_i, 500].max.megabytes
      end

      def force_kill_rogue_sidekiq
        info = `ps -eo pid,rss,args | grep sidekiq | grep -v grep | awk '{print $1,$2}'`
        info
          .split("\n")
          .each do |row|
            pid, mem = row.split(" ").map(&:to_i)
            if pid > 0 && (mem * 1024) > max_allowed_sidekiq_rss
              Rails.logger.warn "Detected rogue Sidekiq pid #{pid} mem #{mem * 1024}, killing"
              begin
                Process.kill("KILL", pid)
              rescue StandardError
                nil
              end
            end
          end
      end

      def check_sidekiq_heartbeat
        @sidekiq_heartbeat_interval ||= 30.minutes
        @sidekiq_next_heartbeat_check ||= Time.now.to_i + @sidekiq_heartbeat_interval

        if @sidekiq_next_heartbeat_check < Time.now.to_i
          last_heartbeat = Jobs::RunHeartbeat.last_heartbeat
          restart = false

          sidekiq_rss = max_sidekiq_rss
          if sidekiq_rss > max_allowed_sidekiq_rss
            Rails.logger.warn(
              "Sidekiq is consuming too much memory (using: %0.2fM) for '%s', restarting" %
                [(sidekiq_rss.to_f / 1.megabyte), ENV["DISCOURSE_HOSTNAME"]],
            )

            restart = true
          end

          if last_heartbeat < Time.now.to_i - @sidekiq_heartbeat_interval
            Rails.logger.warn "Sidekiq heartbeat test failed, restarting"

            restart = true
          end
          @sidekiq_next_heartbeat_check = Time.now.to_i + @sidekiq_heartbeat_interval

          if restart
            Demon::Sidekiq.restart
            sleep 10
            force_kill_rogue_sidekiq
          end
          Discourse.redis.close
        end
      end

      def max_email_sync_rss
        return 0 if Demon::EmailSync.demons.empty?

        email_sync_pids = Demon::EmailSync.demons.map { |uid, demon| demon.pid }
        return 0 if email_sync_pids.empty?

        rss =
          `ps -eo pid,rss,args | grep '#{email_sync_pids.join("|")}' | grep -v grep | awk '{print $2}'`.split(
            "\n",
          )
            .map(&:to_i)
            .max

        (rss || 0) * 1024
      end

      def max_allowed_email_sync_rss
        [ENV["UNICORN_EMAIL_SYNC_MAX_RSS"].to_i, 500].max.megabytes
      end

      def check_email_sync_heartbeat
        # Skip first check to let process warm up
        @email_sync_next_heartbeat_check ||= (Time.now + Demon::EmailSync::HEARTBEAT_INTERVAL).to_i

        return if @email_sync_next_heartbeat_check > Time.now.to_i
        @email_sync_next_heartbeat_check = (Time.now + Demon::EmailSync::HEARTBEAT_INTERVAL).to_i

        restart = false

        # Restart process if it does not respond anymore
        last_heartbeat_ago =
          Time.now.to_i - Discourse.redis.get(Demon::EmailSync::HEARTBEAT_KEY).to_i
        if last_heartbeat_ago > Demon::EmailSync::HEARTBEAT_INTERVAL.to_i
          Rails.logger.warn(
            "EmailSync heartbeat test failed (last heartbeat was #{last_heartbeat_ago}s ago), restarting",
          )

          restart = true
        end

        # Restart process if memory usage is too high
        email_sync_rss = max_email_sync_rss
        if email_sync_rss > max_allowed_email_sync_rss
          Rails.logger.warn(
            "EmailSync is consuming too much memory (using: %0.2fM) for '%s', restarting" %
              [(email_sync_rss.to_f / 1.megabyte), ENV["DISCOURSE_HOSTNAME"]],
          )

          restart = true
        end

        Demon::EmailSync.restart if restart
      end

      def master_sleep(sec)
        sidekiqs = ENV["UNICORN_SIDEKIQS"].to_i
        if sidekiqs > 0
          Demon::Sidekiq.ensure_running
          check_sidekiq_heartbeat
        end

        if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true"
          Demon::EmailSync.ensure_running
          check_email_sync_heartbeat
        end

        DiscoursePluginRegistry.demon_processes.each { |demon_class| demon_class.ensure_running }

        master_sleep_orig(sec)
      end
    end
  end

  Discourse.redis.close

  # Throttle the master from forking too quickly by sleeping. Due
  # to the implementation of standard Unix signal handlers, this
  # helps (but does not completely) prevent identical, repeated signals
  # from being lost when the receiving process is busy.
  sleep 1 if !Rails.env.development?
end

after_fork do |server, worker|
  DiscourseEvent.trigger(:web_fork_started)
  Discourse.after_fork
  SignalTrapLogger.instance.after_fork

  Signal.trap("USR2") do
    message = <<~MSG
    Unicorn worker received USR2 signal indicating it is about to timeout, dumping backtrace for main thread
    #{Thread.current.backtrace&.join("\n")}
    MSG

    SignalTrapLogger.instance.log(Rails.logger, message, level: :warn)
  end
end