DEV: Log Unicorn worker timeout backtraces to Rails.logger (#27257)

This commit introduces the following changes:

1. Introduce the `SignalTrapLogger` singleton which starts a single
   thread that polls a queue to log messages with the specified logger.
   This thread is necessary becasue most loggers cannot be used inside
   the `Signal.trap` context as they rely on mutexes which are not
   allowed within the context.

2. Moves the monkey patch in `freedom_patches/unicorn_http_server_patch.rb` to
   `config/unicorn.config.rb` which is already monkey patching
   `Unicorn::HttpServer`.

3. `Unicorn::HttpServer` will now automatically send a `USR2` signal to
   a unicorn worker 2 seconds before the worker is timed out by the
   Unicorn master.

4. When a Unicorn worker receives a `USR2` signal, it will now log only
   the main thread's backtraces to `Rails.logger`. Previously, it was
   `put`ing the backtraces to `STDOUT` which most people wouldn't read.
   Logging it via `Rails.logger` will make the backtraces easily
   accessible via `/logs`.
This commit is contained in:
Alan Guo Xiang Tan 2024-06-03 12:51:12 +08:00 committed by GitHub
parent 4b2bd4d682
commit 23c38cbf11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 118 additions and 105 deletions

View File

@ -84,14 +84,8 @@ before_fork do |server, worker|
server.logger.info "starting #{sidekiqs} supervised sidekiqs" server.logger.info "starting #{sidekiqs} supervised sidekiqs"
require "demon/sidekiq" require "demon/sidekiq"
Demon::Sidekiq.logger = server.logger
Demon::Sidekiq.after_fork { DiscourseEvent.trigger(:sidekiq_fork_started) } Demon::Sidekiq.after_fork { DiscourseEvent.trigger(:sidekiq_fork_started) }
Demon::Sidekiq.start(sidekiqs) Demon::Sidekiq.start(sidekiqs, logger: server.logger)
Signal.trap("SIGTSTP") do
Demon::Sidekiq.log("Issuing stop to Sidekiq")
Demon::Sidekiq.stop
end
# Trap USR1, so we can re-issue to sidekiq workers # Trap USR1, so we can re-issue to sidekiq workers
# but chain the default unicorn implementation as well # but chain the default unicorn implementation as well
@ -104,21 +98,48 @@ before_fork do |server, worker|
if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true" if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true"
server.logger.info "starting up EmailSync demon" server.logger.info "starting up EmailSync demon"
Demon::EmailSync.start Demon::EmailSync.start(1, logger: server.logger)
Signal.trap("SIGTSTP") do
STDERR.puts "#{Time.now}: Issuing stop to EmailSync"
Demon::EmailSync.stop
end
end end
DiscoursePluginRegistry.demon_processes.each do |demon_class| DiscoursePluginRegistry.demon_processes.each do |demon_class|
server.logger.info "starting #{demon_class.prefix} demon" server.logger.info "starting #{demon_class.prefix} demon"
demon_class.start demon_class.start(1, logger: server.logger)
end end
class ::Unicorn::HttpServer class ::Unicorn::HttpServer
alias master_sleep_orig master_sleep alias master_sleep_orig master_sleep
# Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
def murder_lazy_workers
next_sleep = @timeout - 1
now = time_now.to_i
@workers.dup.each_pair do |wpid, worker|
tick = worker.tick
0 == tick and next # skip workers that haven't processed any clients
diff = now - tick
tmp = @timeout - diff
# START MONKEY PATCH
if tmp < 2
logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
"(#{diff}s), sending USR2 to dump thread backtraces"
kill_worker(:USR2, wpid)
end
# END MONKEY PATCH
if tmp >= 0
next_sleep > tmp and next_sleep = tmp
next
end
next_sleep = 0
logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
"(#{diff}s > #{@timeout}s), killing"
kill_worker(:KILL, wpid) # take no prisoners for timeout violations
end
next_sleep <= 0 ? 1 : next_sleep
end
def max_sidekiq_rss def max_sidekiq_rss
rss = rss =
`ps -eo rss,args | grep sidekiq | grep -v grep | awk '{print $1}'`.split("\n") `ps -eo rss,args | grep sidekiq | grep -v grep | awk '{print $1}'`.split("\n")
@ -270,9 +291,14 @@ end
after_fork do |server, worker| after_fork do |server, worker|
DiscourseEvent.trigger(:web_fork_started) DiscourseEvent.trigger(:web_fork_started)
Discourse.after_fork Discourse.after_fork
SignalTrapLogger.instance.after_fork
Signal.trap("USR2") { puts <<~MSG } Signal.trap("USR2") do
[#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] Received USR2 signal, dumping backtrace for all threads message = <<~MSG
#{Thread.list.map { |t| "#{t.backtrace&.join("\n")}" }.join("\n\n")} Unicorn worker received USR2 signal indicating it is about to timeout, dumping backtrace for main thread
#{Thread.current.backtrace&.join("\n")}
MSG MSG
SignalTrapLogger.instance.log(Rails.logger, message, level: :warn)
end
end end

View File

@ -9,9 +9,9 @@ class Demon::Base
@demons @demons
end end
def self.start(count = 1, verbose: false) def self.start(count = 1, verbose: false, logger: nil)
@demons ||= {} @demons ||= {}
count.times { |i| (@demons["#{prefix}_#{i}"] ||= new(i, verbose: verbose)).start } count.times { |i| (@demons["#{prefix}_#{i}"] ||= new(i, verbose:, logger:)).start }
end end
def self.stop def self.stop
@ -39,7 +39,7 @@ class Demon::Base
attr_reader :pid, :parent_pid, :started, :index attr_reader :pid, :parent_pid, :started, :index
attr_accessor :stop_timeout attr_accessor :stop_timeout
def initialize(index, rails_root: nil, parent_pid: nil, verbose: false) def initialize(index, rails_root: nil, parent_pid: nil, verbose: false, logger: nil)
@index = index @index = index
@pid = nil @pid = nil
@parent_pid = parent_pid || Process.pid @parent_pid = parent_pid || Process.pid
@ -47,6 +47,11 @@ class Demon::Base
@stop_timeout = 10 @stop_timeout = 10
@rails_root = rails_root || Rails.root @rails_root = rails_root || Rails.root
@verbose = verbose @verbose = verbose
@logger = logger || Logger.new(STDERR)
end
def log(message, level: :info)
@logger.public_send(level, message)
end end
def pid_file def pid_file
@ -72,6 +77,7 @@ class Demon::Base
def stop def stop
@started = false @started = false
if @pid if @pid
Process.kill(stop_signal, @pid) Process.kill(stop_signal, @pid)
@ -99,7 +105,7 @@ class Demon::Base
wait_for_stop.call wait_for_stop.call
if alive? if alive?
STDERR.puts "Process would not terminate cleanly, force quitting. pid: #{@pid} #{self.class}" log("Process would not terminate cleanly, force quitting. pid: #{@pid} #{self.class}")
Process.kill("KILL", @pid) Process.kill("KILL", @pid)
end end
@ -125,8 +131,9 @@ class Demon::Base
rescue StandardError rescue StandardError
-1 -1
end end
if dead if dead
STDERR.puts "Detected dead worker #{@pid}, restarting..." log("Detected dead worker #{@pid}, restarting...")
@pid = nil @pid = nil
@started = false @started = false
start start
@ -138,7 +145,7 @@ class Demon::Base
if existing = already_running? if existing = already_running?
# should not happen ... so kill violently # should not happen ... so kill violently
STDERR.puts "Attempting to kill pid #{existing}" log("Attempting to kill pid #{existing}")
Process.kill("TERM", existing) Process.kill("TERM", existing)
end end
@ -199,7 +206,7 @@ class Demon::Base
Process.kill "KILL", Process.pid Process.kill "KILL", Process.pid
end end
rescue => e rescue => e
STDERR.puts "URGENT monitoring thread had an exception #{e}" log("URGENT monitoring thread had an exception #{e}")
end end
sleep 1 sleep 1
end end

View File

@ -77,14 +77,14 @@ class Demon::EmailSync < ::Demon::Base
end end
def after_fork def after_fork
puts "[EmailSync] Loading EmailSync in process id #{Process.pid}" log("[EmailSync] Loading EmailSync in process id #{Process.pid}")
loop do loop do
break if Discourse.redis.set(HEARTBEAT_KEY, Time.now.to_i, ex: HEARTBEAT_INTERVAL, nx: true) break if Discourse.redis.set(HEARTBEAT_KEY, Time.now.to_i, ex: HEARTBEAT_INTERVAL, nx: true)
sleep HEARTBEAT_INTERVAL sleep HEARTBEAT_INTERVAL
end end
puts "[EmailSync] Starting EmailSync main thread" log("[EmailSync] Starting EmailSync main thread")
@running = true @running = true
@sync_data = {} @sync_data = {}
@ -158,18 +158,20 @@ class Demon::EmailSync < ::Demon::Base
Discourse.redis.del(HEARTBEAT_KEY) Discourse.redis.del(HEARTBEAT_KEY)
exit 0 exit 0
rescue => e rescue => e
STDERR.puts e.message log("#{e.message}: #{e.backtrace.join("\n")}")
STDERR.puts e.backtrace.join("\n")
exit 1 exit 1
end end
def kill_and_disconnect!(data) def kill_and_disconnect!(data)
data[:thread].kill data[:thread].kill
data[:thread].join data[:thread].join
begin begin
data[:syncer]&.disconnect! data[:syncer]&.disconnect!
rescue Net::IMAP::ResponseError => err rescue Net::IMAP::ResponseError => err
puts "[EmailSync] Encountered a response error when disconnecting: #{err}" log(
"[EmailSync] Encountered a response error when disconnecting: #{err}\n#{err.backtrace.join("\n")}",
)
end end
end end
end end

View File

@ -3,8 +3,6 @@
require "demon/base" require "demon/base"
class Demon::Sidekiq < ::Demon::Base class Demon::Sidekiq < ::Demon::Base
cattr_accessor :logger
def self.prefix def self.prefix
"sidekiq" "sidekiq"
end end
@ -13,37 +11,6 @@ class Demon::Sidekiq < ::Demon::Base
blk ? (@blk = blk) : @blk blk ? (@blk = blk) : @blk
end end
def self.format(message)
"[#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] #{message}"
end
def self.log(message, level: :info)
# We use an IO pipe and log messages using the logger in a seperate thread to avoid the `log writing failed. can't be called from trap context`
# error message that is raised when trying to log from within a `Signal.trap` block.
if logger
if !defined?(@logger_read_pipe)
@logger_read_pipe, @logger_write_pipe = IO.pipe
@logger_thread =
Thread.new do
begin
while readable_io = IO.select([@logger_read_pipe])
logger.public_send(level, readable_io.first[0].gets.strip)
end
rescue => e
STDOUT.puts self.format(
"Error in Sidekiq demon logger thread: #{e.message}\n#{e.backtrace.join("\n")}",
)
end
end
end
@logger_write_pipe.puts(message)
else
STDOUT.puts self.format(message)
end
end
private private
def suppress_stdout def suppress_stdout
@ -54,12 +21,13 @@ class Demon::Sidekiq < ::Demon::Base
false false
end end
def log(message, level: :info) def log_in_trap(message, level: :info)
self.class.log(message, level:) SignalTrapLogger.instance.log(@logger, message, level: level)
end end
def after_fork def after_fork
Demon::Sidekiq.after_fork&.call Demon::Sidekiq.after_fork&.call
SignalTrapLogger.instance.after_fork
log("Loading Sidekiq in process id #{Process.pid}") log("Loading Sidekiq in process id #{Process.pid}")
require "sidekiq/cli" require "sidekiq/cli"
@ -67,9 +35,9 @@ class Demon::Sidekiq < ::Demon::Base
# Unicorn uses USR1 to indicate that log files have been rotated # Unicorn uses USR1 to indicate that log files have been rotated
Signal.trap("USR1") do Signal.trap("USR1") do
log("Sidekiq reopening logs...") log_in_trap("Sidekiq reopening logs...")
Unicorn::Util.reopen_logs Unicorn::Util.reopen_logs
log("Sidekiq done reopening logs...") log_in_trap("Sidekiq done reopening logs...")
end end
options = ["-c", GlobalSetting.sidekiq_workers.to_s] options = ["-c", GlobalSetting.sidekiq_workers.to_s]
@ -91,8 +59,7 @@ class Demon::Sidekiq < ::Demon::Base
load Rails.root + "config/initializers/100-sidekiq.rb" load Rails.root + "config/initializers/100-sidekiq.rb"
cli.run cli.run
rescue => e rescue => e
STDERR.puts e.message log("Error encountered while starting Sidekiq: #{e.message}\n#{e.backtrace.join("\n")}")
STDERR.puts e.backtrace.join("\n")
exit 1 exit 1
end end
end end

View File

@ -1,38 +0,0 @@
# frozen_string_literal: true
if ENV["DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT"] && defined?(Unicorn::HttpServer)
module UnicornHTTPServerPatch
# Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
def murder_lazy_workers
next_sleep = @timeout - 1
now = time_now.to_i
@workers.dup.each_pair do |wpid, worker|
tick = worker.tick
0 == tick and next # skip workers that haven't processed any clients
diff = now - tick
tmp = @timeout - diff
# START MONKEY PATCH
if tmp < 2
logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
"(#{diff}s), sending USR2 to dump thread backtraces"
kill_worker(:USR2, wpid)
end
# END MONKEY PATCH
if tmp >= 0
next_sleep > tmp and next_sleep = tmp
next
end
next_sleep = 0
logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
"(#{diff}s > #{@timeout}s), killing"
kill_worker(:KILL, wpid) # take no prisoners for timeout violations
end
next_sleep <= 0 ? 1 : next_sleep
end
end
Unicorn::HttpServer.prepend(UnicornHTTPServerPatch)
end

49
lib/signal_trap_logger.rb Normal file
View File

@ -0,0 +1,49 @@
# frozen_string_literal: true
# This class is used to log messages to a specified logger from within a `Signal.trap` context. Most loggers rely on
# methods that are prohibited within a `Signal.trap` context, so this class is used to queue up log messages and then
# log them from a separate thread outside of the `Signal.trap` context.
#
# Example:
# Signal.trap("USR1") do
# SignalTrapLogger.instance.log(Rails.logger, "Received USR1 signal")
# end
#
# Do note that you need to call `SignalTrapLogger.instance.after_fork` after forking a new process to ensure that the
# logging thread is running in the new process.
class SignalTrapLogger
include Singleton
def initialize
@queue = Queue.new
ensure_logging_thread_running
end
def log(logger, message, level: :info)
@queue << { logger:, message:, level: }
end
def after_fork
ensure_logging_thread_running
end
private
def ensure_logging_thread_running
return if @thread&.alive?
@thread =
Thread.new do
loop do
begin
log_entry = @queue.pop
log_entry[:logger].public_send(log_entry[:level], log_entry[:message])
rescue => error
Rails.logger.error(
"Error in SignalTrapLogger thread: #{error.message}\n#{error.backtrace.join("\n")}",
)
end
end
end
end
end