FEATURE: track stats around failing scheduled jobs (#17769)

* FEATURE: track stats around failing scheduled jobs

Discourse.job_exception_stats can now be used to gather stats on how many
times each regular scheduled job has failed in the current process.

This will be consumed by the Prometheus plugin and potentially other
monitoring plugins.
This commit is contained in:
Sam 2022-08-03 12:53:26 +10:00 committed by GitHub
parent c99f658a9e
commit bfe502012d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 53 additions and 0 deletions

View File

@ -153,6 +153,16 @@ module Discourse
end
end
# In-memory exception counters for jobs, stored on this module.
class << self
  # Hash of failure counts, indexed by whatever key callers increment
  # (the job class taken from the exception context). Keys that were never
  # incremented read as 0.
  attr_reader :job_exception_stats

  # Discards the current counters and installs a fresh, empty hash whose
  # default value is 0, so an entry can be incremented without being
  # initialized first. Returns the new hash.
  def reset_job_exception_stats!
    @job_exception_stats = Hash.new(0)
  end
end

# Set up the counters at load time so the reader returns a hash from the start.
reset_job_exception_stats!
# Log an exception.
#
# If your code is in a scheduled job, it is recommended to use the
@ -165,6 +175,11 @@ module Discourse
context ||= {}
parent_logger ||= Sidekiq
job = context.dig(:job, "class")
if job
job_exception_stats[job] += 1
end
cm = RailsMultisite::ConnectionManagement
parent_logger.handle_exception(ex, {
current_db: cm.current_db,

View File

@ -340,6 +340,44 @@ RSpec.describe Discourse do
Sidekiq.error_handlers.delete(logger)
end
describe "#job_exception_stats" do
  # Reset the counters on both sides of each example so counts neither leak
  # in from earlier examples nor out into later ones.
  before { Discourse.reset_job_exception_stats! }
  after { Discourse.reset_job_exception_stats! }

  it "should collect job exception stats" do
    # see MiniScheduler Manager which reports it like this
    # https://github.com/discourse/mini_scheduler/blob/2b2c1c56b6e76f51108c2a305775469e24cf2b65/lib/mini_scheduler/manager.rb#L95
    context_for = ->(job_class) {
      { message: "Running a scheduled job", job: { "class" => job_class } }
    }

    # Number of failures to report for each job class, in reporting order.
    failures_by_job = [
      [Jobs::ReindexSearch, 2],
      [Jobs::PollMailbox, 1],
    ]

    failures_by_job.each do |job_class, failure_count|
      exception_context = context_for.call(job_class)

      # re-raised unconditionally in test env
      failure_count.times do
        expect {
          Discourse.handle_job_exception(StandardError.new, exception_context)
        }.to raise_error(StandardError)
      end
    end

    expected_stats = {
      Jobs::PollMailbox => 1,
      Jobs::ReindexSearch => 2,
    }
    expect(Discourse.job_exception_stats).to eq(expected_stats)
  end
end
it "should not fail when called" do
exception = StandardError.new