Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor worker job runtime #13

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ end
- Number of jobs in dead set (“morgue”): `sidekiq_jobs_dead_count`
- Active workers count: `sidekiq_active_processes`
- Active processes count: `sidekiq_active_workers_count`
- Runtime of the actual running worker jobs `sidekiq_worker_runtime`

## Custom tags

Expand Down
25 changes: 25 additions & 0 deletions lib/yabeda/sidekiq.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ module Sidekiq
gauge :jobs_dead_count, tags: [], comment: "The number of jobs exceeded their retry count."
gauge :active_processes, tags: [], comment: "The number of active Sidekiq worker processes."
gauge :queue_latency, tags: %i[queue], comment: "The queue latency, the difference in seconds since the oldest job in the queue was enqueued"
gauge :job_max_runtime, tags: %i[queue worker], comment: "The actual job runtime"

histogram :job_latency, comment: "The job latency, the difference in seconds between enqueued and running time",
unit: :seconds, per: :job,
Expand All @@ -59,6 +60,7 @@ module Sidekiq
sidekiq_queue_latency.set({ queue: queue.name }, queue.latency)
end

Yabeda::Sidekiq.track_max_job_runtime
# That is quite slow if your retry set is large
# I don't want to enable it by default
# retries_by_queues =
Expand Down Expand Up @@ -87,6 +89,8 @@ module Sidekiq
end

class << self
attr_accessor :observed_max_job_runtimes

def labelize(worker, job, queue)
{ queue: queue, worker: worker_class(worker, job) }
end
Expand All @@ -106,5 +110,26 @@ def custom_tags(worker, job)
worker.method(:yabeda_tags).arity.zero? ? worker.yabeda_tags : worker.yabeda_tags(*job["args"])
end
end

self.observed_max_job_runtimes = Concurrent::Set.new

# rubocop: disable Metrics/AbcSize
def self.track_max_job_runtime
now = Time.now.utc
current_max_job_runtimes = ::Sidekiq::Workers.new.each_with_object({}) do |(_process, _thread, msg), result|
payload = msg["payload"]
tags = { queue: payload["queue"], worker: payload["wrapped"] || payload["class"] }
observed_max_job_runtimes << tags

duration = now - Time.at(msg["run_at"])
result[tags] = duration if !result[tags] || result[tags] < duration
end

observed_max_job_runtimes.each do |tags|
duration = current_max_job_runtimes[tags].to_i # will set not currently observed jobs duration to zero
Yabeda.sidekiq.job_max_runtime.set(tags, duration)
end
end
# rubocop: enable Metrics/AbcSize
end
end
90 changes: 90 additions & 0 deletions spec/yabeda/sidekiq_spec.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
# frozen_string_literal: true

RSpec.describe Yabeda::Sidekiq do
before do
allow(Sidekiq::Stats).to receive(:new).and_return(
OpenStruct.new(
workers_size: 0,
scheduled_size: 0,
dead_size: 0,
processes_size: 0,
retry_size: 0,
processed: 0,
failed: 0,
queues: { "default" => 0 },
),
)
allow(Sidekiq::Queue).to receive(:all).and_return(
[
OpenStruct.new({ name: "default", latency: 0 }),
],
)
end

it "has a version number" do
expect(Yabeda::Sidekiq::VERSION).not_to be nil
end
Expand Down Expand Up @@ -54,6 +74,52 @@
{ queue: "default", worker: "FailingPlainJob" } => kind_of(Numeric),
)
end

it "measures maximum runtime" do
invoked = 0
allow(Sidekiq::Workers).to receive(:new) do
[
[
[
"server:pid:wtf", "tid1", {
"queue" => "default",
"payload" => { "queue" => "default", "class" => "FailingPlainJob" },
"run_at" => Time.now.to_i - 10,
},
],
],
[
[
"server:pid:wtf", "tid1", {
"queue" => "default",
"payload" => { "queue" => "default", "class" => "SamplePlainJob" },
"run_at" => Time.now.to_i,
},
],
[
"server:pid:wtf", "tid2", {
"queue" => "default",
"payload" => { "queue" => "default", "class" => "SamplePlainJob" },
"run_at" => Time.now.to_i - 5,
},
],
],
][(invoked += 1) - 1]
end

Yabeda.collectors.each(&:call)

expect(Yabeda.sidekiq.job_max_runtime.values).to include(
{ queue: "default", worker: "FailingPlainJob" } => (be >= 10),
)

Yabeda.collectors.each(&:call)

expect(Yabeda.sidekiq.job_max_runtime.values).to include(
{ queue: "default", worker: "SamplePlainJob" } => (be >= 5),
{ queue: "default", worker: "FailingPlainJob" } => 0,
)
end
end

describe "ActiveJob jobs" do
Expand Down Expand Up @@ -96,6 +162,30 @@
{ queue: "default", worker: "FailingActiveJob" } => kind_of(Numeric),
)
end

it "measures maximum runtime" do
allow(Sidekiq::Workers).to receive(:new) do
[
[
"server:pid:wtf", "tid1", {
"queue" => "default",
"payload" => {
"queue" => "default",
"class" => "ActiveJob::QueueAdapters::SidekiqAdapter::JobWrapper",
"wrapped" => "SampleActiveJob",
},
"run_at" => Time.now.to_i - 42,
},
],
]
end

Yabeda.collectors.each(&:call)

expect(Yabeda.sidekiq.job_max_runtime.values).to include(
{ queue: "default", worker: "SampleActiveJob" } => (be >= 42),
)
end
end

describe "#yabeda_tags worker method" do
Expand Down