Skip to content

Commit

Permalink
Remove prometheus metrics emitted on scheduler VM
Browse files Browse the repository at this point in the history
The metrics `report_diego_cell_sync_duration`, `report_deployment_duration`, `update_synced_invalid_lrps` are being emitted on the scheduler VM.
There is no web server and therefore also no endpoint, which could serve those metrics.
For now we decided to remove those prometheus metrics and just keep the statsd metrics.
If those metrics should be also available through prometheus in the future, we probably have to deploy additional jobs on the scheduler VM, which
take care of publishing the metrics, so they can be collected by the prom_scraper job.
  • Loading branch information
svkrieger committed Oct 12, 2023
1 parent 1b52571 commit 7a4f401
Show file tree
Hide file tree
Showing 6 changed files with 4 additions and 72 deletions.
4 changes: 1 addition & 3 deletions app/jobs/diego/sync.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@ module VCAP::CloudController
module Jobs
module Diego
class Sync < VCAP::CloudController::Jobs::CCJob
def initialize(statsd=Statsd.new, prometheus_updater=VCAP::CloudController::Metrics::PrometheusUpdater.new)
def initialize(statsd=Statsd.new)
@statsd = statsd
@prometheus_updater = prometheus_updater
end

def perform
Expand All @@ -28,7 +27,6 @@ def perform
elapsed_ms = ((finish - start) * 1000).round

@statsd.timing('cc.diego_sync.duration', elapsed_ms)
@prometheus_updater.report_diego_cell_sync_duration((finish - start))
end
end

Expand Down
7 changes: 2 additions & 5 deletions lib/cloud_controller/deployment_updater/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ def start
with_error_logging('cc.deployment_updater') do
config = CloudController::DependencyLocator.instance.config
statsd_client = CloudController::DependencyLocator.instance.statsd_client
prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater

update_step = proc {
update(
update_frequency: config.get(:deployment_updater, :update_frequency_in_seconds),
statsd_client: statsd_client,
prometheus_updater: prometheus_updater
statsd_client: statsd_client
)
}

Expand All @@ -42,7 +40,7 @@ def start

private

def update(update_frequency:, statsd_client:, prometheus_updater:)
def update(update_frequency:, statsd_client:)
logger = Steno.logger('cc.deployment_updater.scheduler')

update_start_time = Time.now
Expand All @@ -54,7 +52,6 @@ def update(update_frequency:, statsd_client:, prometheus_updater:)
## so feed in the entire value!
update_duration_ms = update_duration * 1000
statsd_client.timing('cc.deployments.update.duration', update_duration_ms)
prometheus_updater.report_deployment_duration(update_duration)

logger.info("Update loop took #{update_duration}s")

Expand Down
25 changes: 0 additions & 25 deletions lib/cloud_controller/metrics/prometheus_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,12 @@ def initialize(registry=Prometheus::Client.registry)
@registry.gauge(:cc_job_queues_length_total, docstring: 'Job queues length of worker processes', labels: [:queue]) unless @registry.exist?(:cc_job_queues_length_total)
@registry.gauge(:cc_failed_jobs_total, docstring: 'Number of failed jobs of worker processes', labels: [:queue]) unless @registry.exist?(:cc_failed_jobs_total)
@registry.counter(:cc_staging_requested, docstring: 'Number of staging requests') unless @registry.exist?(:cc_staging_requested)
@registry.gauge(:cc_diego_sync_invalid_desired_lrps, docstring: 'Invalid Desired LRPs') unless @registry.exist?(:cc_diego_sync_invalid_desired_lrps)
unless @registry.exist?(:cc_staging_succeeded_duration_seconds)
@registry.histogram(:cc_staging_succeeded_duration_seconds,
docstring: 'Durations of successful staging events')
end
@registry.histogram(:cc_staging_failed_duration_seconds, docstring: 'Durations of failed staging events') unless @registry.exist?(:cc_staging_failed_duration_seconds)

@registry.summary(:cc_diego_sync_duration_summary, docstring: 'Diego cell sync duration summary') unless @registry.exist?(:cc_diego_sync_duration_summary)
@registry.histogram(:cc_diego_sync_duration_hist, docstring: 'Diego cell sync duration histogram') unless @registry.exist?(:cc_diego_sync_duration_hist)
@registry.gauge(:cc_diego_sync_duration_gauge, docstring: 'Diego cell sync duration gauge') unless @registry.exist?(:cc_diego_sync_duration_gauge)

@registry.summary(:cc_deployments_update_duration_summary, docstring: 'Deployment duration summary') unless @registry.exist?(:cc_deployments_update_duration_summary)
@registry.histogram(:cc_deployments_update_duration_hist, docstring: 'Deployment duration histogram') unless @registry.exist?(:cc_deployments_update_duration_hist)
@registry.gauge(:cc_deployments_update_duration_gauge, docstring: 'Deployment duration gauge') unless @registry.exist?(:cc_deployments_update_duration_gauge)

@registry.gauge(:cc_requests_outstanding_gauge, docstring: 'Requests Outstanding Gauge') unless @registry.exist?(:cc_requests_outstanding_gauge)
@registry.counter(:cc_requests_completed, docstring: 'Requests Completed') unless @registry.exist?(:cc_requests_completed)
end
Expand Down Expand Up @@ -101,10 +92,6 @@ def update_task_stats(total_running_tasks, total_memory_in_bytes)
update_gauge_metric(:cc_tasks_running_memory_in_mb, total_memory_in_bytes, 'Total memory consumed by running tasks')
end

def update_synced_invalid_lrps(lrp_count)
update_gauge_metric(:cc_diego_sync_invalid_desired_lrps, lrp_count, 'Invalid Desired LRPs')
end

def start_staging_request_received
increment_counter_metric(:cc_staging_requested, 'Number of staging requests')
end
Expand All @@ -117,18 +104,6 @@ def report_staging_failure_metrics(duration_ns)
update_histogram_metric(:cc_staging_failed_duration_seconds, nanoseconds_to_seconds(duration_ns), 'Durations of failed staging events')
end

def report_diego_cell_sync_duration(duration_seconds)
update_summary_metric(:cc_diego_sync_duration_summary, duration_seconds, 'Diego cell sync duration summary')
update_histogram_metric(:cc_diego_sync_duration_hist, duration_seconds, 'Diego cell sync duration histogram')
update_gauge_metric(:cc_diego_sync_duration_gauge, duration_seconds, 'Diego cell sync duration (gauge metric)')
end

def report_deployment_duration(duration_seconds)
update_summary_metric(:cc_deployments_update_duration_summary, duration_seconds, 'Deployment duration summary')
update_histogram_metric(:cc_deployments_update_duration_hist, duration_seconds, 'Deployment duration histogram')
update_gauge_metric(:cc_deployments_update_duration_gauge, duration_seconds, 'Deployment duration (gauge metric)')
end

private

def nanoseconds_to_seconds(time_ns)
Expand Down
1 change: 0 additions & 1 deletion spec/unit/jobs/diego/sync_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ module Jobs::Diego
expect(tasks_sync).to receive(:sync)
expect(Time).to receive(:now).twice # Ensure that we get two time measurements. _Hopefully_ they get turned into an elapsed time and passed in where they need to be!
expect_any_instance_of(Statsd).to receive(:timing).with('cc.diego_sync.duration', kind_of(Numeric))
expect_any_instance_of(VCAP::CloudController::Metrics::PrometheusUpdater).to receive(:report_diego_cell_sync_duration).with(kind_of(Numeric))

job.perform
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ module VCAP::CloudController
let(:lock_worker) { instance_double(Locket::LockWorker) }
let(:logger) { instance_double(Steno::Logger, info: nil, debug: nil, error: nil) }
let(:statsd_client) { instance_double(Statsd) }
let(:prometheus_updater) { instance_double(VCAP::CloudController::Metrics::PrometheusUpdater) }

before do
allow(Locket::LockRunner).to receive(:new).and_return(lock_runner)
Expand All @@ -38,9 +37,8 @@ module VCAP::CloudController
allow(lock_worker).to receive(:acquire_lock_and_repeatedly_call).and_yield
allow(DeploymentUpdater::Scheduler).to receive(:sleep)
allow(DeploymentUpdater::Dispatcher).to receive(:dispatch)
allow(CloudController::DependencyLocator.instance).to receive_messages(statsd_client:, prometheus_updater:)
allow(CloudController::DependencyLocator.instance).to receive_messages(statsd_client:)
allow(statsd_client).to receive(:timing)
allow(prometheus_updater).to receive(:report_deployment_duration)
end

it 'correctly configures a LockRunner and uses it to initialize a LockWorker' do
Expand Down Expand Up @@ -132,7 +130,6 @@ module VCAP::CloudController
it 'records the deployment update duration' do
expect(DeploymentUpdater::Dispatcher).to receive(:dispatch)
expect(statsd_client).to receive(:timing).with('cc.deployments.update.duration', kind_of(Numeric))
expect(prometheus_updater).to receive(:report_deployment_duration).with(kind_of(Numeric))

DeploymentUpdater::Scheduler.start
end
Expand Down
34 changes: 0 additions & 34 deletions spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -193,14 +193,6 @@ module VCAP::CloudController::Metrics
end
end

describe '#update_synced_invalid_lrps' do
it 'records number of running tasks and task memory to statsd' do
updater.update_synced_invalid_lrps(5)
metric = prom_client.metrics.find { |m| m.name == :cc_diego_sync_invalid_desired_lrps }
expect(metric.get).to eq 5
end
end

describe '#start_staging_request_received' do
it 'increments "cc_staging_requested"' do
updater.start_staging_request_received
Expand Down Expand Up @@ -242,31 +234,5 @@ module VCAP::CloudController::Metrics
expect(metric.get).to eq({ '10000.0' => 0, '15000.0' => 0, '20000.0' => 1, '25000.0' => 1, '30000.0' => 1, 'sum' => 20_000, '+Inf' => 1 })
end
end

describe '#report_diego_cell_sync_duration' do
it 'reports diego cell sync duration' do
duration_ns = 20 * 1e9

updater.report_diego_cell_sync_duration(duration_ns)
metric = prom_client.metrics.find { |m| m.name == :cc_diego_sync_duration }
expect(metric.get).to eq({ 'count' => 1.0, 'sum' => 20_000_000_000.0 })

metric = prom_client.metrics.find { |m| m.name == :cc_diego_sync_duration_gauge }
expect(metric.get).to eq duration_ns
end
end

describe '#report_deployment_duration' do
it 'reports deployments update duration' do
duration_ns = 20 * 1e9

updater.report_deployment_duration(duration_ns)
metric = prom_client.metrics.find { |m| m.name == :cc_deployments_update_duration }
expect(metric.get).to eq({ 'count' => 1.0, 'sum' => 20_000_000_000.0 })

metric = prom_client.metrics.find { |m| m.name == :cc_deployments_update_duration_gauge }
expect(metric.get).to eq duration_ns
end
end
end
end

0 comments on commit 7a4f401

Please sign in to comment.