From b278a0465f4278c8010764cd439ef58da41a803c Mon Sep 17 00:00:00 2001 From: Kenneth Lakin Date: Mon, 11 Apr 2022 23:51:53 +0000 Subject: [PATCH] Creates metrics endpoint for use by prom-scraper Notable changes in this commit: * Adds an `/internal/v4/metrics` endpoint which serves Prometheus-style metrics. * Adds function to Prometheus::Client::Registry to re-create the library's metrics registry. This is used by the PrometheusClient class's tests to wipe out the metrics registry, which is global state. Global state is the #1 cause of test pollution and this change seemed to be the most straightforward way to eliminate some order-dependent test failures. NOTE: This function definition is only in the `prometheus_updater_spec.rb` file, which we believe is not shipped in capi-release, so the odds of actually writing production code that depends on this test-only functionality are pretty slim. Co-authored-by: David Alvarado Co-authored-by: Kenneth Lakin Co-authored-by: Merric de Launey Co-authored-by: Michael Oleske --- Gemfile | 1 + Gemfile.lock | 4 +- .../internal/metrics_controller.rb | 25 ++ .../internal/staging_completion_controller.rb | 6 + app/jobs/diego/sync.rb | 17 +- lib/cloud_controller/dependency_locator.rb | 8 + .../deployment_updater/scheduler.rb | 18 +- .../metrics/periodic_updater.rb | 10 +- .../metrics/prometheus_updater.rb | 143 +++++++ .../metrics/request_metrics.rb | 15 +- .../metrics/statsd_updater.rb | 2 +- spec/request/internal/metrics_spec.rb | 142 +++++++ .../internal/metric_controller_spec.rb | 34 ++ .../staging_completion_controller_spec.rb | 12 + spec/unit/jobs/diego/sync_spec.rb | 19 +- .../deployment_updater/scheduler_spec.rb | 20 +- .../metrics/periodic_updater_spec.rb | 26 +- .../metrics/prometheus_updater_spec.rb | 350 ++++++++++++++++++ .../metrics/request_metrics_spec.rb | 19 +- .../metrics/statsd_updater_spec.rb | 4 +- 20 files changed, 820 insertions(+), 55 deletions(-) create mode 100644 
app/controllers/internal/metrics_controller.rb create mode 100644 lib/cloud_controller/metrics/prometheus_updater.rb create mode 100644 spec/request/internal/metrics_spec.rb create mode 100644 spec/unit/controllers/internal/metric_controller_spec.rb create mode 100644 spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb diff --git a/Gemfile b/Gemfile index c4b151cae7d..35cef3fa722 100644 --- a/Gemfile +++ b/Gemfile @@ -40,6 +40,7 @@ gem 'sequel_pg', require: 'sequel' gem 'sinatra', '~> 2.2' gem 'sinatra-contrib' gem 'statsd-ruby', '~> 1.4.0' +gem 'prometheus-client' gem 'steno' gem 'talentbox-delayed_job_sequel', '~> 4.3.0' gem 'thin' diff --git a/Gemfile.lock b/Gemfile.lock index f4f5cacff6a..b1547eed23c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -352,6 +352,7 @@ GEM ast (~> 2.4.1) pg (1.3.5) posix-spawn (0.3.15) + prometheus-client (3.0.0) protobuf (3.6.12) activesupport (>= 3.2) middleware @@ -600,6 +601,7 @@ DEPENDENCIES parallel_tests pg posix-spawn (~> 0.3.15) + prometheus-client protobuf (= 3.6.12) pry-byebug psych (>= 4.0.4) @@ -640,4 +642,4 @@ DEPENDENCIES yajl-ruby BUNDLED WITH - 2.1.4 + 2.2.26 diff --git a/app/controllers/internal/metrics_controller.rb b/app/controllers/internal/metrics_controller.rb new file mode 100644 index 00000000000..e65e3d31d1b --- /dev/null +++ b/app/controllers/internal/metrics_controller.rb @@ -0,0 +1,25 @@ +require 'prometheus/client' +require 'prometheus/client/formats/text' +require 'cloud_controller/metrics/prometheus_updater' + +module VCAP::CloudController + module Internal + class MetricsController < RestController::BaseController + allow_unauthenticated_access + get '/internal/v4/metrics', :index + + def index + periodic_updater = VCAP::CloudController::Metrics::PeriodicUpdater.new( + Time.now.utc, + Steno::Sink::Counter.new, + Steno.logger('cc.api'), + [ + VCAP::CloudController::Metrics::StatsdUpdater.new, + VCAP::CloudController::Metrics::PrometheusUpdater.new + ]) + periodic_updater.update! 
+ [200, Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)] + end + end + end +end diff --git a/app/controllers/internal/staging_completion_controller.rb b/app/controllers/internal/staging_completion_controller.rb index fda00f0743e..54ecef89156 100644 --- a/app/controllers/internal/staging_completion_controller.rb +++ b/app/controllers/internal/staging_completion_controller.rb @@ -117,8 +117,10 @@ def report_metrics(bbs_staging_response) duration = Time.now.utc.to_i * 1e9 - bbs_staging_response[:created_at] if bbs_staging_response[:failed] statsd_updater.report_staging_failure_metrics(duration) + prometheus_updater.report_staging_failure_metrics(duration) else statsd_updater.report_staging_success_metrics(duration) + prometheus_updater.report_staging_success_metrics(duration) end end @@ -126,6 +128,10 @@ def statsd_updater @statsd_updater ||= VCAP::CloudController::Metrics::StatsdUpdater.new end + def prometheus_updater + @prometheus_updater ||= VCAP::CloudController::Metrics::PrometheusUpdater.new # this should be using singleton + end + attr_reader :stagers def read_body diff --git a/app/jobs/diego/sync.rb b/app/jobs/diego/sync.rb index 0ba5d886549..3889b309bb0 100644 --- a/app/jobs/diego/sync.rb +++ b/app/jobs/diego/sync.rb @@ -7,15 +7,28 @@ module VCAP::CloudController module Jobs module Diego class Sync < VCAP::CloudController::Jobs::CCJob - def initialize(statsd=Statsd.new) + def initialize(statsd=Statsd.new, prometheus_updater=VCAP::CloudController::Metrics::PrometheusUpdater.new) @statsd = statsd + @prometheus_updater = prometheus_updater end def perform config = CloudController::DependencyLocator.instance.config - @statsd.time('cc.diego_sync.duration') do + begin + ## TODO: At some point in the future, start using a monotonic time source, rather than wall-clock time! 
+ start = Time.now VCAP::CloudController::Diego::ProcessesSync.new(config: config).sync VCAP::CloudController::Diego::TasksSync.new(config: config).sync + ensure + finish = Time.now + ## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have + ## access to time in milliseconds. If you ever get access to reliable time in + ## milliseconds, then do know that the lack of precision here is not desired + ## so feed in the entire value! + elapsed_ms = ((finish - start) * 1000).round + + @statsd.timing('cc.diego_sync.duration', elapsed_ms) + @prometheus_updater.report_diego_cell_sync_duration(elapsed_ms) end end diff --git a/lib/cloud_controller/dependency_locator.rb b/lib/cloud_controller/dependency_locator.rb index 5c2ec46258b..db2e77cd4e6 100644 --- a/lib/cloud_controller/dependency_locator.rb +++ b/lib/cloud_controller/dependency_locator.rb @@ -34,6 +34,7 @@ require 'cloud_controller/opi/instances_client' require 'cloud_controller/opi/stager_client' require 'cloud_controller/opi/task_client' +require 'cloud_controller/metrics/prometheus_updater' require 'bits_service_client' @@ -70,6 +71,13 @@ def runners @dependencies[:runners] || register(:runners, VCAP::CloudController::Runners.new(config)) end + def prometheus_updater + unless @dependencies[:prometheus_updater] + register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new) + end + @dependencies[:prometheus_updater] + end + def stagers @dependencies[:stagers] || register(:stagers, VCAP::CloudController::Stagers.new(config)) end diff --git a/lib/cloud_controller/deployment_updater/scheduler.rb b/lib/cloud_controller/deployment_updater/scheduler.rb index c0dc27790f0..82cd1a03bba 100644 --- a/lib/cloud_controller/deployment_updater/scheduler.rb +++ b/lib/cloud_controller/deployment_updater/scheduler.rb @@ -10,10 +10,12 @@ def start with_error_logging('cc.deployment_updater') do config = CloudController::DependencyLocator.instance.config statsd_client = 
CloudController::DependencyLocator.instance.statsd_client + prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater update_step = proc { update( update_frequency: config.get(:deployment_updater, :update_frequency_in_seconds), - statsd_client: statsd_client + statsd_client: statsd_client, + prometheus_updater: prometheus_updater ) } @@ -39,14 +41,20 @@ def start private - def update(update_frequency:, statsd_client:) + def update(update_frequency:, statsd_client:, prometheus_updater:) logger = Steno.logger('cc.deployment_updater.scheduler') update_start_time = Time.now - statsd_client.time('cc.deployments.update.duration') do - Dispatcher.dispatch - end + Dispatcher.dispatch update_duration = Time.now - update_start_time + ## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have + ## access to time in milliseconds. If you ever get access to reliable time in + ## milliseconds, then do know that the lack of precision here is not desired + ## so feed in the entire value! 
+ update_duration_ms = update_duration * 1000 + statsd_client.timing('cc.deployments.update.duration', update_duration_ms) + prometheus_updater.report_deployment_duration(update_duration_ms) + logger.info("Update loop took #{update_duration}s") sleep_duration = update_frequency - update_duration diff --git a/lib/cloud_controller/metrics/periodic_updater.rb b/lib/cloud_controller/metrics/periodic_updater.rb index 20a04238464..2e5e77d9532 100644 --- a/lib/cloud_controller/metrics/periodic_updater.rb +++ b/lib/cloud_controller/metrics/periodic_updater.rb @@ -3,7 +3,7 @@ module VCAP::CloudController::Metrics class PeriodicUpdater - def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new]) + def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new, PrometheusUpdater.new]) @start_time = start_time @updaters = updaters @log_counter = log_counter @@ -15,7 +15,7 @@ def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpd def setup_updates update! - EM.add_periodic_timer(600) { catch_error { record_user_count } } + EM.add_periodic_timer(600) { catch_error { update_user_count } } EM.add_periodic_timer(30) { catch_error { update_job_queue_length } } EM.add_periodic_timer(30) { catch_error { update_thread_info } } EM.add_periodic_timer(30) { catch_error { update_failed_job_count } } @@ -26,7 +26,7 @@ def setup_updates end def update! 
- record_user_count + update_user_count update_job_queue_length update_thread_info update_failed_job_count @@ -67,10 +67,10 @@ def update_deploying_count @updaters.each { |u| u.update_deploying_count(deploying_count) } end - def record_user_count + def update_user_count user_count = VCAP::CloudController::User.count - @updaters.each { |u| u.record_user_count(user_count) } + @updaters.each { |u| u.update_user_count(user_count) } end def update_job_queue_length diff --git a/lib/cloud_controller/metrics/prometheus_updater.rb b/lib/cloud_controller/metrics/prometheus_updater.rb new file mode 100644 index 00000000000..e3c07a4ed53 --- /dev/null +++ b/lib/cloud_controller/metrics/prometheus_updater.rb @@ -0,0 +1,143 @@ +require 'prometheus/client' + +module VCAP::CloudController::Metrics + class PrometheusUpdater + def initialize(registry=Prometheus::Client.registry) + @registry = registry + end + + def update_gauge_metric(metric, value, message) + unless @registry.exist?(metric) + @registry.gauge(metric, docstring: message) + end + @registry.get(metric).set(value) + end + + def increment_gauge_metric(metric, message) + unless @registry.exist?(metric) + @registry.gauge(metric, docstring: message) + end + @registry.get(metric).increment + end + + def decrement_gauge_metric(metric, message) + unless @registry.exist?(metric) + @registry.gauge(metric, docstring: message) + end + @registry.get(metric).decrement + end + + def increment_counter_metric(metric, message) + unless @registry.exist?(metric) + @registry.counter(metric, docstring: message) + end + @registry.get(metric).increment + end + + def update_histogram_metric(metric, value, message, buckets) + unless @registry.exist?(metric) + @registry.histogram(metric, buckets: buckets, docstring: message) + end + @registry.get(metric).observe(value) + end + + def update_summary_metric(metric, value, message) + unless @registry.exist?(metric) + @registry.summary(metric, docstring: message) + end + 
@registry.get(metric).observe(value) + end + + def update_deploying_count(deploying_count) + update_gauge_metric(:cc_deployments_deploying, deploying_count, 'Number of in progress deployments') + end + + def update_user_count(user_count) + update_gauge_metric(:cc_total_users, user_count, 'Number of users') + end + + def update_job_queue_length(pending_job_count_by_queue, total) + pending_job_count_by_queue.each do |key, value| + metric_key = :"cc_job_queue_length_#{key.to_s.underscore}" + update_gauge_metric(metric_key, value, docstring: "Job queue length for worker #{key}") + end + + update_gauge_metric(:cc_job_queue_length_total, total, 'Total job queue length') + end + + def update_thread_info(thread_info) + update_gauge_metric(:cc_thread_info_thread_count, thread_info[:thread_count], 'Thread count') + update_gauge_metric(:cc_thread_info_event_machine_connection_count, thread_info[:event_machine][:connection_count], 'Event Machine connection count') + update_gauge_metric(:cc_thread_info_event_machine_threadqueue_size, thread_info[:event_machine][:threadqueue][:size], 'EventMachine thread queue size') + update_gauge_metric(:cc_thread_info_event_machine_threadqueue_num_waiting, thread_info[:event_machine][:threadqueue][:num_waiting], 'EventMachine num waiting in thread') + update_gauge_metric(:cc_thread_info_event_machine_resultqueue_size, thread_info[:event_machine][:resultqueue][:size], 'EventMachine queue size') + update_gauge_metric(:cc_thread_info_event_machine_resultqueue_num_waiting, thread_info[:event_machine][:resultqueue][:num_waiting], 'EventMachine requests waiting in queue') + end + + def update_failed_job_count(failed_jobs_by_queue, total) + failed_jobs_by_queue.each do |key, value| + metric_key = :"cc_failed_job_count_#{key.to_s.underscore}" + update_gauge_metric(metric_key, value, "Failed jobs for worker #{key}") + end + + update_gauge_metric(:cc_failed_job_count_total, total, 'Total failed jobs') + end + + def update_vitals(vitals) + vitals.each 
do |key, value| + metric_key = :"cc_vitals_#{key.to_s.underscore}" + update_gauge_metric(metric_key, value, "CloudController Vitals: #{key}") + end + end + + def update_log_counts(counts) + counts.each do |key, value| + metric_key = :"cc_log_count_#{key.to_s.underscore}" + update_gauge_metric(metric_key, value, "Log count for log level '#{key}'") + end + end + + def update_task_stats(total_running_tasks, total_memory_in_mb) + update_gauge_metric(:cc_tasks_running_count, total_running_tasks, 'Total running tasks') + update_gauge_metric(:cc_tasks_running_memory_in_mb, total_memory_in_mb, 'Total memory consumed by running tasks') + end + + def update_synced_invalid_lrps(lrp_count) + update_gauge_metric(:cc_diego_sync_invalid_desired_lrps, lrp_count, 'Invalid Desired LRPs') + end + + def start_staging_request_received + increment_counter_metric(:cc_staging_requested, 'Number of staging requests') + end + + def report_staging_success_metrics(duration_ns) + increment_counter_metric(:cc_staging_succeeded, 'Number of successful staging events') + update_histogram_metric(:cc_staging_succeeded_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of successful staging events', duration_buckets) + end + + def report_staging_failure_metrics(duration_ns) + increment_counter_metric(:cc_staging_failed, 'Number of failed staging events') + update_histogram_metric(:cc_staging_failed_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of failed staging events', duration_buckets) + end + + def report_diego_cell_sync_duration(duration_ms) + update_summary_metric(:cc_diego_sync_duration, duration_ms, 'Diego cell sync duration') + update_gauge_metric(:cc_diego_sync_duration_gauge, duration_ms, 'Diego cell sync duration (gauge metric)') + end + + def report_deployment_duration(duration_ms) + update_summary_metric(:cc_deployments_update_duration, duration_ms, 'Deployment duration') + update_gauge_metric(:cc_deployments_update_duration_gauge, duration_ms, 'Deployment 
duration (gauge metric)') + end + + private + + def duration_buckets + Prometheus::Client::Histogram.linear_buckets(start: 10000, width: 5000, count: 5) + end + + def nanoseconds_to_milliseconds(time_ns) + (time_ns / 1e6).to_i + end + end +end diff --git a/lib/cloud_controller/metrics/request_metrics.rb b/lib/cloud_controller/metrics/request_metrics.rb index 6847b71733d..83a0122112e 100644 --- a/lib/cloud_controller/metrics/request_metrics.rb +++ b/lib/cloud_controller/metrics/request_metrics.rb @@ -3,25 +3,36 @@ module VCAP::CloudController module Metrics class RequestMetrics - def initialize(statsd=Statsd.new) + def initialize(statsd=Statsd.new, prometheus_updater=PrometheusUpdater.new) @counter = 0 @statsd = statsd + @prometheus_updater = prometheus_updater end def start_request @counter += 1 @statsd.gauge('cc.requests.outstanding.gauge', @counter) @statsd.increment 'cc.requests.outstanding' + + @prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge') + @prometheus_updater.increment_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding') end def complete_request(status) + http_status_code = "#{status.to_s[0]}XX" + http_status_metric = "cc.http_status.#{http_status_code}" @counter -= 1 @statsd.gauge('cc.requests.outstanding.gauge', @counter) @statsd.batch do |batch| batch.decrement 'cc.requests.outstanding' batch.increment 'cc.requests.completed' - batch.increment "cc.http_status.#{status.to_s[0]}XX" + batch.increment http_status_metric end + + @prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge') + @prometheus_updater.decrement_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding') + @prometheus_updater.increment_gauge_metric(:cc_requests_completed, 'Requests Completed') + @prometheus_updater.increment_gauge_metric(http_status_metric.gsub('.', '_').to_sym, "Times HTTP status #{http_status_code} have been received") end end end diff 
--git a/lib/cloud_controller/metrics/statsd_updater.rb b/lib/cloud_controller/metrics/statsd_updater.rb index f45b716b0c4..49ce3d7aacc 100644 --- a/lib/cloud_controller/metrics/statsd_updater.rb +++ b/lib/cloud_controller/metrics/statsd_updater.rb @@ -10,7 +10,7 @@ def update_deploying_count(deploying_count) @statsd.gauge('cc.deployments.deploying', deploying_count) end - def record_user_count(user_count) + def update_user_count(user_count) @statsd.gauge('cc.total_users', user_count) end diff --git a/spec/request/internal/metrics_spec.rb b/spec/request/internal/metrics_spec.rb new file mode 100644 index 00000000000..6e78d91ce05 --- /dev/null +++ b/spec/request/internal/metrics_spec.rb @@ -0,0 +1,142 @@ +require 'spec_helper' + +RSpec.describe 'Metrics' do + let(:user) { VCAP::CloudController::User.make } + let(:user_header) { headers_for(user) } + let(:threadqueue) { double(EventMachine::Queue, size: 20, num_waiting: 0) } + let(:resultqueue) { double(EventMachine::Queue, size: 0, num_waiting: 1) } + + before do + # locator = CloudController::DependencyLocator.instance + # allow(locator).to receive(:prometheus_updater).and_return(VCAP::CloudController::Metrics::PrometheusUpdater.new(Prometheus::Client::Registry.new)) + allow(EventMachine).to receive(:connection_count).and_return(123) + + allow(EventMachine).to receive(:instance_variable_get) do |instance_var| + case instance_var + when :@threadqueue + threadqueue + when :@resultqueue + resultqueue + else + raise "Unexpected call: #{instance_var}" + end + end + end + + it 'succeeds' do + get '/internal/v4/metrics', nil + + expect(last_response.status).to eq 200 + get '/internal/v4/metrics', nil + + expect(last_response.status).to eq 200 + get '/internal/v4/metrics', nil + + expect(last_response.status).to eq 200 + end + + context 'cc_total_users' do + before do + cc_total_users = Prometheus::Client.registry.get(:cc_total_users) + if !cc_total_users.nil? 
+ cc_total_users.set(0) + end + + # Prometheus::Client::Config::data_store = nil + 10.times do + VCAP::CloudController::User.make + end + end + + it 'reports the total number of users' do + get '/internal/v4/metrics', nil + + expect(last_response.status).to eq 200 + + expect(last_response.body).to include('cc_total_users 10.0') + end + end + + context 'cc_vitals' do + it 'reports vitals' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_vitals_num_cores [1-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_vitals_uptime [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_vitals_cpu [1-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_vitals_mem_bytes [1-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_vitals_cpu_load_avg [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_vitals_mem_used_bytes [1-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_vitals_mem_free_bytes [1-9][0-9]*\.\d+/) + end + end + + context 'cc_job_queue_length' do + # NOTE: Because there is no easy way to enqueue a job that will + # stick around for long enough to appear in the metrics, + # we're only testing that the metric is emitted in the output. + # If you can figure out how to actually get a job enqueued that + # will show up in the "job queue length" metric, please do update + # the test! 
+ it 'includes job queue length metric in output' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_job_queue_length_cc_api_0 [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_job_queue_length_total [0-9][0-9]*\.\d+/) + end + end + + context 'cc_thread_info' do + it 'reports thread info' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_thread_info_thread_count [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_thread_info_event_machine_connection_count [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_thread_info_event_machine_threadqueue_size [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_thread_info_event_machine_threadqueue_num_waiting [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_thread_info_event_machine_resultqueue_size [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_thread_info_event_machine_resultqueue_num_waiting [0-9][0-9]*\.\d+/) + end + end + + context 'cc_failed_job_count' do + it 'reports failed job count' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_failed_job_count_cc_api_0 [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_failed_job_count_total [0-9][0-9]*\.\d+/) + end + end + + context 'cc_log_count' do + it 'reports log counts' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_log_count_off [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_fatal [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_error [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_warn [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_info [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_debug1 [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_debug2 [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_log_count_all [0-9][0-9]*\.\d+/) 
+ end + end + + context 'cc_task_stats' do + it 'reports task stats' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_tasks_running_count [0-9][0-9]*\.\d+/) + expect(last_response.body).to match(/cc_tasks_running_memory_in_mb [0-9][0-9]*\.\d+/) + end + end + + context 'cc_deploying_count' do + it 'reports deploying_count' do + get '/internal/v4/metrics', nil + + expect(last_response.body).to match(/cc_deployments_deploying [0-9][0-9]*\.\d+/) + end + end +end diff --git a/spec/unit/controllers/internal/metric_controller_spec.rb b/spec/unit/controllers/internal/metric_controller_spec.rb new file mode 100644 index 00000000000..b31e4db1ddc --- /dev/null +++ b/spec/unit/controllers/internal/metric_controller_spec.rb @@ -0,0 +1,34 @@ +require 'spec_helper' + +module VCAP::CloudController + module Internal + RSpec.describe MetricsController do + let(:threadqueue) { double(EventMachine::Queue, size: 20, num_waiting: 0) } + let(:resultqueue) { double(EventMachine::Queue, size: 0, num_waiting: 1) } + + before do + allow(EventMachine).to receive(:connection_count).and_return(123) + + allow(EventMachine).to receive(:instance_variable_get) do |instance_var| + case instance_var + when :@threadqueue + threadqueue + when :@resultqueue + resultqueue + else + raise "Unexpected call: #{instance_var}" + end + end + end + + describe '#index' do + it 'returns a 200' do + get '/internal/v4/metrics' + + expect(last_response.status).to eq 200 + expect(last_response.body).to match(/cc_vitals_num_cores [1-9][0-9]*.\d+/) + end + end + end + end +end diff --git a/spec/unit/controllers/internal/staging_completion_controller_spec.rb b/spec/unit/controllers/internal/staging_completion_controller_spec.rb index c0713b921ac..608df70341d 100644 --- a/spec/unit/controllers/internal/staging_completion_controller_spec.rb +++ b/spec/unit/controllers/internal/staging_completion_controller_spec.rb @@ -1,6 +1,7 @@ require 'spec_helper' require 'membrane' require 
'cloud_controller/diego/failure_reason_sanitizer' +require 'cloud_controller/metrics/prometheus_updater' ## NOTICE: Prefer request specs over controller specs as per ADR #0003 ## @@ -37,6 +38,12 @@ module VCAP::CloudController report_staging_failure_metrics: nil, ) end + let(:prometheus_updater) do + instance_double(VCAP::CloudController::Metrics::PrometheusUpdater, + report_staging_success_metrics: nil, + report_staging_failure_metrics: nil, + ) + end let(:one_hour) { 1.hour.to_i } let(:one_hour_in_nanoseconds) { (1.hour.to_i * 1e9).to_i } @@ -44,6 +51,7 @@ module VCAP::CloudController TestConfig.override(kubernetes: nil) allow(VCAP::CloudController::Metrics::StatsdUpdater).to receive(:new).and_return(statsd_updater) + allow(VCAP::CloudController::Metrics::PrometheusUpdater).to receive(:new).and_return(prometheus_updater) end context 'staging a package through /droplet_completed (legacy for rolling deploy)' do @@ -143,6 +151,7 @@ module VCAP::CloudController it 'emits metrics for staging success' do one_hour_in_nanoseconds = (1.hour.to_i * 1e9).to_i expect(statsd_updater).to receive(:report_staging_success_metrics).with(one_hour_in_nanoseconds) + expect(prometheus_updater).to receive(:report_staging_success_metrics).with(one_hour_in_nanoseconds) Timecop.freeze(Time.now) do post url, MultiJson.dump(staging_response) end @@ -161,6 +170,7 @@ module VCAP::CloudController it 'emits metrics for staging failure' do one_hour_in_nanoseconds = (1.hour.to_i * 1e9).to_i expect(statsd_updater).to receive(:report_staging_failure_metrics).with(one_hour_in_nanoseconds) + expect(prometheus_updater).to receive(:report_staging_failure_metrics).with(one_hour_in_nanoseconds) Timecop.freeze(Time.now) do post url, MultiJson.dump(staging_response) end @@ -357,6 +367,7 @@ module VCAP::CloudController it 'emits metrics for staging success' do one_hour_in_nanoseconds = (1.hour.to_i * 1e9).to_i expect(statsd_updater).to receive(:report_staging_success_metrics).with(one_hour_in_nanoseconds) + 
expect(prometheus_updater).to receive(:report_staging_success_metrics).with(one_hour_in_nanoseconds) Timecop.freeze(Time.now) do post url, MultiJson.dump(staging_response) end @@ -392,6 +403,7 @@ module VCAP::CloudController it 'emits metrics for staging failure' do one_hour_in_nanoseconds = (1.hour.to_i * 1e9).to_i expect(statsd_updater).to receive(:report_staging_failure_metrics).with(one_hour_in_nanoseconds) + expect(prometheus_updater).to receive(:report_staging_failure_metrics).with(one_hour_in_nanoseconds) Timecop.freeze(Time.now) do post url, MultiJson.dump(staging_response) end diff --git a/spec/unit/jobs/diego/sync_spec.rb b/spec/unit/jobs/diego/sync_spec.rb index 63e732f2a90..944c99a739e 100644 --- a/spec/unit/jobs/diego/sync_spec.rb +++ b/spec/unit/jobs/diego/sync_spec.rb @@ -27,20 +27,17 @@ module Jobs::Diego end it 'records sync duration' do - yielded_block = nil + allow(Time).to receive(:now).and_call_original + allow_any_instance_of(VCAP::CloudController::Diego::ProcessesSync).to receive(:sync) + allow_any_instance_of(VCAP::CloudController::Diego::TasksSync).to receive(:sync) - allow_any_instance_of(Statsd).to receive(:time) do |_, metric_name, &block| - expect(metric_name).to eq 'cc.diego_sync.duration' - yielded_block = block - end + expect(processes_sync).to receive(:sync) + expect(tasks_sync).to receive(:sync) + expect(Time).to receive(:now).twice # Ensure that we get two time measurements. _Hopefully_ they get turned into an elapsed time and passed in where they need to be! 
+ expect_any_instance_of(Statsd).to receive(:timing).with('cc.diego_sync.duration', kind_of(Numeric)) + expect_any_instance_of(VCAP::CloudController::Metrics::PrometheusUpdater).to receive(:report_diego_cell_sync_duration).with(kind_of(Numeric)) job.perform - expect(processes_sync).to_not have_received(:sync) - expect(tasks_sync).to_not have_received(:sync) - - yielded_block.call - expect(processes_sync).to have_received(:sync) - expect(tasks_sync).to have_received(:sync) end end end diff --git a/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb b/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb index f946916ebcd..10e4e336f97 100644 --- a/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb +++ b/spec/unit/lib/cloud_controller/deployment_updater/scheduler_spec.rb @@ -27,6 +27,7 @@ module VCAP::CloudController let(:lock_worker) { instance_double(Locket::LockWorker) } let(:logger) { instance_double(Steno::Logger, info: nil, debug: nil, error: nil) } let(:statsd_client) { instance_double(Statsd) } + let(:prometheus_updater) { instance_double(VCAP::CloudController::Metrics::PrometheusUpdater) } before do allow(Locket::LockRunner).to receive(:new).and_return(lock_runner) @@ -37,7 +38,9 @@ module VCAP::CloudController allow(DeploymentUpdater::Scheduler).to receive(:sleep) allow(DeploymentUpdater::Dispatcher).to receive(:dispatch) allow(CloudController::DependencyLocator.instance).to receive(:statsd_client).and_return(statsd_client) - allow(statsd_client).to receive(:time).and_yield + allow(CloudController::DependencyLocator.instance).to receive(:prometheus_updater).and_return(prometheus_updater) + allow(statsd_client).to receive(:timing) + allow(prometheus_updater).to receive(:report_deployment_duration) end it 'correctly configures a LockRunner and uses it to initialize a LockWorker' do @@ -125,20 +128,13 @@ module VCAP::CloudController end end - describe 'statsd metrics' do + describe 'metrics' do it 'records the 
deployment update duration' do - timed_block = nil - - allow(statsd_client).to receive(:time) do |_, &block| - timed_block = block - end + expect(DeploymentUpdater::Dispatcher).to receive(:dispatch) + expect(statsd_client).to receive(:timing).with('cc.deployments.update.duration', kind_of(Numeric)) + expect(prometheus_updater).to receive(:report_deployment_duration).with(kind_of(Numeric)) DeploymentUpdater::Scheduler.start - expect(statsd_client).to have_received(:time).with('cc.deployments.update.duration') - - expect(DeploymentUpdater::Dispatcher).to_not have_received(:dispatch) - timed_block.call - expect(DeploymentUpdater::Dispatcher).to have_received(:dispatch) end end diff --git a/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb index e94be504ba9..98051e69180 100644 --- a/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb @@ -70,7 +70,7 @@ module VCAP::CloudController::Metrics describe '#setup_updates' do before do - allow(updater1).to receive(:record_user_count) + allow(updater1).to receive(:update_user_count) allow(updater1).to receive(:update_job_queue_length) allow(updater1).to receive(:update_thread_info) allow(updater1).to receive(:update_failed_job_count) @@ -79,7 +79,7 @@ module VCAP::CloudController::Metrics allow(updater1).to receive(:update_task_stats) allow(updater1).to receive(:update_deploying_count) - allow(updater2).to receive(:record_user_count) + allow(updater2).to receive(:update_user_count) allow(updater2).to receive(:update_job_queue_length) allow(updater2).to receive(:update_thread_info) allow(updater2).to receive(:update_failed_job_count) @@ -92,7 +92,7 @@ module VCAP::CloudController::Metrics end it 'bumps the number of users and sets periodic timer' do - expect(periodic_updater).to receive(:record_user_count).once + expect(periodic_updater).to 
receive(:update_user_count).once periodic_updater.setup_updates end @@ -147,7 +147,7 @@ module VCAP::CloudController::Metrics it 'bumps the number of users and sets periodic timer' do expect(periodic_updater).to receive(:catch_error).once.and_call_original - expect(periodic_updater).to receive(:record_user_count).once + expect(periodic_updater).to receive(:update_user_count).once expect(@periodic_timers[0][:interval]).to eq(600) @periodic_timers[0][:block].call @@ -219,19 +219,19 @@ module VCAP::CloudController::Metrics end end - describe '#record_user_count' do + describe '#update_user_count' do before do - allow(updater1).to receive(:record_user_count) - allow(updater2).to receive(:record_user_count) + allow(updater1).to receive(:update_user_count) + allow(updater2).to receive(:update_user_count) end it 'should include the number of users' do 4.times { VCAP::CloudController::User.create(guid: SecureRandom.uuid) } - periodic_updater.record_user_count + periodic_updater.update_user_count - expect(updater1).to have_received(:record_user_count).with(VCAP::CloudController::User.count) - expect(updater2).to have_received(:record_user_count).with(VCAP::CloudController::User.count) + expect(updater1).to have_received(:update_user_count).with(VCAP::CloudController::User.count) + expect(updater2).to have_received(:update_user_count).with(VCAP::CloudController::User.count) end end @@ -593,7 +593,7 @@ module VCAP::CloudController::Metrics describe '#update!' 
do before do - allow(updater1).to receive(:record_user_count) + allow(updater1).to receive(:update_user_count) allow(updater1).to receive(:update_job_queue_length) allow(updater1).to receive(:update_thread_info) allow(updater1).to receive(:update_failed_job_count) @@ -602,7 +602,7 @@ module VCAP::CloudController::Metrics allow(updater1).to receive(:update_task_stats) allow(updater1).to receive(:update_deploying_count) - allow(updater2).to receive(:record_user_count) + allow(updater2).to receive(:update_user_count) allow(updater2).to receive(:update_job_queue_length) allow(updater2).to receive(:update_thread_info) allow(updater2).to receive(:update_failed_job_count) @@ -613,7 +613,7 @@ module VCAP::CloudController::Metrics end it 'calls all update methods' do - expect(periodic_updater).to receive(:record_user_count).once + expect(periodic_updater).to receive(:update_user_count).once expect(periodic_updater).to receive(:update_job_queue_length).once expect(periodic_updater).to receive(:update_thread_info).once expect(periodic_updater).to receive(:update_failed_job_count).once diff --git a/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb new file mode 100644 index 00000000000..4192d9c8ea4 --- /dev/null +++ b/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb @@ -0,0 +1,350 @@ +require 'spec_helper' +require 'cloud_controller/metrics/prometheus_updater' + +# There are a few things to NOTE here: +# 1) We're adding this function because the Prometheus Client Registry +# is interpreter-global state, so changes that happen in one test will +# absolutely affect other tests, unless the test author has taken pains +# to carefully clean up their changes. +# 2) In our CI, tests are run in parallel to speed up execution. At the time +# of this writing (2022-05-03), we use the `grosser/parallel_tests` gem to +# run tests in parallel. 
This gem uses the `grosser/parallel` gem to distribute +# work which runs workloads in separate subprocesses, rather than threads +# in the same interpreter process. +# You will notice that we call this function that we created in a top-level +# `before` block a little bit later in this file, and _also_ that we mention +# above that the Registry is interpreter-global state. Because our +# test-parallelizer uses subprocesses to distribute its work, and also does +# not interleave tests from one file with tests from another, the registry +# recreation that we're doing here will be restricted to the tests in _this_ +# file and will not leak out into other files. +# 3) So, if you see weird or unreliable behavior out of the Registry when running +# specs (this would probably be stuff like "metric values are sometimes not what +# they should be"), check to see if our test parallelizer is still using +# subprocesses, or if it has switched to threads and runs everything in one +# interpreter. +module Prometheus + module Client + def self.recreate_registry + @registry = Prometheus::Client::Registry.new + end + end +end + +module VCAP::CloudController::Metrics + RSpec.describe PrometheusUpdater do + let(:updater) { PrometheusUpdater.new(prom_client) } + let(:prom_client) { Prometheus::Client.registry } + + before do + Prometheus::Client.recreate_registry + end + + describe 'Prometheus creation guards work correctly' do + # This might look to be a duplicate of 'records the current number of deployments that are DEPLOYING' + # below, but it tests that at least one of the metric updating functions can be called multiple times + # without failures. Because we are re-creating the Prometheus Client Registry before every test, we + # need to have at least one test that ensures that operations on a used registry won't fail.
+ # + # Because the PrometheusUpdater is designed so that all of the functionality that calls out to Prometheus is + # contained within a few short functions that follow the same obvious "Create the metric if it doesn't exist + # and then update the metric" pattern, the expectation is that future folks that work on the code will + # be unlikely to unintentionally remove the "create the metric if it doesn't exist" guard, and will also + # be unlikely to leave the guard out if they need to create a function that works with another Prometheus + # metric datatype. + it 'does not explode when the update function is called more than once' do + expected_deploying_count = 7 + + updater.update_deploying_count(expected_deploying_count) + metric = prom_client.metrics.find { |m| m.name == :cc_deployments_deploying } + expect(metric).to be_present + expect(metric.get).to eq 7 + + updater.update_deploying_count(expected_deploying_count) + metric = prom_client.metrics.find { |m| m.name == :cc_deployments_deploying } + expect(metric).to be_present + expect(metric.get).to eq 7 + end + end + + describe '#update_deploying_count' do + it 'records the current number of deployments that are DEPLOYING' do + expected_deploying_count = 7 + + updater.update_deploying_count(expected_deploying_count) + metric = prom_client.metrics.find { |m| m.name == :cc_deployments_deploying } + expect(metric).to be_present + expect(metric.get).to eq 7 + end + end + + describe '#update_user_count' do + it 'records number of users' do + expected_user_count = 5 + + updater.update_user_count(expected_user_count) + + metric = prom_client.metrics.find { |m| m.name == :cc_total_users } + expect(metric).to be_present + expect(metric.get).to eq 5 + end + end + + describe '#update_job_queue_length' do + it 'records the length of the delayed job queues and total' do + expected_local_length = 5 + expected_generic_length = 6 + total = expected_local_length + expected_generic_length + + pending_job_count_by_queue = { + 
cc_local: expected_local_length, + cc_generic: expected_generic_length + } + + updater.update_job_queue_length(pending_job_count_by_queue, total) + + metric = prom_client.metrics.find { |m| m.name == :cc_job_queue_length_cc_local } + expect(metric.get).to eq 5 + + metric = prom_client.metrics.find { |m| m.name == :cc_job_queue_length_cc_generic } + expect(metric.get).to eq 6 + + metric = prom_client.metrics.find { |m| m.name == :cc_job_queue_length_total } + expect(metric.get).to eq 11 + end + end + + describe '#update_failed_job_count' do + it 'records the number of failed jobs in the delayed job queue and the total' do + expected_local_length = 5 + expected_generic_length = 6 + total = expected_local_length + expected_generic_length + + failed_jobs_by_queue = { + cc_local: expected_local_length, + cc_generic: expected_generic_length + } + + updater.update_failed_job_count(failed_jobs_by_queue, total) + + metric = prom_client.metrics.find { |m| m.name == :cc_failed_job_count_cc_local } + expect(metric.get).to eq 5 + + metric = prom_client.metrics.find { |m| m.name == :cc_failed_job_count_cc_generic } + expect(metric.get).to eq 6 + + metric = prom_client.metrics.find { |m| m.name == :cc_failed_job_count_total } + expect(metric.get).to eq 11 + end + end + + describe '#update_thread_info' do + it 'should contain EventMachine data' do + thread_info = { + thread_count: 5, + event_machine: { + connection_count: 10, + threadqueue: { + size: 19, + num_waiting: 2, + }, + resultqueue: { + size: 8, + num_waiting: 1, + }, + }, + } + + updater.update_thread_info(thread_info) + + metric = prom_client.metrics.find { |m| m.name == :cc_thread_info_thread_count } + expect(metric.get).to eq 5 + + metric = prom_client.metrics.find { |m| m.name == :cc_thread_info_event_machine_connection_count } + expect(metric.get).to eq 10 + + metric = prom_client.metrics.find { |m| m.name == :cc_thread_info_event_machine_threadqueue_size } + expect(metric.get).to eq 19 + + metric = 
prom_client.metrics.find { |m| m.name == :cc_thread_info_event_machine_threadqueue_num_waiting } + expect(metric.get).to eq 2 + + metric = prom_client.metrics.find { |m| m.name == :cc_thread_info_event_machine_resultqueue_size } + expect(metric.get).to eq 8 + + metric = prom_client.metrics.find { |m| m.name == :cc_thread_info_event_machine_resultqueue_num_waiting } + expect(metric.get).to eq 1 + end + end + + describe '#update_vitals' do + it 'updates vitals' do + vitals = { + uptime: 33, + cpu_load_avg: 0.5, + mem_used_bytes: 542, + mem_free_bytes: 927, + mem_bytes: 1, + cpu: 2.0, + num_cores: 4, + } + + updater.update_vitals(vitals) + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_uptime } + expect(metric.get).to eq 33 + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_cpu_load_avg } + expect(metric.get).to eq 0.5 + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_mem_used_bytes } + expect(metric.get).to eq 542 + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_mem_free_bytes } + expect(metric.get).to eq 927 + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_mem_bytes } + expect(metric.get).to eq 1 + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_cpu } + expect(metric.get).to eq 2.0 + + metric = prom_client.metrics.find { |m| m.name == :cc_vitals_num_cores } + expect(metric.get).to eq 4 + end + end + + describe '#update_log_counts' do + it 'updates log counts' do + counts = { + off: 1, + fatal: 2, + error: 3, + warn: 4, + info: 5, + debug: 6, + debug1: 7, + debug2: 8, + all: 9 + } + + updater.update_log_counts(counts) + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_off } + expect(metric.get).to eq 1 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_fatal } + expect(metric.get).to eq 2 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_error } + expect(metric.get).to eq 3 + + metric = prom_client.metrics.find { |m| m.name 
== :cc_log_count_warn } + expect(metric.get).to eq 4 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_info } + expect(metric.get).to eq 5 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_debug } + expect(metric.get).to eq 6 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_debug1 } + expect(metric.get).to eq 7 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_debug2 } + expect(metric.get).to eq 8 + + metric = prom_client.metrics.find { |m| m.name == :cc_log_count_all } + expect(metric.get).to eq 9 + end + end + + describe '#update_task_stats' do + it 'records the number of running tasks and task memory' do + updater.update_task_stats(5, 512) + + metric = prom_client.metrics.find { |m| m.name == :cc_tasks_running_count } + expect(metric.get).to eq 5 + + metric = prom_client.metrics.find { |m| m.name == :cc_tasks_running_memory_in_mb } + expect(metric.get).to eq 512 + end + end + + describe '#update_synced_invalid_lrps' do + it 'records the number of synced invalid desired LRPs' do + updater.update_synced_invalid_lrps(5) + metric = prom_client.metrics.find { |m| m.name == :cc_diego_sync_invalid_desired_lrps } + expect(metric.get).to eq 5 + end + end + + describe '#start_staging_request_received' do + it 'increments "cc_staging_requested"' do + updater.start_staging_request_received + + metric = prom_client.metrics.find { |m| m.name == :cc_staging_requested } + expect(metric.get).to eq 1 + + updater.start_staging_request_received + + metric = prom_client.metrics.find { |m| m.name == :cc_staging_requested } + expect(metric.get).to eq 2 + end + end + + describe '#report_staging_success_metrics' do + it 'records staging success metrics' do + duration_ns = 20 * 1e9 + + updater.report_staging_success_metrics(duration_ns) + metric = prom_client.metrics.find { |m| m.name == :cc_staging_succeeded } + expect(metric.get).to eq 1 + + metric = prom_client.metrics.find { |m| m.name == 
:cc_staging_succeeded_duration } + # expected buckets for duration, in millis : 10000, 15000, 20000, 25000, 30000 + expect(metric.get).to eq({ '10000.0' => 0, '15000.0' => 0, '20000.0' => 1, '25000.0' => 1, '30000.0' => 1, 'sum' => 20000, '+Inf' => 1 }) + end + end + + describe '#report_staging_failure_metrics' do + it 'emits staging failure metrics' do + duration_ns = 20 * 1e9 + + updater.report_staging_failure_metrics(duration_ns) + metric = prom_client.metrics.find { |m| m.name == :cc_staging_failed } + expect(metric.get).to eq 1 + + metric = prom_client.metrics.find { |m| m.name == :cc_staging_failed_duration } + # expected buckets for duration, in millis : 10000, 15000, 20000, 25000, 30000 + expect(metric.get).to eq({ '10000.0' => 0, '15000.0' => 0, '20000.0' => 1, '25000.0' => 1, '30000.0' => 1, 'sum' => 20000, '+Inf' => 1 }) + end + end + + describe '#report_diego_cell_sync_duration' do + it 'reports diego cell sync duration' do + duration_ns = 20 * 1e9 + + updater.report_diego_cell_sync_duration(duration_ns) + metric = prom_client.metrics.find { |m| m.name == :cc_diego_sync_duration } + expect(metric.get).to eq({ 'count' => 1.0, 'sum' => 20000000000.0 }) + + metric = prom_client.metrics.find { |m| m.name == :cc_diego_sync_duration_gauge } + expect(metric.get).to eq duration_ns + end + end + + describe '#report_deployment_duration' do + it 'reports deployments update duration' do + duration_ns = 20 * 1e9 + + updater.report_deployment_duration(duration_ns) + metric = prom_client.metrics.find { |m| m.name == :cc_deployments_update_duration } + expect(metric.get).to eq({ 'count' => 1.0, 'sum' => 20000000000.0 }) + + metric = prom_client.metrics.find { |m| m.name == :cc_deployments_update_duration_gauge } + expect(metric.get).to eq duration_ns + end + end + end +end diff --git a/spec/unit/lib/cloud_controller/metrics/request_metrics_spec.rb b/spec/unit/lib/cloud_controller/metrics/request_metrics_spec.rb index efdf1ec9e8a..b3850633b03 100644 --- 
a/spec/unit/lib/cloud_controller/metrics/request_metrics_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/request_metrics_spec.rb @@ -4,7 +4,14 @@ module VCAP::CloudController::Metrics RSpec.describe RequestMetrics do let(:statsd_client) { double(:statsd_client) } - let!(:request_metrics) { RequestMetrics.new(statsd_client) } # TODO: probably doesn't need to be a let!, just a let + let(:prometheus_client) { double(:prometheus_client) } + let!(:request_metrics) { RequestMetrics.new(statsd_client, prometheus_client) } # TODO: probably doesn't need to be a let!, just a let + + before do + allow(prometheus_client).to receive(:update_gauge_metric) + allow(prometheus_client).to receive(:decrement_gauge_metric) + allow(prometheus_client).to receive(:increment_gauge_metric) + end describe '#start_request' do before do @@ -17,6 +24,8 @@ module VCAP::CloudController::Metrics expect(statsd_client).to have_received(:gauge).with('cc.requests.outstanding.gauge', 1) expect(statsd_client).to have_received(:increment).with('cc.requests.outstanding') + expect(prometheus_client).to have_received(:update_gauge_metric).with(:cc_requests_outstanding_gauge, 1, kind_of(String)) + expect(prometheus_client).to have_received(:increment_gauge_metric).with(:cc_requests_outstanding, kind_of(String)) end end @@ -38,17 +47,25 @@ module VCAP::CloudController::Metrics expect(batch).to have_received(:decrement).with('cc.requests.outstanding') expect(batch).to have_received(:increment).with('cc.requests.completed') expect(batch).to have_received(:increment).with('cc.http_status.2XX') + + expect(prometheus_client).to have_received(:update_gauge_metric).with(:cc_requests_outstanding_gauge, -1, kind_of(String)) + expect(prometheus_client).to have_received(:decrement_gauge_metric).with(:cc_requests_outstanding, kind_of(String)) + expect(prometheus_client).to have_received(:increment_gauge_metric).with(:cc_requests_completed, kind_of(String)) + expect(prometheus_client).to 
have_received(:increment_gauge_metric).with(:cc_http_status_2XX, kind_of(String)) end it 'normalizes http status codes in statsd' do request_metrics.complete_request(200) expect(batch).to have_received(:increment).with('cc.http_status.2XX') + expect(prometheus_client).to have_received(:increment_gauge_metric).with(:cc_http_status_2XX, kind_of(String)) request_metrics.complete_request(300) expect(batch).to have_received(:increment).with('cc.http_status.3XX') + expect(prometheus_client).to have_received(:increment_gauge_metric).with(:cc_http_status_3XX, kind_of(String)) request_metrics.complete_request(400) expect(batch).to have_received(:increment).with('cc.http_status.4XX') + expect(prometheus_client).to have_received(:increment_gauge_metric).with(:cc_http_status_4XX, kind_of(String)) end end end diff --git a/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb index 4bf89be77eb..da35b23b575 100644 --- a/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb @@ -20,7 +20,7 @@ module VCAP::CloudController::Metrics end end - describe '#record_user_count' do + describe '#update_user_count' do before do allow(statsd_client).to receive(:gauge) end @@ -28,7 +28,7 @@ module VCAP::CloudController::Metrics it 'emits number of users to statsd' do expected_user_count = 5 - updater.record_user_count(expected_user_count) + updater.update_user_count(expected_user_count) expect(statsd_client).to have_received(:gauge).with('cc.total_users', expected_user_count) end