Skip to content

Commit

Permalink
Revert "Revert "Creates metrics endpoint for use by prom-scraper""
Browse files Browse the repository at this point in the history
This can be re-added once cloudfoundry/cf-deployment#970 is accepted

This reverts commit 8415699.
  • Loading branch information
MerricdeLauney committed Jun 28, 2022
1 parent 7cb7e88 commit cb6485a
Show file tree
Hide file tree
Showing 20 changed files with 820 additions and 55 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ gem 'sequel_pg', require: 'sequel'
gem 'sinatra', '~> 2.2', '>= 2.2.0'
gem 'sinatra-contrib'
gem 'statsd-ruby', '~> 1.4.0'
gem 'prometheus-client'
gem 'steno'
gem 'talentbox-delayed_job_sequel', '~> 4.3.0'
gem 'thin'
Expand Down
4 changes: 3 additions & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ GEM
ast (~> 2.4.1)
pg (1.4.1)
posix-spawn (0.3.15)
prometheus-client (3.0.0)
protobuf (3.6.12)
activesupport (>= 3.2)
middleware
Expand Down Expand Up @@ -600,6 +601,7 @@ DEPENDENCIES
parallel_tests
pg
posix-spawn (~> 0.3.15)
prometheus-client
protobuf (= 3.6.12)
pry-byebug
psych (>= 4.0.4)
Expand Down Expand Up @@ -640,4 +642,4 @@ DEPENDENCIES
yajl-ruby

BUNDLED WITH
2.1.4
2.2.26
25 changes: 25 additions & 0 deletions app/controllers/internal/metrics_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require 'prometheus/client'
require 'prometheus/client/formats/text'
require 'cloud_controller/metrics/prometheus_updater'

module VCAP::CloudController
  module Internal
    # Internal controller that serves the contents of the Prometheus client
    # registry in the Prometheus text exposition format.
    class MetricsController < RestController::BaseController
      allow_unauthenticated_access
      get '/internal/v4/metrics', :index

      # Runs a one-off metrics refresh through a PeriodicUpdater (feeding both
      # the statsd and the Prometheus updaters) and then renders the registry.
      #
      # @return [Array] two-element array of the 200 status code and the
      #   registry marshalled as Prometheus text
      def index
        started_at = Time.now.utc
        log_counter = Steno::Sink::Counter.new
        api_logger = Steno.logger('cc.api')
        updaters = [
          VCAP::CloudController::Metrics::StatsdUpdater.new,
          VCAP::CloudController::Metrics::PrometheusUpdater.new
        ]

        # Refresh every metric right before rendering the registry.
        VCAP::CloudController::Metrics::PeriodicUpdater.new(started_at, log_counter, api_logger, updaters).update!

        body = Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)
        [200, body]
      end
    end
  end
end
6 changes: 6 additions & 0 deletions app/controllers/internal/staging_completion_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,21 @@ def report_metrics(bbs_staging_response)
duration = Time.now.utc.to_i * 1e9 - bbs_staging_response[:created_at]
if bbs_staging_response[:failed]
statsd_updater.report_staging_failure_metrics(duration)
prometheus_updater.report_staging_failure_metrics(duration)
else
statsd_updater.report_staging_success_metrics(duration)
prometheus_updater.report_staging_success_metrics(duration)
end
end

# Returns the memoized StatsdUpdater for this object, building it on first use.
def statsd_updater
  return @statsd_updater if @statsd_updater

  @statsd_updater = VCAP::CloudController::Metrics::StatsdUpdater.new
end

# Returns the memoized PrometheusUpdater for this object, building it on first use.
def prometheus_updater
  return @prometheus_updater if @prometheus_updater

  # this should be using singleton
  @prometheus_updater = VCAP::CloudController::Metrics::PrometheusUpdater.new
end

attr_reader :stagers

def read_body
Expand Down
17 changes: 15 additions & 2 deletions app/jobs/diego/sync.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,28 @@ module VCAP::CloudController
module Jobs
module Diego
class Sync < VCAP::CloudController::Jobs::CCJob
def initialize(statsd=Statsd.new)
def initialize(statsd=Statsd.new, prometheus_updater=VCAP::CloudController::Metrics::PrometheusUpdater.new)
@statsd = statsd
@prometheus_updater = prometheus_updater
end

def perform
config = CloudController::DependencyLocator.instance.config
@statsd.time('cc.diego_sync.duration') do
begin
## TODO: At some point in the future, start using a monotonic time source, rather than wall-clock time!
start = Time.now
VCAP::CloudController::Diego::ProcessesSync.new(config: config).sync
VCAP::CloudController::Diego::TasksSync.new(config: config).sync
ensure
finish = Time.now
## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have
## access to time in milliseconds. If you ever get access to reliable time in
## milliseconds, then do know that the lack of precision here is not desired
## so feed in the entire value!
elapsed_ms = ((finish - start) * 1000).round

@statsd.timing('cc.diego_sync.duration', elapsed_ms)
@prometheus_updater.report_diego_cell_sync_duration(elapsed_ms)
end
end

Expand Down
8 changes: 8 additions & 0 deletions lib/cloud_controller/dependency_locator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
require 'cloud_controller/opi/instances_client'
require 'cloud_controller/opi/stager_client'
require 'cloud_controller/opi/task_client'
require 'cloud_controller/metrics/prometheus_updater'

require 'bits_service_client'

Expand Down Expand Up @@ -70,6 +71,13 @@ def runners
@dependencies[:runners] || register(:runners, VCAP::CloudController::Runners.new(config))
end

# Returns the registered PrometheusUpdater dependency, registering a new
# instance first when none is stored under :prometheus_updater yet.
def prometheus_updater
  key = :prometheus_updater
  register(key, VCAP::CloudController::Metrics::PrometheusUpdater.new) if !@dependencies[key]

  # Read back from the store rather than relying on register's return value.
  @dependencies[key]
end

def stagers
@dependencies[:stagers] || register(:stagers, VCAP::CloudController::Stagers.new(config))
end
Expand Down
18 changes: 13 additions & 5 deletions lib/cloud_controller/deployment_updater/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ def start
with_error_logging('cc.deployment_updater') do
config = CloudController::DependencyLocator.instance.config
statsd_client = CloudController::DependencyLocator.instance.statsd_client
prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater

update_step = proc { update(
update_frequency: config.get(:deployment_updater, :update_frequency_in_seconds),
statsd_client: statsd_client
statsd_client: statsd_client,
prometheus_updater: prometheus_updater
)
}

Expand All @@ -39,14 +41,20 @@ def start

private

def update(update_frequency:, statsd_client:)
def update(update_frequency:, statsd_client:, prometheus_updater:)
logger = Steno.logger('cc.deployment_updater.scheduler')

update_start_time = Time.now
statsd_client.time('cc.deployments.update.duration') do
Dispatcher.dispatch
end
Dispatcher.dispatch
update_duration = Time.now - update_start_time
## NOTE: We're taking time in seconds and multiplying by 1000 because we don't have
## access to time in milliseconds. If you ever get access to reliable time in
## milliseconds, then do know that the lack of precision here is not desired
## so feed in the entire value!
update_duration_ms = update_duration * 1000
statsd_client.timing('cc.deployments.update.duration', update_duration_ms)
prometheus_updater.report_deployment_duration(update_duration_ms)

logger.info("Update loop took #{update_duration}s")

sleep_duration = update_frequency - update_duration
Expand Down
10 changes: 5 additions & 5 deletions lib/cloud_controller/metrics/periodic_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

module VCAP::CloudController::Metrics
class PeriodicUpdater
def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new])
def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new, PrometheusUpdater.new])
@start_time = start_time
@updaters = updaters
@log_counter = log_counter
Expand All @@ -15,7 +15,7 @@ def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpd

def setup_updates
update!
EM.add_periodic_timer(600) { catch_error { record_user_count } }
EM.add_periodic_timer(600) { catch_error { update_user_count } }
EM.add_periodic_timer(30) { catch_error { update_job_queue_length } }
EM.add_periodic_timer(30) { catch_error { update_thread_info } }
EM.add_periodic_timer(30) { catch_error { update_failed_job_count } }
Expand All @@ -26,7 +26,7 @@ def setup_updates
end

def update!
record_user_count
update_user_count
update_job_queue_length
update_thread_info
update_failed_job_count
Expand Down Expand Up @@ -67,10 +67,10 @@ def update_deploying_count
@updaters.each { |u| u.update_deploying_count(deploying_count) }
end

def record_user_count
def update_user_count
user_count = VCAP::CloudController::User.count

@updaters.each { |u| u.record_user_count(user_count) }
@updaters.each { |u| u.update_user_count(user_count) }
end

def update_job_queue_length
Expand Down
143 changes: 143 additions & 0 deletions lib/cloud_controller/metrics/prometheus_updater.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
require 'prometheus/client'

module VCAP::CloudController::Metrics
# Publishes Cloud Controller metrics into a Prometheus client registry.
#
# Metrics are registered lazily: the first time a metric name is reported it
# is created in the registry with the given docstring, and every later report
# looks the existing metric up and updates it.
class PrometheusUpdater
  # @param registry [#exist?, #get, #gauge, #counter, #histogram, #summary]
  #   registry that stores the metrics; defaults to Prometheus::Client.registry
  def initialize(registry=Prometheus::Client.registry)
    @registry = registry
  end

  # Sets the gauge +metric+ to +value+, registering it with +message+ as its
  # docstring when it does not exist yet.
  #
  # @param metric [Symbol] metric name
  # @param value [Numeric] value to set
  # @param message [String] docstring used on first registration
  def update_gauge_metric(metric, value, message)
    fetch_or_register(metric) { @registry.gauge(metric, docstring: message) }.set(value)
  end

  # Increments the gauge +metric+, registering it first when needed.
  #
  # @param metric [Symbol] metric name
  # @param message [String] docstring used on first registration
  def increment_gauge_metric(metric, message)
    fetch_or_register(metric) { @registry.gauge(metric, docstring: message) }.increment
  end

  # Decrements the gauge +metric+, registering it first when needed.
  #
  # @param metric [Symbol] metric name
  # @param message [String] docstring used on first registration
  def decrement_gauge_metric(metric, message)
    fetch_or_register(metric) { @registry.gauge(metric, docstring: message) }.decrement
  end

  # Increments the counter +metric+, registering it first when needed.
  #
  # @param metric [Symbol] metric name
  # @param message [String] docstring used on first registration
  def increment_counter_metric(metric, message)
    fetch_or_register(metric) { @registry.counter(metric, docstring: message) }.increment
  end

  # Observes +value+ on the histogram +metric+, registering it with
  # +buckets+ first when needed.
  #
  # @param metric [Symbol] metric name
  # @param value [Numeric] observation
  # @param message [String] docstring used on first registration
  # @param buckets [Array<Numeric>] bucket boundaries used on first registration
  def update_histogram_metric(metric, value, message, buckets)
    fetch_or_register(metric) { @registry.histogram(metric, buckets: buckets, docstring: message) }.observe(value)
  end

  # Observes +value+ on the summary +metric+, registering it first when needed.
  #
  # @param metric [Symbol] metric name
  # @param value [Numeric] observation
  # @param message [String] docstring used on first registration
  def update_summary_metric(metric, value, message)
    fetch_or_register(metric) { @registry.summary(metric, docstring: message) }.observe(value)
  end

  def update_deploying_count(deploying_count)
    update_gauge_metric(:cc_deployments_deploying, deploying_count, 'Number of in progress deployments')
  end

  def update_user_count(user_count)
    update_gauge_metric(:cc_total_users, user_count, 'Number of users')
  end

  # Reports one gauge per queue plus a gauge for the overall total.
  #
  # @param pending_job_count_by_queue [Hash] queue name => pending job count
  # @param total [Numeric] total number of pending jobs
  def update_job_queue_length(pending_job_count_by_queue, total)
    pending_job_count_by_queue.each do |key, value|
      metric_key = :"cc_job_queue_length_#{key.to_s.underscore}"
      # The docstring must be passed as a plain String (third positional
      # argument); passing `docstring: ...` here would hand a Hash to the registry.
      update_gauge_metric(metric_key, value, "Job queue length for worker #{key}")
    end

    update_gauge_metric(:cc_job_queue_length_total, total, 'Total job queue length')
  end

  # Reports the thread count and the EventMachine connection/queue gauges.
  #
  # @param thread_info [Hash] expects :thread_count and an :event_machine hash
  #   with :connection_count, :threadqueue and :resultqueue entries
  def update_thread_info(thread_info)
    event_machine = thread_info[:event_machine]
    update_gauge_metric(:cc_thread_info_thread_count, thread_info[:thread_count], 'Thread count')
    update_gauge_metric(:cc_thread_info_event_machine_connection_count, event_machine[:connection_count], 'Event Machine connection count')

    threadqueue = event_machine[:threadqueue]
    update_gauge_metric(:cc_thread_info_event_machine_threadqueue_size, threadqueue[:size], 'EventMachine thread queue size')
    update_gauge_metric(:cc_thread_info_event_machine_threadqueue_num_waiting, threadqueue[:num_waiting], 'EventMachine num waiting in thread')

    resultqueue = event_machine[:resultqueue]
    update_gauge_metric(:cc_thread_info_event_machine_resultqueue_size, resultqueue[:size], 'EventMachine queue size')
    update_gauge_metric(:cc_thread_info_event_machine_resultqueue_num_waiting, resultqueue[:num_waiting], 'EventMachine requests waiting in queue')
  end

  # Reports one gauge per queue plus a gauge for the overall total.
  #
  # @param failed_jobs_by_queue [Hash] queue name => failed job count
  # @param total [Numeric] total number of failed jobs
  def update_failed_job_count(failed_jobs_by_queue, total)
    failed_jobs_by_queue.each do |key, value|
      metric_key = :"cc_failed_job_count_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, "Failed jobs for worker #{key}")
    end

    update_gauge_metric(:cc_failed_job_count_total, total, 'Total failed jobs')
  end

  # @param vitals [Hash] vital name => value; each entry becomes a cc_vitals_* gauge
  def update_vitals(vitals)
    vitals.each do |key, value|
      metric_key = :"cc_vitals_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, "CloudController Vitals: #{key}")
    end
  end

  # @param counts [Hash] log level => count; each entry becomes a cc_log_count_* gauge
  def update_log_counts(counts)
    counts.each do |key, value|
      metric_key = :"cc_log_count_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, "Log count for log level '#{key}'")
    end
  end

  def update_task_stats(total_running_tasks, total_memory_in_mb)
    update_gauge_metric(:cc_tasks_running_count, total_running_tasks, 'Total running tasks')
    update_gauge_metric(:cc_tasks_running_memory_in_mb, total_memory_in_mb, 'Total memory consumed by running tasks')
  end

  def update_synced_invalid_lrps(lrp_count)
    update_gauge_metric(:cc_diego_sync_invalid_desired_lrps, lrp_count, 'Invalid Desired LRPs')
  end

  def start_staging_request_received
    increment_counter_metric(:cc_staging_requested, 'Number of staging requests')
  end

  # Counts a successful staging and records its duration (converted to whole
  # milliseconds) in a histogram.
  #
  # @param duration_ns [Numeric] staging duration in nanoseconds
  def report_staging_success_metrics(duration_ns)
    increment_counter_metric(:cc_staging_succeeded, 'Number of successful staging events')
    update_histogram_metric(:cc_staging_succeeded_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of successful staging events', duration_buckets)
  end

  # Counts a failed staging and records its duration (converted to whole
  # milliseconds) in a histogram.
  #
  # @param duration_ns [Numeric] staging duration in nanoseconds
  def report_staging_failure_metrics(duration_ns)
    increment_counter_metric(:cc_staging_failed, 'Number of failed staging events')
    update_histogram_metric(:cc_staging_failed_duration, nanoseconds_to_milliseconds(duration_ns), 'Durations of failed staging events', duration_buckets)
  end

  # Records the sync duration both as a summary observation and as a gauge.
  #
  # @param duration_ms [Numeric] duration in milliseconds
  def report_diego_cell_sync_duration(duration_ms)
    update_summary_metric(:cc_diego_sync_duration, duration_ms, 'Diego cell sync duration')
    update_gauge_metric(:cc_diego_sync_duration_gauge, duration_ms, 'Diego cell sync duration (gauge metric)')
  end

  # Records the update duration both as a summary observation and as a gauge.
  #
  # @param duration_ms [Numeric] duration in milliseconds
  def report_deployment_duration(duration_ms)
    update_summary_metric(:cc_deployments_update_duration, duration_ms, 'Deployment duration')
    update_gauge_metric(:cc_deployments_update_duration_gauge, duration_ms, 'Deployment duration (gauge metric)')
  end

  private

  # Returns the metric stored under +metric+, yielding to register it first
  # when the registry does not know the name yet.
  def fetch_or_register(metric)
    yield unless @registry.exist?(metric)
    @registry.get(metric)
  end

  # Bucket boundaries for the staging duration histograms: five linear
  # buckets starting at 10000 with a width of 5000.
  def duration_buckets
    Prometheus::Client::Histogram.linear_buckets(start: 10000, width: 5000, count: 5)
  end

  # Converts nanoseconds to milliseconds, truncating to an Integer.
  def nanoseconds_to_milliseconds(time_ns)
    (time_ns / 1e6).to_i
  end
end
end
15 changes: 13 additions & 2 deletions lib/cloud_controller/metrics/request_metrics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,36 @@
module VCAP::CloudController
module Metrics
class RequestMetrics
def initialize(statsd=Statsd.new)
def initialize(statsd=Statsd.new, prometheus_updater=PrometheusUpdater.new)
@counter = 0
@statsd = statsd
@prometheus_updater = prometheus_updater
end

# Records the start of a request: bumps the in-memory outstanding counter and
# reports the new state to statsd and then to the Prometheus updater.
def start_request
  outstanding = (@counter += 1)

  @statsd.gauge('cc.requests.outstanding.gauge', outstanding)
  @statsd.increment('cc.requests.outstanding')

  @prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, outstanding, 'Requests Outstanding Gauge')
  @prometheus_updater.increment_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding')
end

def complete_request(status)
http_status_code = "#{status.to_s[0]}XX"
http_status_metric = "cc.http_status.#{http_status_code}"
@counter -= 1
@statsd.gauge('cc.requests.outstanding.gauge', @counter)
@statsd.batch do |batch|
batch.decrement 'cc.requests.outstanding'
batch.increment 'cc.requests.completed'
batch.increment "cc.http_status.#{status.to_s[0]}XX"
batch.increment http_status_metric
end

@prometheus_updater.update_gauge_metric(:cc_requests_outstanding_gauge, @counter, 'Requests Outstanding Gauge')
@prometheus_updater.decrement_gauge_metric(:cc_requests_outstanding, 'Requests Outstanding')
@prometheus_updater.increment_gauge_metric(:cc_requests_completed, 'Requests Completed')
@prometheus_updater.increment_gauge_metric(http_status_metric.gsub('.', '_').to_sym, "Times HTTP status #{http_status_code} have been received")
end
end
end
Expand Down
Loading

0 comments on commit cb6485a

Please sign in to comment.