From 77ab62856936a15efadba011855b9fc946b51108 Mon Sep 17 00:00:00 2001 From: Jay Mundrawala Date: Wed, 23 Aug 2017 17:17:15 -0500 Subject: [PATCH] Wire up basic stats endpoint Currently returns some basic erlang vm statistics in the prometheus text format. This is the first step towards RFC #93. Basic auth support is present out of the box. The user can edit the ['opscode-erchef']['stats_auth_enable'] flag to change this. The actual password is stored in opscode_erchef.stats_password in chef veil. A default password is generated if the user does not specify one before running chef-server-ctl reconfigure. The end-point is accessible at /_stats. You can use ?format=json or ?format=text to access the available metric in json or prometheus formats. The stats returned are documented at https://github.com/chef/chef-server/issues/1385. ``` # TYPE erlang_vm_memory_atom_bytes_total gauge # HELP erlang_vm_memory_atom_bytes_total The total amount of memory currently allocated for atoms. This memory is part of the memory presented as system memory. erlang_vm_memory_atom_bytes_total{usage="used"} 1455697 erlang_vm_memory_atom_bytes_total{usage="free"} 12848 # TYPE erlang_vm_memory_bytes_total gauge # HELP erlang_vm_memory_bytes_total The total amount of memory currently allocated. This is the same as the sum of the memory size for processes and system. erlang_vm_memory_bytes_total{kind="system"} 45984768 erlang_vm_memory_bytes_total{kind="processes"} 35098912 # TYPE erlang_vm_dets_tables gauge # HELP erlang_vm_dets_tables Erlang VM DETS Tables count erlang_vm_dets_tables 0 # TYPE erlang_vm_ets_tables gauge # HELP erlang_vm_ets_tables Erlang VM ETS Tables count erlang_vm_ets_tables 64 # TYPE erlang_vm_memory_processes_bytes_total gauge # HELP erlang_vm_memory_processes_bytes_total The total amount of memory currently allocated for the Erlang processes. 
erlang_vm_memory_processes_bytes_total{usage="used"} 35091312 erlang_vm_memory_processes_bytes_total{usage="free"} 7600 # TYPE erlang_vm_memory_system_bytes_total gauge # HELP erlang_vm_memory_system_bytes_total The total amount of memory currently allocated for the emulator that is not directly related to any Erlang process. Memory presented as processes is not included in this memory. erlang_vm_memory_system_bytes_total{usage="atom"} 1468545 erlang_vm_memory_system_bytes_total{usage="binary"} 803360 erlang_vm_memory_system_bytes_total{usage="code"} 33479913 erlang_vm_memory_system_bytes_total{usage="ets"} 2372400 erlang_vm_memory_system_bytes_total{usage="other"} 7860550 # TYPE erlang_vm_statistics_context_switches counter # HELP erlang_vm_statistics_context_switches Total number of context switches since the system started erlang_vm_statistics_context_switches 29549971 # TYPE erlang_vm_statistics_garbage_collection_number_of_gcs counter # HELP erlang_vm_statistics_garbage_collection_number_of_gcs Garbage collection: number of GCs erlang_vm_statistics_garbage_collection_number_of_gcs 1632986 # TYPE erlang_vm_statistics_garbage_collection_words_reclaimed counter # HELP erlang_vm_statistics_garbage_collection_words_reclaimed Garbage collection: words reclaimed erlang_vm_statistics_garbage_collection_words_reclaimed 12664754817 # TYPE erlang_vm_statistics_garbage_collection_bytes_reclaimed counter # HELP erlang_vm_statistics_garbage_collection_bytes_reclaimed Garbage collection: bytes reclaimed erlang_vm_statistics_garbage_collection_bytes_reclaimed 101318038536 # TYPE erlang_vm_statistics_bytes_received_total counter # HELP erlang_vm_statistics_bytes_received_total Total number of bytes received through ports erlang_vm_statistics_bytes_received_total 795724439 # TYPE erlang_vm_statistics_bytes_output_total counter # HELP erlang_vm_statistics_bytes_output_total Total number of bytes output to ports erlang_vm_statistics_bytes_output_total 604157373 # TYPE 
erlang_vm_statistics_reductions_total counter # HELP erlang_vm_statistics_reductions_total Total reductions erlang_vm_statistics_reductions_total 4946085703 # TYPE erlang_vm_statistics_run_queues_length_total gauge # HELP erlang_vm_statistics_run_queues_length_total Total length of the run-queues erlang_vm_statistics_run_queues_length_total 0 # TYPE erlang_vm_statistics_runtime_milliseconds counter # HELP erlang_vm_statistics_runtime_milliseconds The sum of the runtime for all threads in the Erlang runtime system. Can be greater than wall clock time erlang_vm_statistics_runtime_milliseconds 573750 # TYPE erlang_vm_statistics_wallclock_time_milliseconds counter # HELP erlang_vm_statistics_wallclock_time_milliseconds Information about wall clock. Same as erlang_vm_statistics_runtime_milliseconds except that real time is measured erlang_vm_statistics_wallclock_time_milliseconds 5495986 # TYPE erlang_vm_ets_limit gauge # HELP erlang_vm_ets_limit The maximum number of ETS tables allowed. erlang_vm_ets_limit 2053 # TYPE erlang_vm_logical_processors gauge # HELP erlang_vm_logical_processors The detected number of logical processors configured in the system. erlang_vm_logical_processors 4 # TYPE erlang_vm_logical_processors_available gauge # HELP erlang_vm_logical_processors_available The detected number of logical processors available to the Erlang runtime system. erlang_vm_logical_processors_available 4 # TYPE erlang_vm_logical_processors_online gauge # HELP erlang_vm_logical_processors_online The detected number of logical processors online on the system. erlang_vm_logical_processors_online 4 # TYPE erlang_vm_port_count gauge # HELP erlang_vm_port_count The number of ports currently existing at the local node. erlang_vm_port_count 61 # TYPE erlang_vm_port_limit gauge # HELP erlang_vm_port_limit The maximum number of simultaneously existing ports at the local node. 
erlang_vm_port_limit 65536 # TYPE erlang_vm_process_count gauge # HELP erlang_vm_process_count The number of processes currently existing at the local node. erlang_vm_process_count 395 # TYPE erlang_vm_process_limit gauge # HELP erlang_vm_process_limit The maximum number of simultaneously existing processes at the local node. erlang_vm_process_limit 262144 # TYPE erlang_vm_schedulers gauge # HELP erlang_vm_schedulers The number of scheduler threads used by the emulator. erlang_vm_schedulers 4 # TYPE erlang_vm_schedulers_online gauge # HELP erlang_vm_schedulers_online The number of schedulers online. erlang_vm_schedulers_online 4 # TYPE erlang_vm_smp_support untyped # HELP erlang_vm_smp_support 1 if the emulator has been compiled with SMP support, otherwise 0. erlang_vm_smp_support 1 # TYPE erlang_vm_threads untyped # HELP erlang_vm_threads 1 if the emulator has been compiled with thread support, otherwise 0. erlang_vm_threads 1 # TYPE erlang_vm_thread_pool_size gauge # HELP erlang_vm_thread_pool_size The number of async threads in the async thread pool used for asynchronous driver calls. erlang_vm_thread_pool_size 10 # TYPE erlang_vm_time_correction untyped # HELP erlang_vm_time_correction 1 if time correction is enabled, otherwise 0. erlang_vm_time_correction 1 # TYPE erchef_pooler_members_in_use gauge # HELP erchef_pooler_members_in_use Number of pool members currently being used. erchef_pooler_members_in_use{pool_name="sqerl"} 0 erchef_pooler_members_in_use{pool_name="oc_chef_authz_http"} 0 erchef_pooler_members_in_use{pool_name="chef_index_http"} 0 erchef_pooler_members_in_use{pool_name="chef_depsolver"} 0 # TYPE erchef_pooler_members_free gauge # HELP erchef_pooler_members_free Number of pool members currently available. 
erchef_pooler_members_free{pool_name="sqerl"} 20 erchef_pooler_members_free{pool_name="oc_chef_authz_http"} 25 erchef_pooler_members_free{pool_name="chef_index_http"} 25 erchef_pooler_members_free{pool_name="chef_depsolver"} 5 # TYPE erchef_pooler_members_max gauge # HELP erchef_pooler_members_max Max number of pool members allowed in the pool. erchef_pooler_members_max{pool_name="sqerl"} 20 erchef_pooler_members_max{pool_name="oc_chef_authz_http"} 100 erchef_pooler_members_max{pool_name="chef_index_http"} 100 erchef_pooler_members_max{pool_name="chef_depsolver"} 5 # TYPE erchef_pooler_queued_requestors gauge # HELP erchef_pooler_queued_requestors Number of requestors blocking to take a pool member. erchef_pooler_queued_requestors{pool_name="sqerl"} 0 erchef_pooler_queued_requestors{pool_name="oc_chef_authz_http"} 0 erchef_pooler_queued_requestors{pool_name="chef_index_http"} 0 erchef_pooler_queued_requestors{pool_name="chef_depsolver"} 0 # TYPE erchef_pooler_queued_requestors_max gauge # HELP erchef_pooler_queued_requestors_max Max number of requestors allowed to block on taking pool member. 
erchef_pooler_queued_requestors_max{pool_name="sqerl"} 20 erchef_pooler_queued_requestors_max{pool_name="oc_chef_authz_http"} 50 erchef_pooler_queued_requestors_max{pool_name="chef_index_http"} 50 erchef_pooler_queued_requestors_max{pool_name="chef_depsolver"} 50 # TYPE pg_stat_seq_scan counter # HELP pg_stat_seq_scan Number of sequential scans initiated pg_stat_seq_scan 5047095 # TYPE pg_stat_seq_tup_read counter # HELP pg_stat_seq_tup_read Number of live rows fetched by sequential scans pg_stat_seq_tup_read 415028001 # TYPE pg_stat_idx_scan counter # HELP pg_stat_idx_scan Number of index scans initiated pg_stat_idx_scan 23751732 # TYPE pg_stat_tup_fetch counter # HELP pg_stat_tup_fetch Number of live rows fetched by index scans pg_stat_tup_fetch 25953870 # TYPE pg_stat_n_tup_ins counter # HELP pg_stat_n_tup_ins Number of rows inserted pg_stat_n_tup_ins 52452 # TYPE pg_stat_n_tup_upd counter # HELP pg_stat_n_tup_upd Number of rows updated pg_stat_n_tup_upd 73461 # TYPE pg_stat_n_tup_del counter # HELP pg_stat_n_tup_del Number of rows deleted pg_stat_n_tup_del 31020 # TYPE pg_stat_n_live_tup gauge # HELP pg_stat_n_live_tup Estimated number of live rows pg_stat_n_live_tup 92355 # TYPE pg_stat_n_dead_tup gauge # HELP pg_stat_n_dead_tup Estimated number of dead rows pg_stat_n_dead_tup 29046 # TYPE pg_stat_heap_blocks_read counter # HELP pg_stat_heap_blocks_read Number of disk blocks read pg_stat_heap_blocks_read 69231 # TYPE pg_stat_heap_blocks_hit counter # HELP pg_stat_heap_blocks_hit Number of buffer hits pg_stat_heap_blocks_hit 38834925 # TYPE pg_stat_idx_blks_read counter # HELP pg_stat_idx_blks_read Number of disk blocks read from all indexes pg_stat_idx_blks_read 49632 # TYPE pg_stat_idx_blks_hit counter # HELP pg_stat_idx_blks_hit Number of buffer hits in all indexes pg_stat_idx_blks_hit 35615613 # TYPE pg_stat_toast_blks_read counter # HELP pg_stat_toast_blks_read Number of disk blocks read from TOAST tables pg_stat_toast_blks_read 1410 # TYPE 
pg_stat_toast_blks_hit counter # HELP pg_stat_toast_blks_hit Number of buffer hits in TOAST tables pg_stat_toast_blks_hit 2397 # TYPE pg_stat_tidx_blks_read counter # HELP pg_stat_tidx_blks_read Number of disk blocks read from TOAST tables pg_stat_tidx_blks_read 1410 # TYPE pg_stat_tidx_blks_hit counter # HELP pg_stat_tidx_blks_hit Number of buffer hits in TOAST table indexes pg_stat_tidx_blks_hit 3948 ``` Same stats in JSON format: ``` [ { "metrics" : [ { "value" : "1" } ], "type" : "UNTYPED", "name" : "erlang_vm_time_correction", "help" : "1 if time correction is enabled, otherwise 0." }, { "metrics" : [ { "value" : "10" } ], "type" : "GAUGE", "help" : "The number of async threads in the async thread pool used for asynchronous driver calls.", "name" : "erlang_vm_thread_pool_size" }, { "type" : "UNTYPED", "metrics" : [ { "value" : "1" } ], "help" : "1 if the emulator has been compiled with thread support, otherwise 0.", "name" : "erlang_vm_threads" }, { "help" : "1 if the emulator has been compiled with SMP support, otherwise 0.", "name" : "erlang_vm_smp_support", "type" : "UNTYPED", "metrics" : [ { "value" : "1" } ] }, { "help" : "The number of schedulers online.", "name" : "erlang_vm_schedulers_online", "metrics" : [ { "value" : "4" } ], "type" : "GAUGE" }, { "type" : "GAUGE", "metrics" : [ { "value" : "4" } ], "name" : "erlang_vm_schedulers", "help" : "The number of scheduler threads used by the emulator." }, { "name" : "erlang_vm_process_limit", "help" : "The maximum number of simultaneously existing processes at the local node.", "type" : "GAUGE", "metrics" : [ { "value" : "262144" } ] }, { "metrics" : [ { "value" : "395" } ], "type" : "GAUGE", "help" : "The number of processes currently existing at the local node.", "name" : "erlang_vm_process_count" }, { "metrics" : [ { "value" : "65536" } ], "type" : "GAUGE", "name" : "erlang_vm_port_limit", "help" : "The maximum number of simultaneously existing ports at the local node." 
}, { "type" : "GAUGE", "metrics" : [ { "value" : "60" } ], "name" : "erlang_vm_port_count", "help" : "The number of ports currently existing at the local node." }, { "metrics" : [ { "value" : "4" } ], "type" : "GAUGE", "help" : "The detected number of logical processors online on the system.", "name" : "erlang_vm_logical_processors_online" }, { "type" : "GAUGE", "metrics" : [ { "value" : "4" } ], "help" : "The detected number of logical processors available to the Erlang runtime system.", "name" : "erlang_vm_logical_processors_available" }, { "help" : "The detected number of logical processors configured in the system.", "name" : "erlang_vm_logical_processors", "type" : "GAUGE", "metrics" : [ { "value" : "4" } ] }, { "metrics" : [ { "value" : "2053" } ], "type" : "GAUGE", "name" : "erlang_vm_ets_limit", "help" : "The maximum number of ETS tables allowed." }, { "metrics" : [ { "value" : "21716981" } ], "type" : "COUNTER", "help" : "Information about wall clock. Same as erlang_vm_statistics_runtime_milliseconds except that real time is measured", "name" : "erlang_vm_statistics_wallclock_time_milliseconds" }, { "type" : "COUNTER", "metrics" : [ { "value" : "2089510" } ], "name" : "erlang_vm_statistics_runtime_milliseconds", "help" : "The sum of the runtime for all threads in the Erlang runtime system. 
Can be greater than wall clock time" }, { "type" : "GAUGE", "metrics" : [ { "value" : "0" } ], "help" : "Total length of the run-queues", "name" : "erlang_vm_statistics_run_queues_length_total" }, { "type" : "COUNTER", "metrics" : [ { "value" : "16208617229" } ], "name" : "erlang_vm_statistics_reductions_total", "help" : "Total reductions" }, { "name" : "erlang_vm_statistics_bytes_output_total", "help" : "Total number of bytes output to ports", "metrics" : [ { "value" : "2374560576" } ], "type" : "COUNTER" }, { "metrics" : [ { "value" : "2375163718" } ], "type" : "COUNTER", "help" : "Total number of bytes received through ports", "name" : "erlang_vm_statistics_bytes_received_total" }, { "metrics" : [ { "value" : "359118308568" } ], "type" : "COUNTER", "help" : "Garbage collection: bytes reclaimed", "name" : "erlang_vm_statistics_garbage_collection_bytes_reclaimed" }, { "help" : "Garbage collection: words reclaimed", "name" : "erlang_vm_statistics_garbage_collection_words_reclaimed", "type" : "COUNTER", "metrics" : [ { "value" : "44889788571" } ] }, { "help" : "Garbage collection: number of GCs", "name" : "erlang_vm_statistics_garbage_collection_number_of_gcs", "type" : "COUNTER", "metrics" : [ { "value" : "5263551" } ] }, { "name" : "erlang_vm_statistics_context_switches", "help" : "Total number of context switches since the system started", "type" : "COUNTER", "metrics" : [ { "value" : "109981973" } ] }, { "help" : "The total amount of memory currently allocated for the emulator that is not directly related to any Erlang process. 
Memory presented as processes is not included in this memory.", "name" : "erlang_vm_memory_system_bytes_total", "type" : "GAUGE", "metrics" : [ { "value" : "1468545", "labels" : { "usage" : "atom" } }, { "labels" : { "usage" : "binary" }, "value" : "942728" }, { "value" : "33503265", "labels" : { "usage" : "code" } }, { "value" : "2496320", "labels" : { "usage" : "ets" } }, { "value" : "8025046", "labels" : { "usage" : "other" } } ] }, { "help" : "The total amount of memory currently allocated for the Erlang processes.", "name" : "erlang_vm_memory_processes_bytes_total", "type" : "GAUGE", "metrics" : [ { "value" : "24208976", "labels" : { "usage" : "used" } }, { "labels" : { "usage" : "free" }, "value" : "21304" } ] }, { "metrics" : [ { "value" : "66" } ], "type" : "GAUGE", "help" : "Erlang VM ETS Tables count", "name" : "erlang_vm_ets_tables" }, { "name" : "erlang_vm_dets_tables", "help" : "Erlang VM DETS Tables count", "type" : "GAUGE", "metrics" : [ { "value" : "0" } ] }, { "metrics" : [ { "labels" : { "kind" : "system" }, "value" : "46435904" }, { "value" : "24230280", "labels" : { "kind" : "processes" } } ], "type" : "GAUGE", "help" : "The total amount of memory currently allocated. This is the same as the sum of the memory size for processes and system.", "name" : "erlang_vm_memory_bytes_total" }, { "name" : "erlang_vm_memory_atom_bytes_total", "help" : "The total amount of memory currently allocated for atoms. 
This memory is part of the memory presented as system memory.", "type" : "GAUGE", "metrics" : [ { "labels" : { "usage" : "used" }, "value" : "1455537" }, { "value" : "13008", "labels" : { "usage" : "free" } } ] }, { "help" : "Max number of requestors allowed to block on taking pool member.", "name" : "erchef_pooler_queued_requestors_max", "type" : "GAUGE", "metrics" : [ { "value" : "20", "labels" : { "pool_name" : "sqerl" } }, { "value" : "50", "labels" : { "pool_name" : "oc_chef_authz_http" } }, { "labels" : { "pool_name" : "chef_index_http" }, "value" : "50" }, { "value" : "50", "labels" : { "pool_name" : "chef_depsolver" } } ] }, { "metrics" : [ { "value" : "0", "labels" : { "pool_name" : "sqerl" } }, { "value" : "0", "labels" : { "pool_name" : "oc_chef_authz_http" } }, { "labels" : { "pool_name" : "chef_index_http" }, "value" : "0" }, { "value" : "0", "labels" : { "pool_name" : "chef_depsolver" } } ], "type" : "GAUGE", "help" : "Number of requestors blocking to take a pool member.", "name" : "erchef_pooler_queued_requestors" }, { "name" : "erchef_pooler_members_max", "help" : "Max number of pool members allowed in the pool.", "type" : "GAUGE", "metrics" : [ { "labels" : { "pool_name" : "sqerl" }, "value" : "20" }, { "value" : "100", "labels" : { "pool_name" : "oc_chef_authz_http" } }, { "labels" : { "pool_name" : "chef_index_http" }, "value" : "100" }, { "labels" : { "pool_name" : "chef_depsolver" }, "value" : "5" } ] }, { "name" : "erchef_pooler_members_free", "help" : "Number of pool members currently available.", "type" : "GAUGE", "metrics" : [ { "value" : "20", "labels" : { "pool_name" : "sqerl" } }, { "value" : "25", "labels" : { "pool_name" : "oc_chef_authz_http" } }, { "value" : "25", "labels" : { "pool_name" : "chef_index_http" } }, { "labels" : { "pool_name" : "chef_depsolver" }, "value" : "5" } ] }, { "metrics" : [ { "value" : "0", "labels" : { "pool_name" : "sqerl" } }, { "value" : "0", "labels" : { "pool_name" : "oc_chef_authz_http" } }, { "value" : 
"0", "labels" : { "pool_name" : "chef_index_http" } }, { "value" : "0", "labels" : { "pool_name" : "chef_depsolver" } } ], "type" : "GAUGE", "name" : "erchef_pooler_members_in_use", "help" : "Number of pool members currently being used." }, { "name" : "pg_stat_tidx_blks_hit", "help" : "Number of buffer hits in TOAST table indexes", "type" : "COUNTER", "metrics" : [ { "value" : "6486" } ] }, { "help" : "Number of disk blocks read from TOAST tables", "name" : "pg_stat_tidx_blks_read", "type" : "COUNTER", "metrics" : [ { "value" : "1410" } ] }, { "help" : "Number of buffer hits in TOAST tables", "name" : "pg_stat_toast_blks_hit", "metrics" : [ { "value" : "3666" } ], "type" : "COUNTER" }, { "help" : "Number of disk blocks read from TOAST tables", "name" : "pg_stat_toast_blks_read", "type" : "COUNTER", "metrics" : [ { "value" : "1410" } ] }, { "name" : "pg_stat_idx_blks_hit", "help" : "Number of buffer hits in all indexes", "type" : "COUNTER", "metrics" : [ { "value" : "72120936" } ] }, { "metrics" : [ { "value" : "49632" } ], "type" : "COUNTER", "name" : "pg_stat_idx_blks_read", "help" : "Number of disk blocks read from all indexes" }, { "metrics" : [ { "value" : "85697544" } ], "type" : "COUNTER", "name" : "pg_stat_heap_blocks_hit", "help" : "Number of buffer hits" }, { "help" : "Number of disk blocks read", "name" : "pg_stat_heap_blocks_read", "type" : "COUNTER", "metrics" : [ { "value" : "69513" } ] }, { "metrics" : [ { "value" : "29610" } ], "type" : "GAUGE", "help" : "Estimated number of dead rows", "name" : "pg_stat_n_dead_tup" }, { "type" : "GAUGE", "metrics" : [ { "value" : "92355" } ], "help" : "Estimated number of live rows", "name" : "pg_stat_n_live_tup" }, { "type" : "COUNTER", "metrics" : [ { "value" : "31020" } ], "name" : "pg_stat_n_tup_del", "help" : "Number of rows deleted" }, { "help" : "Number of rows updated", "name" : "pg_stat_n_tup_upd", "metrics" : [ { "value" : "87702" } ], "type" : "COUNTER" }, { "metrics" : [ { "value" : "52452" } ], "type" : 
"COUNTER", "help" : "Number of rows inserted", "name" : "pg_stat_n_tup_ins" }, { "help" : "Number of live rows fetched by index scans", "name" : "pg_stat_tup_fetch", "type" : "COUNTER", "metrics" : [ { "value" : "44669364" } ] }, { "type" : "COUNTER", "metrics" : [ { "value" : "48594381" } ], "name" : "pg_stat_idx_scan", "help" : "Number of index scans initiated" }, { "name" : "pg_stat_seq_tup_read", "help" : "Number of live rows fetched by sequential scans", "type" : "COUNTER", "metrics" : [ { "value" : "1167619731" } ] }, { "help" : "Number of sequential scans initiated", "name" : "pg_stat_seq_scan", "metrics" : [ { "value" : "9793719" } ], "type" : "COUNTER" } ] ``` Signed-off-by: Kartik Null Cating-Subramanian --- dev/defaults.yml | 1 + oc-chef-pedant/Gemfile | 2 + oc-chef-pedant/Gemfile.lock | 14 +- oc-chef-pedant/lib/pedant.rb | 5 +- oc-chef-pedant/lib/pedant/config.rb | 3 + oc-chef-pedant/lib/pedant/platform.rb | 8 +- oc-chef-pedant/pedant_config.rb | 1 + oc-chef-pedant/spec/api/stats_spec.rb | 128 ++++++++++++++++++ .../private-chef/attributes/default.rb | 6 + .../private-chef/libraries/helper.rb | 9 ++ .../private-chef/libraries/private_chef.rb | 3 +- .../private-chef/recipes/nginx.rb | 20 +++ .../default/nginx/nginx_chef_api_lb.conf.erb | 11 ++ .../templates/default/oc_erchef.config.erb | 6 + .../private-chef-ctl-commands/secrets.rb | 2 +- .../files/private-chef-ctl-commands/test.rb | 1 + .../apps/chef_db/priv/pgsql_statements.config | 28 ++++ .../apps/chef_db/src/chef_pgsql_collector.erl | 92 +++++++++++++ .../apps/oc_chef_wm/priv/dispatch.conf | 1 + .../src/chef_wm_pooler_collector.erl | 76 +++++++++++ .../src/chef_wm_prometheus_json_format.erl | 100 ++++++++++++++ .../apps/oc_chef_wm/src/chef_wm_stats.erl | 67 +++++++++ src/oc_erchef/rebar.config | 4 +- src/oc_erchef/rebar.lock | 4 + src/oc_erchef/src/oc_erchef.app.src | 3 +- 25 files changed, 586 insertions(+), 9 deletions(-) create mode 100644 oc-chef-pedant/spec/api/stats_spec.rb create mode 100644 
src/oc_erchef/apps/chef_db/src/chef_pgsql_collector.erl create mode 100644 src/oc_erchef/apps/oc_chef_wm/src/chef_wm_pooler_collector.erl create mode 100644 src/oc_erchef/apps/oc_chef_wm/src/chef_wm_prometheus_json_format.erl create mode 100644 src/oc_erchef/apps/oc_chef_wm/src/chef_wm_stats.erl diff --git a/dev/defaults.yml b/dev/defaults.yml index 62bfe36b2e..962c7f395c 100644 --- a/dev/defaults.yml +++ b/dev/defaults.yml @@ -251,6 +251,7 @@ projects: secrets: args: list: + - opscode_erchef.stats_password - chef-server.webui_key - chef-server.superuser_key diff --git a/oc-chef-pedant/Gemfile b/oc-chef-pedant/Gemfile index 8e95cdbbaf..ee1f4ddee6 100644 --- a/oc-chef-pedant/Gemfile +++ b/oc-chef-pedant/Gemfile @@ -4,6 +4,8 @@ gemspec gem 'rest-client', github: 'chef/rest-client' # For debugging in dvm gem 'pry' +gem 'pry-byebug' +gem 'pry-stack_explorer' gem 'rake' # For "rake chef_zero_spec" diff --git a/oc-chef-pedant/Gemfile.lock b/oc-chef-pedant/Gemfile.lock index 33122955c4..525e9157fb 100644 --- a/oc-chef-pedant/Gemfile.lock +++ b/oc-chef-pedant/Gemfile.lock @@ -30,9 +30,13 @@ GEM i18n (~> 0.7) minitest (~> 5.1) tzinfo (~> 1.1) + binding_of_caller (0.7.2) + debug_inspector (>= 0.0.1) builder (3.2.3) + byebug (9.1.0) coderay (1.1.1) concurrent-ruby (1.0.4) + debug_inspector (0.0.3) diff-lcs (1.3) erubis (2.7.0) i18n (0.8.1) @@ -52,6 +56,12 @@ GEM coderay (~> 1.1.0) method_source (~> 0.8.1) slop (~> 3.4) + pry-byebug (3.5.0) + byebug (~> 9.1) + pry (~> 0.10) + pry-stack_explorer (0.4.9.2) + binding_of_caller (>= 0.7) + pry (>= 0.9.11) rake (12.0.0) rdoc (5.1.0) rspec (3.5.0) @@ -83,8 +93,10 @@ PLATFORMS DEPENDENCIES oc-chef-pedant! pry + pry-byebug + pry-stack_explorer rake rest-client! 
BUNDLED WITH - 1.14.5 + 1.15.3 diff --git a/oc-chef-pedant/lib/pedant.rb b/oc-chef-pedant/lib/pedant.rb index 4365275b32..f95c05811e 100644 --- a/oc-chef-pedant/lib/pedant.rb +++ b/oc-chef-pedant/lib/pedant.rb @@ -78,10 +78,13 @@ def self.configure_logging def self.create_platform superuser_key = ENV['CHEF_SECRET_CHEF-SERVER.SUPERUSER_KEY'] || ENV['SUPERUSER_KEY'] webui_key = ENV['CHEF_SECRET_CHEF-SERVER.WEBUI_KEY'] || ENV['WEBUI_KEY'] + stats_password = ENV['CHEF_SECRET_OPSCODE_ERCHEF.STATS_PASSWORD'] || ENV['STATS_PASSWORD'] config.pedant_platform = Pedant::Platform.new(config.chef_server, superuser_key, webui_key, - config.superuser_name) + config.superuser_name, + config.stats_user, + stats_password) end def self.configure_rspec diff --git a/oc-chef-pedant/lib/pedant/config.rb b/oc-chef-pedant/lib/pedant/config.rb index ec72ec0d8e..97f2b500a8 100644 --- a/oc-chef-pedant/lib/pedant/config.rb +++ b/oc-chef-pedant/lib/pedant/config.rb @@ -114,6 +114,9 @@ def self.rspec_formatting_args # Default to a config file in the current directory config_file "pedant_config.rb" + # Default user for the stats end-point + stats_user "statsuser" + # Maximum time in seconds that search endpoint requests should be # retried before giving up (to accommodate the asynchronous # commits of Solr) diff --git a/oc-chef-pedant/lib/pedant/platform.rb b/oc-chef-pedant/lib/pedant/platform.rb index 6c3d69ac47..a32d24086c 100755 --- a/oc-chef-pedant/lib/pedant/platform.rb +++ b/oc-chef-pedant/lib/pedant/platform.rb @@ -28,13 +28,14 @@ class Platform attr_reader :test_org, :test_org_owner, :validate_org, :internal_account_url, :internal_server, :ldap, :ldap_testing, - :server, :base_resource_url, :superuser, :superuser_key_data, :webui_key + :server, :base_resource_url, :superuser, :superuser_key_data, :webui_key, + :stats_user, :stats_password # Create a Platform object for a given server (specified by # protocol, hostname, and port ONLY). 
You must supply the # superuser's key data in PEM form. # - def initialize(server, superuser_key, webui_key, superuser_name='pivotal') + def initialize(server, superuser_key, webui_key, superuser_name='pivotal', stats_user, stats_password) @superuser_key_data = superuser_key @webui_key = webui_key @server = (Pedant.config.explicit_port_url ? explicit_port_url(server) : server ) @@ -46,6 +47,9 @@ def initialize(server, superuser_key, webui_key, superuser_name='pivotal') @superuser = Pedant::Requestor.new(superuser_name, @superuser_key_data, platform: self) @test_org = org_from_config + @stats_user = stats_user + @stats_password = stats_password + @internal_account_url = Pedant::Config[:internal_account_url] @internal_server = Pedant::Config.internal_server || (fail "Missing internal_server in Pedant config.") @ldap = Pedant::Config[:ldap] diff --git a/oc-chef-pedant/pedant_config.rb b/oc-chef-pedant/pedant_config.rb index c2c5476b4e..54979bf33e 100644 --- a/oc-chef-pedant/pedant_config.rb +++ b/oc-chef-pedant/pedant_config.rb @@ -136,6 +136,7 @@ superuser_name 'pivotal' superuser_key '/etc/opscode/pivotal.pem' webui_key '/etc/opscode/webui_priv.pem' +stats_user 'statsuser' requestors({ :clients => { diff --git a/oc-chef-pedant/spec/api/stats_spec.rb b/oc-chef-pedant/spec/api/stats_spec.rb new file mode 100644 index 0000000000..728bb3e6c5 --- /dev/null +++ b/oc-chef-pedant/spec/api/stats_spec.rb @@ -0,0 +1,128 @@ +require 'pedant/rspec/common' +require 'mixlib/shellout' +require 'base64' + +describe "/_stats API endpoint", :stats do + + RESPONSE_TYPE_MAP = { + "erlang_vm_time_correction" => "UNTYPED", + "erlang_vm_thread_pool_size" => "GAUGE", + "erlang_vm_threads" => "UNTYPED", + "erlang_vm_smp_support" => "UNTYPED", + "erlang_vm_schedulers_online" => "GAUGE", + "erlang_vm_schedulers" => "GAUGE", + "erlang_vm_process_limit" => "GAUGE", + "erlang_vm_process_count" => "GAUGE", + "erlang_vm_port_limit" => "GAUGE", + "erlang_vm_port_count" => "GAUGE", + 
"erlang_vm_logical_processors_online" => "GAUGE", + "erlang_vm_logical_processors_available" => "GAUGE", + "erlang_vm_logical_processors" => "GAUGE", + "erlang_vm_ets_limit" => "GAUGE", + "erlang_vm_statistics_wallclock_time_milliseconds" => "COUNTER", + "erlang_vm_statistics_runtime_milliseconds" => "COUNTER", + "erlang_vm_statistics_run_queues_length_total" => "GAUGE", + "erlang_vm_statistics_reductions_total" => "COUNTER", + "erlang_vm_statistics_bytes_output_total" => "COUNTER", + "erlang_vm_statistics_bytes_received_total" => "COUNTER", + "erlang_vm_statistics_garbage_collection_bytes_reclaimed" => "COUNTER", + "erlang_vm_statistics_garbage_collection_words_reclaimed" => "COUNTER", + "erlang_vm_statistics_garbage_collection_number_of_gcs" => "COUNTER", + "erlang_vm_statistics_context_switches" => "COUNTER", + "erlang_vm_memory_system_bytes_total" => "GAUGE", + "erlang_vm_memory_processes_bytes_total" => "GAUGE", + "erlang_vm_ets_tables" => "GAUGE", + "erlang_vm_dets_tables" => "GAUGE", + "erlang_vm_memory_bytes_total" => "GAUGE", + "erlang_vm_memory_atom_bytes_total" => "GAUGE", + "erchef_pooler_queued_requestors_max" => "GAUGE", + "erchef_pooler_queued_requestors" => "GAUGE", + "erchef_pooler_members_max" => "GAUGE", + "erchef_pooler_members_free" => "GAUGE", + "erchef_pooler_members_in_use" => "GAUGE", + "pg_stat_n_conns" => "GAUGE", + "pg_stat_n_active_conns" => "GAUGE", + "pg_stat_tidx_blks_hit" => "COUNTER", + "pg_stat_tidx_blks_read" => "COUNTER", + "pg_stat_toast_blks_hit" => "COUNTER", + "pg_stat_toast_blks_read" => "COUNTER", + "pg_stat_idx_blks_hit" => "COUNTER", + "pg_stat_idx_blks_read" => "COUNTER", + "pg_stat_heap_blocks_hit" => "COUNTER", + "pg_stat_heap_blocks_read" => "COUNTER", + "pg_stat_n_dead_tup" => "GAUGE", + "pg_stat_n_live_tup" => "GAUGE", + "pg_stat_n_tup_del" => "COUNTER", + "pg_stat_n_tup_upd" => "COUNTER", + "pg_stat_n_tup_ins" => "COUNTER", + "pg_stat_tup_fetch" => "COUNTER", + "pg_stat_idx_scan" => "COUNTER", + 
"pg_stat_seq_tup_read" => "COUNTER", + "pg_stat_seq_scan" => "COUNTER" + } + + let(:request_url) { "#{platform.server}/_stats" } + let(:response_body) do + RESPONSE_TYPE_MAP.map do |name, type| + { + "name" => name, + "type" => type + } + end + end + + let(:boolean_stats) { ["erlang_vm_time_correction", "erlang_vm_threads", "erlang_vm_smp_support"] } + let(:auth_headers) do + { "Authorization" => "Basic " + Base64.strict_encode64("#{platform.stats_user}:#{platform.stats_password}") } + end + + # Don't turn on any of the tests unless we have a password. + if Pedant::Config.pedant_platform.stats_password + it "returns a list of collected statistics", :smoke do + get(request_url, nil, auth_headers: auth_headers).should look_like({ + :status => 200, + :body => response_body + }) + end + + it "returns json when ?format=json", :smoke do + get(request_url + "?format=json", nil, auth_headers: auth_headers).should look_like({ + :status => 200, + :body => response_body + }) + end + + it "returns prometheus output ?format=text", :smoke do + response = get(request_url + "?format=text", nil, auth_headers: auth_headers, + headers: { "Accept" => "*/*" }) + names = response.split("\n").reduce([]) do |acc, str| + m = str.strip.match(/^\w+/) + acc << m[0] if m + acc + end + expect(names.uniq).to match_array(RESPONSE_TYPE_MAP.keys) + end + + RESPONSE_TYPE_MAP.each do |name, type| + it "returns metrics for #{name} typed as #{type}" do + response = JSON.parse(get(request_url, nil, auth_headers: auth_headers)) + stat = response.find { |s| s["name"] == name } + expect(stat["metrics"]).not_to be_empty + stat["metrics"].each do |metric| + expect(metric).to have_key("value") + case type + when "GAUGE" + expect(Float(metric["value"])).to be_a(Numeric) + when "COUNTER" + expect(Integer(metric["value"])).to be_a(Integer) + when "UNTYPED" + expect(boolean_stats).to include(name) + expect(metric["value"]).to eq("1").or eq("0") + else + raise "Unimplemented test for metric type #{type}" + end + 
end + end + end + end +end diff --git a/omnibus/files/private-chef-cookbooks/private-chef/attributes/default.rb b/omnibus/files/private-chef-cookbooks/private-chef/attributes/default.rb index 64e83fa340..b0522344b8 100755 --- a/omnibus/files/private-chef-cookbooks/private-chef/attributes/default.rb +++ b/omnibus/files/private-chef-cookbooks/private-chef/attributes/default.rb @@ -401,6 +401,11 @@ default['private_chef']['opscode-erchef']['health_ping_timeout'] = 400 +# Stats endpoint +default['private_chef']['opscode-erchef']['stats_auth_enable'] = true +default['private_chef']['opscode-erchef']['stats_user'] = "statsuser" +default['private_chef']['opscode-erchef']['stats_password_file'] = "/var/opt/opscode/nginx/stats_htpasswd" + ### # Legacy path (required for cookbok migration) ### @@ -411,6 +416,7 @@ #### default['private_chef']['opscode-webui']['enable'] = false + #### # Chef Pedant #### diff --git a/omnibus/files/private-chef-cookbooks/private-chef/libraries/helper.rb b/omnibus/files/private-chef-cookbooks/private-chef/libraries/helper.rb index cdc8b24c90..539d3a9a35 100644 --- a/omnibus/files/private-chef-cookbooks/private-chef/libraries/helper.rb +++ b/omnibus/files/private-chef-cookbooks/private-chef/libraries/helper.rb @@ -17,6 +17,15 @@ def ownership {"owner" => owner, "group" => group} end + def apr1_password(password) + cmd = Mixlib::ShellOut.new("openssl passwd -apr1 '#{password}'") + cmd.run_command + unless cmd.status.success? 
+ raise "Failed to generate apr1 password hash" + end + cmd.stdout + end + def rabbitmq_configuration external = node['private_chef']['external-rabbitmq']['enable'] config = if external diff --git a/omnibus/files/private-chef-cookbooks/private-chef/libraries/private_chef.rb b/omnibus/files/private-chef-cookbooks/private-chef/libraries/private_chef.rb index f4a5def2c9..3dac4fc9a9 100644 --- a/omnibus/files/private-chef-cookbooks/private-chef/libraries/private_chef.rb +++ b/omnibus/files/private-chef-cookbooks/private-chef/libraries/private_chef.rb @@ -507,6 +507,7 @@ def gen_secrets_default(node_name) {group: "keepalived", name: "vrrp_instance_password", length: 100}, {group: "opscode_erchef", name: "sql_password", length: 60}, {group: "opscode_erchef", name: "sql_ro_password", length: 60}, + {group: "opscode_erchef", name: "stats_password", length: 100}, {group: "oc_bifrost", name: "superuser_id", length: 32, frozen: true}, {group: "oc_bifrost", name: "sql_password", length: 100}, {group: "oc_bifrost", name: "sql_ro_password", length: 100}, @@ -661,7 +662,6 @@ def ensure_bind_password end end - def gen_ldap required_ldap_config_values = %w{ host base_dn } ensure_bind_password @@ -833,7 +833,6 @@ def generate_config(node_name) generate_config_for_topology(PrivateChef["topology"], node_name) gen_ldap if PrivateChef["ldap"]["enabled"] - generate_hash end end diff --git a/omnibus/files/private-chef-cookbooks/private-chef/recipes/nginx.rb b/omnibus/files/private-chef-cookbooks/private-chef/recipes/nginx.rb index f696502dca..d3d493830d 100644 --- a/omnibus/files/private-chef-cookbooks/private-chef/recipes/nginx.rb +++ b/omnibus/files/private-chef-cookbooks/private-chef/recipes/nginx.rb @@ -169,6 +169,26 @@ end end +stats_passwd_file = node['private_chef']['opscode-erchef']['stats_password_file'] +if node['private_chef']['opscode-erchef']['stats_auth_enable'] + stats_api_passwd = PrivateChef.credentials.get('opscode_erchef', 'stats_password') + stats_api_passwd_hash = 
OmnibusHelper.new(node).apr1_password(stats_api_passwd) + + file stats_passwd_file do + content "#{node['private_chef']['opscode-erchef']['stats_user']}:#{stats_api_passwd_hash}" + mode '0400' + owner OmnibusHelper.new(node).ownership['owner'] + group OmnibusHelper.new(node).ownership['group'] + sensitive true + notifies :restart, 'runit_service[nginx]' unless backend_secondary? + end +elsif stats_passwd_file + file stats_passwd_file do + action :delete + notifies :restart, 'runit_service[nginx]' unless backend_secondary? + end +end + %w(https http).each do |server_proto| config_key = "chef_#{server_proto}_config".to_sym lb_config = chef_lb_configs[config_key] diff --git a/omnibus/files/private-chef-cookbooks/private-chef/templates/default/nginx/nginx_chef_api_lb.conf.erb b/omnibus/files/private-chef-cookbooks/private-chef/templates/default/nginx/nginx_chef_api_lb.conf.erb index ed1580def7..de04992779 100644 --- a/omnibus/files/private-chef-cookbooks/private-chef/templates/default/nginx/nginx_chef_api_lb.conf.erb +++ b/omnibus/files/private-chef-cookbooks/private-chef/templates/default/nginx/nginx_chef_api_lb.conf.erb @@ -143,6 +143,17 @@ proxy_pass http://opscode_erchef; } + # erchef stats endpoint + location ~ "^/_stats/?$" { + <% if node['private_chef']['opscode-erchef']['stats_auth_enable'] -%> + auth_basic "Chef Server Admin Stats"; + auth_basic_user_file <%= node['private_chef']['opscode-erchef']['stats_password_file'] %>; + <% end -%> + types { } + default_type application/json; + proxy_pass http://opscode_erchef; + } + # This variable is set to an empty string here so it can be used in # dispatch.lua later on. An add-on can set this variable to be used as an # upstream if we determine the request was not intended to go to the API. 
diff --git a/omnibus/files/private-chef-cookbooks/private-chef/templates/default/oc_erchef.config.erb b/omnibus/files/private-chef-cookbooks/private-chef/templates/default/oc_erchef.config.erb index d36fd1563a..f741d74355 100755 --- a/omnibus/files/private-chef-cookbooks/private-chef/templates/default/oc_erchef.config.erb +++ b/omnibus/files/private-chef-cookbooks/private-chef/templates/default/oc_erchef.config.erb @@ -56,6 +56,12 @@ {error_logger_hwm, <%= @log_rotation['max_messages_per_second'] %>} ]}, + {prometheus, [{collectors, [default, + <% if node['private_chef']['postgresql']['enable'] && !node['private_chef']['postgresql']['external'] -%> + chef_pgsql_collector, + <% end %> + chef_wm_pooler_collector]}]}, + {chef_secrets, [{provider, chef_secrets_fd}]}, {oc_chef_wm, [ diff --git a/omnibus/files/private-chef-ctl-commands/secrets.rb b/omnibus/files/private-chef-ctl-commands/secrets.rb index 250ebfc9e1..02001af088 100644 --- a/omnibus/files/private-chef-ctl-commands/secrets.rb +++ b/omnibus/files/private-chef-ctl-commands/secrets.rb @@ -19,7 +19,7 @@ "redis_lb" => ["password"], "drbd" => ["shared_secret"], "keepalived" => ["vrrp_instance_password"], - "opscode_erchef" => ["sql_password", "sql_ro_password"], + "opscode_erchef" => ["sql_password", "sql_ro_password", "stats_password"], "oc_bifrost" => ["superuser_id", "sql_password", "sql_ro_password"], "oc_id" => ["secret_key_base", "sql_password", "sql_ro_password"], "bookshelf" => ["access_key_id", "secret_access_key", "sql_password", "sql_ro_password"], diff --git a/omnibus/files/private-chef-ctl-commands/test.rb b/omnibus/files/private-chef-ctl-commands/test.rb index e7457b50a6..b04057778d 100644 --- a/omnibus/files/private-chef-ctl-commands/test.rb +++ b/omnibus/files/private-chef-ctl-commands/test.rb @@ -7,6 +7,7 @@ add_command_under_category "test", "general", "Run the API test suite against localhost.", 2 do ENV['SUPERUSER_KEY'] = credentials.get("chef-server", "superuser_key") ENV['WEBUI_KEY'] = 
credentials.get("chef-server", "webui_key") + ENV['STATS_PASSWORD'] = credentials.get('opscode_erchef', 'stats_password') pedant_args = ARGV[3..-1] pedant_args = ["--smoke"] unless pedant_args.any? diff --git a/src/oc_erchef/apps/chef_db/priv/pgsql_statements.config b/src/oc_erchef/apps/chef_db/priv/pgsql_statements.config index f3aec7902c..161e356ff9 100644 --- a/src/oc_erchef/apps/chef_db/priv/pgsql_statements.config +++ b/src/oc_erchef/apps/chef_db/priv/pgsql_statements.config @@ -19,6 +19,34 @@ {ping, <<"SELECT 'pong' as ping LIMIT 1">>}. +{stats, + <<"SELECT *" + " FROM" + " ( SELECT SUM(seq_scan) as seq_scan," + " SUM(seq_tup_read) as seq_tup_read," + " SUM(idx_scan) as idx_scan," + " SUM(idx_tup_fetch) as idx_tup_fetch," + " SUM(n_tup_ins) as n_tup_ins," + " SUM(n_tup_upd) as n_tup_upd," + " SUM(n_tup_del) as n_tup_del," + " SUM(n_live_tup) as n_live_tup," + " SUM(n_dead_tup) as n_dead_tup" + " FROM pg_stat_all_tables ) stats1," + " ( SELECT SUM(heap_blks_read) as heap_blks_read," + " SUM(heap_blks_hit) as heap_blks_hit," + " SUM(idx_blks_read) as idx_blks_read," + " SUM(idx_blks_hit) as idx_blks_hit," + " SUM(toast_blks_read) as toast_blks_read," + " SUM(toast_blks_hit) as toast_blks_hit," + " SUM(tidx_blks_read) as tidx_blks_read," + " SUM(tidx_blks_hit) as tidx_blks_hit" + " FROM pg_statio_all_tables ) stats2," + " ( SELECT COUNT(*) as n_active_conns" + " FROM pg_stat_activity as pga" + " WHERE pga.state = 'active') stats3," + " ( SELECT COUNT(*) as n_conns" + " FROM pg_stat_activity) stats4">>}. + %% Query to count the number of nodes for license checks {count_nodes, <<"SELECT COUNT(*) FROM nodes INNER JOIN orgs ON nodes.org_id = orgs.id;">>}. diff --git a/src/oc_erchef/apps/chef_db/src/chef_pgsql_collector.erl b/src/oc_erchef/apps/chef_db/src/chef_pgsql_collector.erl new file mode 100644 index 0000000000..0c5d2c1aeb --- /dev/null +++ b/src/oc_erchef/apps/chef_db/src/chef_pgsql_collector.erl @@ -0,0 +1,92 @@ +-module(chef_pgsql_collector). 
+ +-export([deregister_cleanup/1, + collect_mf/2, + collect_metrics/2]). + +-import(prometheus_model_helpers, [create_mf/5, + label_pairs/1, + gauge_metrics/1, + gauge_metric/1, + gauge_metric/2, + counter_metric/1, + counter_metric/2]). + +-behaviour(prometheus_collector). + +-define(METRICS, [ + {<<"seq_scan">>, {pg_stat_seq_scan, counter, "Number of sequential scans initiated on all tables", fun erlang:binary_to_integer/1}}, + {<<"seq_tup_read">>, {pg_stat_seq_tup_read, counter, "Number of live rows fetched by sequential scans on all tables", fun erlang:binary_to_integer/1}}, + {<<"idx_scan">>, {pg_stat_idx_scan, counter, "Number of index scans initiated on all tables", fun erlang:binary_to_integer/1}}, + {<<"idx_tup_fetch">>, {pg_stat_tup_fetch, counter, "Number of live rows fetched by index scans on all tables", fun erlang:binary_to_integer/1}}, + {<<"n_tup_ins">>, {pg_stat_n_tup_ins, counter, "Number of rows inserted on all tables", fun erlang:binary_to_integer/1}}, + {<<"n_tup_upd">>, {pg_stat_n_tup_upd, counter, "Number of rows updated on all tables", fun erlang:binary_to_integer/1}}, + {<<"n_tup_del">>, {pg_stat_n_tup_del, counter, "Number of rows deleted on all tables", fun erlang:binary_to_integer/1}}, + {<<"n_live_tup">>, {pg_stat_n_live_tup, gauge, "Estimated number of live rows on all tables", fun erlang:binary_to_integer/1}}, + {<<"n_dead_tup">>, {pg_stat_n_dead_tup, gauge, "Estimated number of dead rows on all tables", fun erlang:binary_to_integer/1}}, + {<<"heap_blks_read">>, {pg_stat_heap_blocks_read, counter, "Number of disk blocks read on all tables", fun erlang:binary_to_integer/1}}, + {<<"heap_blks_hit">>, {pg_stat_heap_blocks_hit, counter, "Number of buffer hits on all tables", fun erlang:binary_to_integer/1}}, + {<<"idx_blks_read">>, {pg_stat_idx_blks_read, counter, "Number of disk blocks read from all indexes", fun erlang:binary_to_integer/1}}, + {<<"idx_blks_hit">>, {pg_stat_idx_blks_hit, counter, "Number of buffer hits in all indexes", 
fun erlang:binary_to_integer/1}}, + {<<"toast_blks_read">>, {pg_stat_toast_blks_read, counter, "Number of disk blocks read from TOAST tables", fun erlang:binary_to_integer/1}}, + {<<"toast_blks_hit">>, {pg_stat_toast_blks_hit, counter, "Number of buffer hits in TOAST tables", fun erlang:binary_to_integer/1}}, + {<<"tidx_blks_read">>, {pg_stat_tidx_blks_read, counter, "Number of disk blocks read from TOAST table indexes", fun erlang:binary_to_integer/1}}, + {<<"tidx_blks_hit">>, {pg_stat_tidx_blks_hit, counter, "Number of buffer hits in TOAST table indexes", fun erlang:binary_to_integer/1}}, + % I have no idea why only these two columns come back as numbers instead of as a binary... + {<<"n_active_conns">>, {pg_stat_n_active_conns, gauge, "Number of active connections to the database", fun(X) -> X end}}, + {<<"n_conns">>, {pg_stat_n_conns, gauge, "Number of all connections to the database", fun(X) -> X end}} + ]). + +%%==================================================================== +%% Collector API +%%==================================================================== + +deregister_cleanup(_) -> ok. + + +collect_mf(_Registry, Callback) -> + + Stats = stats(), + + lists:foreach(fun({ColName, {StatName, StatType, StatHelp, Transform}}) -> + case proplists:get_value(ColName, Stats) of + undefined -> + ok; + Value -> + Callback(create_metric(StatName, + StatHelp, + StatType, + {StatType, Transform(Value)})) + end + end, ?METRICS), + ok. + +collect_metrics(_StatName, {counter, Value}) -> + counter_metric(Value); +collect_metrics(_StatName, {gauge, Value}) -> + gauge_metric(Value). + + +create_metric(Name, Help, Type, Data) -> + create_mf(Name, Help, Type, ?MODULE, Data). + + +%%==================================================================== +%% Private functions +%%==================================================================== + +-spec stats() -> [{binary(), binary() | integer()}]. 
+stats() -> + try + case sqerl:select(stats, [], first, []) of + {ok, Stats} -> Stats; + _Else -> throw(error) + end + catch + How:Why -> + error_logger:error_report({chef_sql, stats, How, Why}), + %% We don't want to propagate errors here. Doing so would mean + %% an error would prevent us from getting any metrics. + [] + end. + diff --git a/src/oc_erchef/apps/oc_chef_wm/priv/dispatch.conf b/src/oc_erchef/apps/oc_chef_wm/priv/dispatch.conf index bcc8be7a13..cda0c5b391 100755 --- a/src/oc_erchef/apps/oc_chef_wm/priv/dispatch.conf +++ b/src/oc_erchef/apps/oc_chef_wm/priv/dispatch.conf @@ -165,3 +165,4 @@ {["organizations", organization_id, "validate", '*'], chef_wm_validate, []}. {["_status"], chef_wm_status, []}. +{["_stats"], chef_wm_stats, []}. diff --git a/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_pooler_collector.erl b/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_pooler_collector.erl new file mode 100644 index 0000000000..cbcafb0d75 --- /dev/null +++ b/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_pooler_collector.erl @@ -0,0 +1,76 @@ +-module(chef_wm_pooler_collector). + +-export([deregister_cleanup/1, + collect_mf/2, + collect_metrics/2]). + +-import(prometheus_model_helpers, [create_mf/5, + label_pairs/1, + gauge_metrics/1, + gauge_metric/1, + gauge_metric/2, + counter_metric/1, + counter_metric/2]). + +-behaviour(prometheus_collector). + +-include_lib("pooler/src/pooler.hrl"). + +-define(POOLER_IN_USE, erchef_pooler_members_in_use). +-define(POOLER_FREE, erchef_pooler_members_free). +-define(POOLER_MAX, erchef_pooler_members_max). +-define(QUEUED_REQUESTORS, erchef_pooler_queued_requestors). +-define(QUEUED_REQUESTORS_MAX, erchef_pooler_queued_requestors_max). + +%%==================================================================== +%% Collector API +%%==================================================================== + +deregister_cleanup(_) -> ok. 
+ + +collect_mf(_Registry, Callback) -> + + % TODO(jaym) 08-23-17: This sucks, we should expose this in a sane way from pooler. Leaving for now + % as part of the complaint was that chef server should deal with the internals + % of chef server, not some external scripts. We're poking at a lot of the internal + % details of pooler here. + Pools = [{PoolName, gen_server:call(PoolName, dump_pool)} || PoolName <- monitored_pools()], + + Callback(create_gauge(?POOLER_IN_USE, + "Number of pool members currently being used.", + Pools)), + Callback(create_gauge(?POOLER_FREE, + "Number of pool members currently available.", + Pools)), + Callback(create_gauge(?POOLER_MAX, + "Max number of pool members allowed in the pool.", + Pools)), + Callback(create_gauge(?QUEUED_REQUESTORS, + "Number of requestors blocking to take a pool member.", + Pools)), + Callback(create_gauge(?QUEUED_REQUESTORS_MAX, + "Max number of requestors allowed to block on taking pool member.", + Pools)), + ok. + + +collect_metrics(?POOLER_IN_USE, Pools) -> + [gauge_metric([{pool_name, PoolName}], PoolData#pool.in_use_count) || {PoolName, PoolData} <- Pools]; +collect_metrics(?POOLER_FREE, Pools) -> + [gauge_metric([{pool_name, PoolName}], PoolData#pool.free_count) || {PoolName, PoolData} <- Pools]; +collect_metrics(?POOLER_MAX, Pools) -> + [gauge_metric([{pool_name, PoolName}], PoolData#pool.max_count) || {PoolName, PoolData} <- Pools]; +collect_metrics(?QUEUED_REQUESTORS, Pools) -> + [gauge_metric([{pool_name, PoolName}], queue:len(PoolData#pool.queued_requestors)) || {PoolName, PoolData} <- Pools]; +collect_metrics(?QUEUED_REQUESTORS_MAX, Pools) -> + [gauge_metric([{pool_name, PoolName}], PoolData#pool.queue_max) || {PoolName, PoolData} <- Pools]. + + + +create_gauge(Name, Help, Data) -> + create_mf(Name, Help, gauge, ?MODULE, Data). + +monitored_pools() -> + % TODO(jaym) 08-23-17: Move this out to configuration + [sqerl, oc_chef_authz_http, chef_index_http, chef_depsolver]. 
diff --git a/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_prometheus_json_format.erl b/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_prometheus_json_format.erl new file mode 100644 index 0000000000..68844e0ed0 --- /dev/null +++ b/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_prometheus_json_format.erl @@ -0,0 +1,100 @@ +-module(chef_wm_prometheus_json_format). + +-export([ + content_type/0, + format/0, + format/1]). + +-include_lib("prometheus/include/prometheus_model.hrl"). + +-define(PROCESS_DICT_STORAGE, chef_wm_prometheus_json_format_data). + +-spec content_type() -> binary(). +content_type() -> + <<"application/json">>. + +-spec format() -> binary(). +format() -> + format(default). + +-spec format(Registry :: prometheus_registry:registry()) -> binary(). +format(Registry) -> + put(?PROCESS_DICT_STORAGE, []), + Callback = fun (_, Collector) -> + registry_collect_callback(Registry, Collector) + end, + prometheus_registry:collect(Registry, Callback), + case get(?PROCESS_DICT_STORAGE) of + undefined -> + <<"{}">>; + Data -> + %TODO(jaym) 08/27/17: It might be worthwhile making this defensive to failure... always cleanup + erase(?PROCESS_DICT_STORAGE), + jiffy:encode(Data) + end. + +registry_collect_callback(Registry, Collector) -> + Callback = fun (MF) -> + Data = get(?PROCESS_DICT_STORAGE), + put(?PROCESS_DICT_STORAGE, [mf_to_erl(MF) | Data]) + end, + prometheus_collector:collect_mf(Registry, Collector, Callback). + +%% @private +mf_to_erl(#'MetricFamily'{name = Name, help = Help, type = Type, metric = Metrics}) -> + {[ + {name, Name}, + {type, string_type(Type)}, + {help, list_to_binary(Help)}, + {metrics, [metric_to_erl(Metric) || Metric <- Metrics]} + ]}. + + +metric_to_erl(#'Metric'{label=Labels} = Metric) -> + {maybe_prepend_labels(Labels, emit_metric(Metric))}. 
+ +emit_metric(#'Metric'{counter=#'Counter'{value=Value}}) -> + [ + {value, as_binary(Value)} + ]; +emit_metric(#'Metric'{gauge=#'Gauge'{value=Value}}) -> + [ + {value, as_binary(Value)} + ]; +emit_metric(#'Metric'{untyped=#'Untyped'{value=Value}}) -> + [ + {value, as_binary(Value)} + ]. +%% TODO(jaym) 08/27/17: Histogram and Summary types + +maybe_prepend_labels([], EJson) -> + EJson; +maybe_prepend_labels(Labels, EJson) -> + [{labels, labels(Labels)} | EJson]. + +labels(Labels) -> + Fun = fun (#'LabelPair'{name=Name, value=Value}) -> + {as_binary(Name), as_binary(Value)} + end, + {lists:map(Fun, Labels)}. + +as_binary(infinity) -> + <<"+Inf">>; +as_binary('-infinity') -> + <<"-Inf">>; +as_binary(Value) when is_binary(Value) -> + Value; +as_binary(Value) when is_list(Value) -> + list_to_binary(Value); +as_binary(Value) when is_float(Value) -> + list_to_binary(io_lib:format("~p", [Value])); +as_binary(Value) when is_integer(Value) -> + integer_to_binary(Value). + +string_type('COUNTER') -> + <<"COUNTER">>; +string_type('GAUGE') -> + <<"GAUGE">>; +string_type('UNTYPED') -> + <<"UNTYPED">>. + diff --git a/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_stats.erl b/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_stats.erl new file mode 100644 index 0000000000..131fd96c77 --- /dev/null +++ b/src/oc_erchef/apps/oc_chef_wm/src/chef_wm_stats.erl @@ -0,0 +1,67 @@ +%% -*- erlang-indent-level: 4;indent-tabs-mode: nil; fill-column: 92 -*- +%% ex: ts=4 sw=4 et +%%% @doc +%%% REST resource for reporting chef server stats +%%% @end +%% Copyright 2017 Chef Software, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. 
You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% + +-module(chef_wm_stats). + +-ifdef(TEST). +-compile(export_all). +-endif. + +-export([init/1, + allowed_methods/2, + content_types_provided/2, + to_json/2, + to_text/2]). + +-include_lib("webmachine/include/webmachine.hrl"). + +init(_Any) -> + {ok, []}. + +allowed_methods(Req, State) -> + {['GET'], Req, State}. + +content_types_provided(Req, State) -> + JsonProvider = {"application/json", to_json}, + TextProviders = [ + % I think there is a bug in webmachine where it wont allow us to use + % 'text/plain; version=0.0.4'. + % TODO: Understand https://github.com/basho/webmachine/blob/develop/src/webmachine_util.erl#L140-L158 + {{"text/plain",[{"version","0.0.4"}]}, to_text}, + {"text/plain", to_text}], + case wrq:get_qs_value("format", Req) of + undefined -> + {[JsonProvider | TextProviders], Req, State}; + "json" -> + {[JsonProvider], Req, State}; + "text" -> + {TextProviders, Req, State}; + _Format -> + %% Unknown content type requested in the query string. + {[], Req, State} + end. + +to_json(Req, State) -> + {chef_wm_prometheus_json_format:format(), Req, State}. + +to_text(Req, State) -> + {prometheus_text_format:format(), Req, State}. 
diff --git a/src/oc_erchef/rebar.config b/src/oc_erchef/rebar.config index 4344768842..ae35d5ccd4 100644 --- a/src/oc_erchef/rebar.config +++ b/src/oc_erchef/rebar.config @@ -62,7 +62,9 @@ {opscoderl_httpc, ".*", {git, "https://github.com/chef/opscoderl_httpc", {branch, "master"}}}, {sync, ".*", - {git, "https://github.com/rustyio/sync.git", {branch, "master"}}} + {git, "https://github.com/rustyio/sync.git", {branch, "master"}}}, + {prometheus, ".*", + {git, "https://github.com/deadtrickster/prometheus.erl", {tag, "v3.4.0"}}} ]}. {cover_enabled, true}. diff --git a/src/oc_erchef/rebar.lock b/src/oc_erchef/rebar.lock index 697dd92734..fc16074fc7 100644 --- a/src/oc_erchef/rebar.lock +++ b/src/oc_erchef/rebar.lock @@ -120,6 +120,10 @@ {git,"https://github.com/seth/pooler", {ref,"521f568bf9a2ccbe7c7e0fc23f24cd06ec559b79"}}, 0}, + {<<"prometheus">>, + {git,"https://github.com/deadtrickster/prometheus.erl", + {ref,"9faad3a354ba00ac632ad120276fbbd608afb1c4"}}, + 0}, {<<"quickrand">>, {git,"https://github.com/okeuday/quickrand.git", {ref,"c7eca718faa0d52c097155263dea6c25067396f7"}}, diff --git a/src/oc_erchef/src/oc_erchef.app.src b/src/oc_erchef/src/oc_erchef.app.src index ef6cd308ee..fad808c1b7 100644 --- a/src/oc_erchef/src/oc_erchef.app.src +++ b/src/oc_erchef/src/oc_erchef.app.src @@ -53,7 +53,8 @@ runtime_tools, tools, uuid, - webtool]}, + webtool, + prometheus]}, {mod, { oc_erchef_app, []}}, {env, []} ]}.