diff --git a/docs/configuration/overview.md b/docs/configuration/overview.md index de5ea952698..9ee855aa18e 100644 --- a/docs/configuration/overview.md +++ b/docs/configuration/overview.md @@ -43,6 +43,7 @@ NOTE: All `asset_storage` related fields are deprecated. Please use the fields i | **prom_metrics_enabled** | `Boolean` | `false` | Create prometheus exporters. Kubernetes clustering only | | **prom_metrics_port** | `Number` | `3333` | Port of prometheus exporter server. Kubernetes clustering only. Metrics will be visible at `http://localhost:<PORT>/metrics` | | **prom_metrics_add_default** | `Boolean` | `true` | Display default node metrics in prom exporter. Kubernetes clustering only | +| **prom_metrics_display_url** | `String` | `""` | Value to display as url label for prometheus metrics | | **workers** | `Number` | `4` | Number of workers per server | ## Teraslice Configuration Reference diff --git a/docs/development/k8s.md b/docs/development/k8s.md index 78be105b4ce..6b117b93ca9 100644 --- a/docs/development/k8s.md +++ b/docs/development/k8s.md @@ -368,6 +368,7 @@ The `PromMetrics` class lives within `packages/terafoundation/src/api/prom-metri | hasMetric | check if a metric exists | `(name: string) => boolean` | | deleteMetric | delete a metric from the metric list | `(name: string) => Promise` | | verifyAPI | verfiy that the API is running | `() => boolean` | +| resetMetrics | reset the values of all metrics | `() => void` | | shutdown | disable API and shutdown exporter server | `() => Promise` | | getDefaultLabels | retrieve the default labels set at init | `() => Record` | diff --git a/docs/management-apis/endpoints-json.md b/docs/management-apis/endpoints-json.md index 97e5fea501f..27e819bb00a 100644 --- a/docs/management-apis/endpoints-json.md +++ b/docs/management-apis/endpoints-json.md @@ -83,6 +83,55 @@ $ curl 'localhost:5678/v1/cluster/controllers' ] ``` +## GET /v1/cluster/stats + +Returns a json object containing cluster analytics. 
+ +**NOTE:** The slicer object is identical to controllers and is present for backwards compatibility. + +**Usage:** + +```sh +$ curl 'http://localhost:5678/v1/cluster/stats' +{ + "controllers": { + "processed": 2, + "failed": 0, + "queued": 0, + "job_duration": 3, + "workers_joined": 1, + "workers_disconnected": 0, + "workers_reconnected": 0 + }, + "slicer": { + "processed": 2, + "failed": 0, + "queued": 0, + "job_duration": 3, + "workers_joined": 1, + "workers_disconnected": 0, + "workers_reconnected": 0 + } +} +``` + +Include the following header to receive stats in "prometheus exporter mode": +```sh +$ curl -H "Accept: application/openmetrics-text;" -sS http://localhost:5678/cluster/stats +# TYPE teraslice_slices_processed counter +teraslice_slices_processed{cluster="teraslice-dev1"} 2 +# TYPE teraslice_slices_failed counter +teraslice_slices_failed{cluster="teraslice-dev1"} 0 +# TYPE teraslice_slices_queued counter +teraslice_slices_queued{cluster="teraslice-dev1"} 0 +# TYPE teraslice_workers_joined counter +teraslice_workers_joined{cluster="teraslice-dev1"} 1 +# TYPE teraslice_workers_disconnected counter +teraslice_workers_disconnected{cluster="teraslice-dev1"} 0 +# TYPE teraslice_workers_reconnected counter +teraslice_workers_reconnected{cluster="teraslice-dev1"} 0 +``` + ## GET /v1/assets Retreives a list of assets diff --git a/packages/job-components/src/test-helpers.ts b/packages/job-components/src/test-helpers.ts index 8e88951bdfa..0c6f7ee61a9 100644 --- a/packages/job-components/src/test-helpers.ts +++ b/packages/job-components/src/test-helpers.ts @@ -159,6 +159,7 @@ export class TestContext implements i.Context { prom_metrics_enabled: false, prom_metrics_port: 3333, prom_metrics_add_default: true, + prom_metrics_display_url: 'http://localhost', }, teraslice: { action_timeout: 10000, @@ -499,6 +500,11 @@ export class TestContext implements i.Context { verifyAPI(): boolean { return ctx.mockPromMetrics !== null; }, + resetMetrics() { + if 
(ctx.mockPromMetrics) { + ctx.mockPromMetrics.metricList = {}; + } + }, async shutdown(): Promise { ctx.mockPromMetrics = null; } diff --git a/packages/job-components/test/test-helpers-spec.ts b/packages/job-components/test/test-helpers-spec.ts index 6de9be36114..7294abe419b 100644 --- a/packages/job-components/test/test-helpers-spec.ts +++ b/packages/job-components/test/test-helpers-spec.ts @@ -142,6 +142,7 @@ describe('Test Helpers', () => { tf_prom_metrics_enabled: true, tf_prom_metrics_port: 3333, tf_prom_metrics_add_default: false, + prom_metrics_display_url: 'http://localhost' }; it('should be able to init a mock prom_metrics_api', async () => { @@ -215,6 +216,11 @@ describe('Test Helpers', () => { .toThrow('Metric missing_test_histogram is not setup'); }); + it('should reset metrics', () => { + context.apis.foundation.promMetrics.resetMetrics(); + expect(context.mockPromMetrics?.metricList).toBeEmptyObject(); + }) + it('should shutdown', async () => { await context.apis.foundation.promMetrics.shutdown(); expect(context.mockPromMetrics).toBeNull(); diff --git a/packages/terafoundation/src/api/prom-metrics/exporter.ts b/packages/terafoundation/src/api/prom-metrics/exporter.ts index 80f1b03883a..929d2a0aa86 100644 --- a/packages/terafoundation/src/api/prom-metrics/exporter.ts +++ b/packages/terafoundation/src/api/prom-metrics/exporter.ts @@ -48,4 +48,8 @@ export default class Exporter { async deleteMetric(name: string): Promise { promClient.register.removeSingleMetric(name); } + + resetMetrics() { + promClient.register.resetMetrics(); + } } diff --git a/packages/terafoundation/src/api/prom-metrics/prom-metrics-api.ts b/packages/terafoundation/src/api/prom-metrics/prom-metrics-api.ts index aa2ee469356..505966af68f 100644 --- a/packages/terafoundation/src/api/prom-metrics/prom-metrics-api.ts +++ b/packages/terafoundation/src/api/prom-metrics/prom-metrics-api.ts @@ -41,7 +41,7 @@ export class PromMetrics { const { assignment, job_prom_metrics_add_default, 
job_prom_metrics_enabled, job_prom_metrics_port, tf_prom_metrics_add_default, tf_prom_metrics_enabled, - tf_prom_metrics_port, labels, prefix, terasliceName + tf_prom_metrics_port, labels, prefix, terasliceName, prom_metrics_display_url } = config; const portToUse = job_prom_metrics_port || tf_prom_metrics_port; @@ -67,6 +67,7 @@ export class PromMetrics { this.default_labels = { name: terasliceName, assignment: apiConfig.assignment, + url: prom_metrics_display_url, ...apiConfig.labels }; await this.createAPI(apiConfig); @@ -405,6 +406,10 @@ export class PromMetrics { return this.apiRunning; } + resetMetrics() { + this.metricExporter.resetMetrics(); + } + async shutdown(): Promise { this.logger.info('prom_metrics_API exporter shutdown'); try { diff --git a/packages/terafoundation/src/schema.ts b/packages/terafoundation/src/schema.ts index 6605b45e741..e5fead5f293 100644 --- a/packages/terafoundation/src/schema.ts +++ b/packages/terafoundation/src/schema.ts @@ -83,6 +83,11 @@ export function foundationSchema() { doc: 'Display default node metrics in prom exporter', default: true, format: Boolean + }, + prom_metrics_display_url: { + doc: 'Value to display as url label for prometheus metrics', + default: '', + format: String } }; diff --git a/packages/terafoundation/test/apis/exporter-spec.ts b/packages/terafoundation/test/apis/exporter-spec.ts index 7844f8eec97..2b7783c6901 100644 --- a/packages/terafoundation/test/apis/exporter-spec.ts +++ b/packages/terafoundation/test/apis/exporter-spec.ts @@ -8,10 +8,19 @@ import Exporter from '../../src/api/prom-metrics/exporter.js'; describe('prometheus exporter', () => { let exporter: Exporter; + + async function getExporterMetrics(): Promise { + const response: Record = await got('http://127.0.0.1:3344/metrics', { + throwHttpErrors: true + }); + return response.body; + } + beforeAll(() => { const logger = debugLogger('prometheus_exporter'); exporter = new Exporter(logger); }); + describe('create', () => { const config: 
tf.PromMetricsAPIConfig = { assignment: 'worker', @@ -32,8 +41,9 @@ describe('prometheus exporter', () => { expect(response.body).toBeString(); }); }); + describe('delete', () => { - it('should shutdown the express server', async () => { + it('should delete a metric', async () => { new Counter({ name: 'delete_test', help: 'delete_test_help_message', @@ -48,15 +58,27 @@ describe('prometheus exporter', () => { const bodyAfter = await getExporterMetrics(); const valueAfter = bodyAfter.split('\n').filter((line: string) => line.includes('delete_test counter'))[0]; expect(valueAfter).toBe(undefined); + }); + }); - async function getExporterMetrics(): Promise { - const response: Record = await got('http://127.0.0.1:3344/metrics', { - throwHttpErrors: true - }); - return response.body; - } - }, 3000000); + describe('reset', () => { + it('should reset the prom metrics registry', async () => { + const counter = new Counter({ + name: 'reset_test', + help: 'reset_test_help_message', + labelNames: ['reset_test_label'], + }); + + counter.inc(100); + const bodyBefore = await getExporterMetrics(); + expect(bodyBefore).toInclude('reset_test 100'); + + exporter.resetMetrics(); + const bodyAfter = await getExporterMetrics(); + expect(bodyAfter).not.toInclude('reset_test 100'); + }); }); + describe('shutdown', () => { it('should shutdown the express server', async () => { await exporter.shutdown(); diff --git a/packages/terafoundation/test/apis/prom-metrics-spec.ts b/packages/terafoundation/test/apis/prom-metrics-spec.ts index 5144115235d..6d7470ea602 100644 --- a/packages/terafoundation/test/apis/prom-metrics-spec.ts +++ b/packages/terafoundation/test/apis/prom-metrics-spec.ts @@ -17,7 +17,8 @@ describe('promMetrics foundation API', () => { log_level: 'debug', prom_metrics_enabled: true, prom_metrics_port: 3333, - prom_metrics_add_default: true + prom_metrics_add_default: true, + prom_metrics_display_url: 'http://localhost' }, teraslice: { cluster_manager_type: 'kubernetes', @@ -33,7 
+34,8 @@ describe('promMetrics foundation API', () => { tf_prom_metrics_port: terafoundation.prom_metrics_port, tf_prom_metrics_add_default: terafoundation.prom_metrics_add_default, logger: debugLogger('prom-metrics-spec-logger'), - assignment: 'worker' + assignment: 'worker', + prom_metrics_display_url: terafoundation.prom_metrics_display_url }; beforeAll(() => { @@ -58,7 +60,7 @@ describe('promMetrics foundation API', () => { it('should have correct default labels', async () => { const labels = await context.apis.foundation.promMetrics.getDefaultLabels(); - expect(labels).toEqual({ assignment: 'worker', name: 'tera-test' }); + expect(labels).toEqual({ assignment: 'worker', name: 'tera-test', url: 'http://localhost' }); }); it('should throw an error if promMetricsAPI is already initialized', async () => { @@ -663,4 +665,74 @@ describe('promMetrics foundation API', () => { }); }); }); + + describe('resetMetrics', () => { + const context = { + sysconfig: { + terafoundation: { + log_level: 'debug', + prom_metrics_enabled: true, + prom_metrics_port: 3337, + prom_metrics_add_default: false + }, + teraslice: { + cluster_manager_type: 'kubernetes', + name: 'tera-test' + } + }, + } as any; + + const { terafoundation, teraslice } = context.sysconfig; + const config = { + terasliceName: teraslice.name, + tf_prom_metrics_enabled: terafoundation.prom_metrics_enabled, + tf_prom_metrics_port: terafoundation.prom_metrics_port, + tf_prom_metrics_add_default: terafoundation.prom_metrics_add_default, + logger: debugLogger('prom-metrics-spec-logger'), + assignment: 'master', + prefix: 'foundation_test_' + }; + + beforeAll(async () => { + // This sets up the API endpoints in the context. 
+ api(context); + context.logger = debugLogger('terafoundation-tests'); + await context.apis.foundation.promMetrics.init(config); + }); + + afterAll(async () => { + await context.apis.foundation.promMetrics.shutdown(); + }); + + it('should reset metrics', async () => { + await context.apis.foundation.promMetrics.addGauge('gauge2', 'help message', ['uuid'], function collect(this: Gauge) { + const defaultLabels = context.apis.foundation.promMetrics.getDefaultLabels(); + this.inc({ uuid: '7oBd9L3sJB', ...defaultLabels }, 0); + }); + context.apis.foundation.promMetrics.inc('gauge2', { uuid: '7oBd9L3sJB' }, 200); + const response1: Record = await got(`http://127.0.0.1:${config.tf_prom_metrics_port}/metrics`, { + throwHttpErrors: true + }); + + const value1 = response1.body + .split('\n') + .filter((line: string) => line.includes('7oBd9L3sJB'))[0] + .split(' ')[1]; + + expect(value1).toBe('200'); + + context.apis.foundation.promMetrics.resetMetrics(); + + const response2: Record = await got(`http://127.0.0.1:${config.tf_prom_metrics_port}/metrics`, { + throwHttpErrors: true + }); + + const value2 = response2.body + .split('\n') + .filter((line: string) => line.includes('7oBd9L3sJB'))[0] + .split(' ')[1]; + + expect(value2).toBe('0'); + }); + }) }); diff --git a/packages/terafoundation/test/test-context-spec.ts b/packages/terafoundation/test/test-context-spec.ts index 031cfce9290..6edc1e4aada 100644 --- a/packages/terafoundation/test/test-context-spec.ts +++ b/packages/terafoundation/test/test-context-spec.ts @@ -76,7 +76,8 @@ describe('TestContext', () => { tf_prom_metrics_port: 3333, tf_prom_metrics_add_default: false, logger: context.logger, - assignment: 'master' + assignment: 'master', + prom_metrics_display_url: context.sysconfig.terafoundation.prom_metrics_display_url }; expect(await context.apis.foundation.promMetrics.init(config)).toBe(true); }); diff --git a/packages/teraslice/src/lib/cluster/cluster_master.ts 
b/packages/teraslice/src/lib/cluster/cluster_master.ts index fd346fc257f..9d9c92b4c36 100644 --- a/packages/teraslice/src/lib/cluster/cluster_master.ts +++ b/packages/teraslice/src/lib/cluster/cluster_master.ts @@ -12,7 +12,6 @@ import { } from './services/index.js'; import { JobsStorage, ExecutionStorage, StateStorage } from '../storage/index.js'; import { ClusterMasterContext } from '../../interfaces.js'; -import { getPackageJSON } from '../utils/file_utils.js'; export class ClusterMaster { context: ClusterMasterContext; @@ -153,7 +152,8 @@ export class ClusterMaster { tf_prom_metrics_port: terafoundation.prom_metrics_port, logger: this.logger, assignment: 'master', - prefix: 'teraslice_' + prefix: 'teraslice_', + prom_metrics_display_url: terafoundation.prom_metrics_display_url }); await this.setupPromMetrics(); @@ -240,6 +240,36 @@ export class ClusterMaster { 'Information about Teraslice cluster master', ['arch', 'clustering_type', 'name', 'node_version', 'platform', 'teraslice_version'] ), + this.context.apis.foundation.promMetrics.addGauge( + 'slices_processed', + 'Total slices processed across the cluster', + [] + ), + this.context.apis.foundation.promMetrics.addGauge( + 'slices_failed', + 'Total slices failed across the cluster', + [] + ), + this.context.apis.foundation.promMetrics.addGauge( + 'slices_queued', + 'Total slices queued across the cluster', + [] + ), + this.context.apis.foundation.promMetrics.addGauge( + 'workers_joined', + 'Total workers joined across the cluster', + [] + ), + this.context.apis.foundation.promMetrics.addGauge( + 'workers_disconnected', + 'Total workers disconnected across the cluster', + [] + ), + this.context.apis.foundation.promMetrics.addGauge( + 'workers_reconnected', + 'Total workers reconnected across the cluster', + [] + ), this.context.apis.foundation.promMetrics.addGauge( 'controller_workers_active', 'Number of Teraslice workers actively processing slices.', @@ -343,19 +373,6 @@ export class ClusterMaster { ['ex_id', 
'job_id', 'job_name'], ), ]); - - this.context.apis.foundation.promMetrics.set( - 'master_info', - { - arch: this.context.arch, - clustering_type: this.context.sysconfig.teraslice.cluster_manager_type, - name: this.context.sysconfig.teraslice.name, - node_version: process.version, - platform: this.context.platform, - teraslice_version: getPackageJSON().version - }, - 1 - ); } } } diff --git a/packages/teraslice/src/lib/cluster/services/api.ts b/packages/teraslice/src/lib/cluster/services/api.ts index 13c335e8592..659489f7741 100644 --- a/packages/teraslice/src/lib/cluster/services/api.ts +++ b/packages/teraslice/src/lib/cluster/services/api.ts @@ -689,9 +689,61 @@ export class ApiService { /// Interval is hardcoded to refresh metrics every 10 seconds if (this.context.apis.foundation.promMetrics.verifyAPI()) { setInterval(async () => { + this.context.apis.foundation.promMetrics.resetMetrics(); try { this.logger.trace('Updating cluster_master prom metrics..'); const controllers = await this.executionService.getControllerStats(); + const stats = this.executionService.getClusterAnalytics(); + const { cluster_manager_type, name } = this.context.sysconfig.teraslice; + + this.context.apis.foundation.promMetrics.set( + 'master_info', + { + arch: this.context.arch, + clustering_type: cluster_manager_type, + name, + node_version: process.version, + platform: this.context.platform, + teraslice_version: getPackageJSON().version + }, + 1 + ); + + this.context.apis.foundation.promMetrics.set( + 'slices_processed', + {}, + stats.controllers.processed + ); + + this.context.apis.foundation.promMetrics.set( + 'slices_failed', + {}, + stats.controllers.failed + ); + + this.context.apis.foundation.promMetrics.set( + 'slices_queued', + {}, + stats.controllers.queued + ); + + this.context.apis.foundation.promMetrics.set( + 'workers_joined', + {}, + stats.controllers.workers_joined + ); + + this.context.apis.foundation.promMetrics.set( + 'workers_disconnected', + {}, + 
stats.controllers.workers_disconnected + ); + + this.context.apis.foundation.promMetrics.set( + 'workers_reconnected', + {}, + stats.controllers.workers_reconnected + ); for (const controller of controllers) { const controllerLabels = { @@ -746,7 +798,8 @@ export class ApiService { controller.slicers ); } - const exList = await this.executionStorage.search('ex_id:*'); + const query = this.executionStorage.getLivingStatuses().map((str) => `_status:${str}`).join(' OR '); + const exList = await this.executionStorage.search(query, undefined, 10000); for (const ex of exList) { const controllerLabels = { ex_id: ex.ex_id, @@ -826,9 +879,9 @@ export class ApiService { /// Filter out information about kubernetes ex pods const filteredExecutions = {}; - for (const cluster in clusterState) { - if (clusterState[cluster].active) { - for (const worker of clusterState[cluster].active) { + for (const node in clusterState) { + if (clusterState[node].active) { + for (const worker of clusterState[node].active) { if (!filteredExecutions[worker.ex_id]) { filteredExecutions[worker.ex_id] = worker.ex_id; const exLabel = { diff --git a/packages/teraslice/src/lib/workers/execution-controller/index.ts b/packages/teraslice/src/lib/workers/execution-controller/index.ts index 9cfc7a7a638..db8acd00eac 100644 --- a/packages/teraslice/src/lib/workers/execution-controller/index.ts +++ b/packages/teraslice/src/lib/workers/execution-controller/index.ts @@ -159,7 +159,9 @@ export class ExecutionController { ex_id: exId, job_id: jobId, job_name: config.name, - } + }, + prom_metrics_display_url: terafoundation.prom_metrics_display_url + }); await this.setupPromMetrics(); } diff --git a/packages/teraslice/src/lib/workers/worker/index.ts b/packages/teraslice/src/lib/workers/worker/index.ts index 0f2c651e383..3c90af082cb 100644 --- a/packages/teraslice/src/lib/workers/worker/index.ts +++ b/packages/teraslice/src/lib/workers/worker/index.ts @@ -107,7 +107,8 @@ export class Worker { ex_id: exId, job_id: 
jobId, job_name: config.name, - } + }, + prom_metrics_display_url: terafoundation.prom_metrics_display_url }); await this.setupPromMetrics(); } diff --git a/packages/types/src/terafoundation.ts b/packages/types/src/terafoundation.ts index a30a311e880..43e35e9c762 100644 --- a/packages/types/src/terafoundation.ts +++ b/packages/types/src/terafoundation.ts @@ -133,6 +133,7 @@ export interface TerafoundationConfig { prom_metrics_enabled: boolean; prom_metrics_port: number; prom_metrics_add_default: boolean; + prom_metrics_display_url: string; } export type SysConfig = { @@ -177,6 +178,7 @@ export interface PromMetricsInitConfig { job_prom_metrics_add_default?: boolean; labels?: Record; prefix?: string; + prom_metrics_display_url: string; } export interface PromMetricsAPIConfig { @@ -205,6 +207,7 @@ export interface PromMetrics { hasMetric: (name: string) => boolean; deleteMetric: (name: string) => Promise; verifyAPI: () => boolean; + resetMetrics: () => void; shutdown: () => Promise; getDefaultLabels: () => Record; }