diff --git a/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts b/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts index 8b3163e44915..a869c1e1186f 100644 --- a/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts +++ b/x-pack/plugins/apm/typings/elasticsearch/aggregations.ts @@ -145,6 +145,15 @@ export interface AggregationOptionsByType { >; keyed?: boolean; } & AggregationSourceOptions; + range: { + field: string; + ranges: Array< + | { key?: string; from: string | number } + | { key?: string; to: string | number } + | { key?: string; from: string | number; to: string | number } + >; + keyed?: boolean; + }; auto_date_histogram: { buckets: number; } & AggregationSourceOptions; @@ -324,6 +333,18 @@ interface AggregationResponsePart< ? Record : { buckets: DateRangeBucket[] }; }; + range: { + buckets: TAggregationOptionsMap extends { range: { keyed: true } } + ? Record< + string, + DateRangeBucket & + SubAggregationResponseOf + > + : Array< + DateRangeBucket & + SubAggregationResponseOf + >; + }; auto_date_histogram: { buckets: Array< DateHistogramBucket & diff --git a/x-pack/plugins/task_manager/server/MONITORING.md b/x-pack/plugins/task_manager/server/MONITORING.md new file mode 100644 index 000000000000..4960086411e9 --- /dev/null +++ b/x-pack/plugins/task_manager/server/MONITORING.md @@ -0,0 +1,265 @@ +# Task Manager Monitoring + +Task Manager has an internal monitoring mechanism which keeps track of a variety of metrics, which are exposed via a `health` api endpoint and Kibana Server Log debug messaging. + +## Exposed Metrics +There are three different sections to the stats returned by the `health` api. +- `configuration`: Summarizes Task Manager's current configuration. +- `workload`: Summarizes the workload in the current deployment. +- `runtime`: Tracks Task Manager's performance. 
+ +### Configuring the Stats +There are four new configurations: + +- `xpack.task_manager.monitored_stats_required_freshness` - The _required freshness_ of critical "Hot" stats, which means that if key stats (last polling cycle time, for example) haven't been refreshed within the specified duration, the `_health` endpoint and service will report an `Error` status. By default this is inferred from the configured `poll_interval` and is set to `poll_interval` plus a `1s` buffer. +- `xpack.task_manager.monitored_aggregated_stats_refresh_rate` - Dictates how often we refresh the "Cold" metrics. These metrics require an aggregation against Elasticsearch and add load to the system, hence we want to limit how often we execute these. We also infer the _required freshness_ of these "Cold" metrics from this configuration, which means that if these stats have not been updated within the required duration then the `_health` endpoint and service will report an `Error` status. This covers the entire `workload` section of the stats. By default this is configured to `60s`, and as a result the _required freshness_ defaults to `61s` (refresh plus a `1s` buffer). +- `xpack.task_manager.monitored_stats_running_average_window`- Dictates the size of the window used to calculate the running average of various "Hot" stats, such as the time it takes to run a task, the _drift_ that tasks experience etc. These stats are collected throughout the lifecycle of tasks and this window will dictate how large the queue we keep in memory would be, and how many values we need to calculate the average against. We do not calculate the average on *every* new value, but rather only when the time comes to summarize the stats before logging them or returning them to the API endpoint. 
+- `xpack.task_manager.monitored_task_execution_thresholds`- Configures the threshold of failed task executions at which point the `warn` or `error` health status will be set either at a default level or a custom level for specific task types. This will allow you to mark the health as `error` when any task type fails 90% of the time, but set it to `error` at 50% of the time for task types that you consider critical. This value can be set to any number between 0 and 100, and a threshold is hit when the value *exceeds* this number. This means that you can avoid setting the status to `error` by setting the threshold at 100, or hit `error` the moment any task fails by setting the threshold to 0 (as it will exceed 0 once a single failure occurs). + +For example, in your `Kibana.yml`: +``` +xpack.task_manager.monitored_stats_required_freshness: 5000 +xpack.task_manager.monitored_aggregated_stats_refresh_rate: 60000 +xpack.task_manager.monitored_stats_running_average_window: 50 +xpack.task_manager.monitored_task_execution_thresholds: + default: + error_threshold: 70 + warn_threshold: 50 + custom: + "alerting:always-firing": + error_threshold: 50 + warn_threshold: 0 +``` + +## Consuming Health Stats +Task Manager exposes a `/api/task_manager/_health` api which returns the _latest_ stats. +Calling this API is designed to be fast and doesn't actually perform any checks- rather it returns the result of the latest stats in the system, and is designed in such a way that you could call it from an external service on a regular basis without worrying that you'll be adding substantial load to the system. + +Additionally, the metrics are logged out into Task Manager's `DEBUG` logger at a regular cadence (dictated by the Polling Interval). 
+If you wish to enable DEBUG logging in your Kibana instance, you will need to add the following to your `Kibana.yml`: +``` +logging: + loggers: + - context: plugins.taskManager + appenders: [console] + level: debug +``` + +Please bear in mind that these stats are logged as often as your `poll_interval` configuration, which means it could add substantial noise to your logs. +We would recommend only enabling this level of logging temporarily. + +### Understanding the Exposed Stats + +As mentioned above, the `health` api exposes three sections: `configuration`, `workload` and `runtime`. +Each section has a `timestamp` and a `status` which indicates when the last update to this section took place and whether the health of this section was evaluated as `OK`, `Warning` or `Error`. + +The root has its own `status` which indicates the state of the system overall, as inferred from the `status` of each section. +An `Error` status in any section will cause the whole system to display as `Error`. +A `Warning` status in any section will cause the whole system to display as `Warning`. +An `OK` status will only be displayed when all sections are marked as `OK`. + +The root `timestamp` is the time in which the summary was exposed (either to the DEBUG logger or the http api) and the `last_update` is the last time any one of the sections was updated. + +#### The Configuration Section +The `configuration` section summarizes Task Manager's current configuration, including dynamic configurations which change over time, such as `poll_interval` and `max_workers` which adjust in reaction to changing load on the system. + +These are "Hot" stats which are updated whenever a change happens in the configuration. + +#### The Workload Section +The `workload` section summarizes the workload in the current deployment, listing the tasks in the system, their types and what their current status is. 
+ +It includes three sub sections: + - The number of tasks scheduled in the system, broken down by type and status. + - The number of idle `overdue` tasks, whose `runAt` has expired. + - Execution density in the next minute or so (configurable), which shows how many tasks are scheduled to execute in the scope of each polling interval. This can give us an idea of how much load there is on the current Kibana deployment. + +These are "Cold" stats which are updated at a regular cadence, configured by the `monitored_aggregated_stats_refresh_rate` config. + +#### The Runtime Section +The `runtime` tracks Task Manager's performance as it runs, making note of task execution time, _drift_ etc. +These include: + - The time it takes a task to run (mean and median, using a configurable running average window, `50` by default) + - The average _drift_ that tasks experience (mean and median, using the same configurable running average window as above). Drift tells us how long after its scheduled time a task typically executes. + - The polling rate (the timestamp of the last time a polling cycle completed) and the result [`No tasks | Filled task pool | Unexpectedly ran out of workers`] frequency over the past 50 polling cycles (using the same window size as the one used for running averages) + - The `Success | Retry | Failure ratio` by task type. This is different than the workload stats which tell you what's in the queue, but can't keep track of retries and of non recurring tasks as they're wiped off the index when completed. + +These are "Hot" stats which are updated reactively as Tasks are executed and interacted with. 
+ +### Example Stats + +For example, if you _curl_ the `/api/task_manager/_health` endpoint, you might get these stats: +``` +{ + /* the time these stats were returned by the api */ + "timestamp": "2020-10-05T18:26:11.346Z", + /* the overall status of the system */ + "status": "OK", + /* last time any stat was updated in this output */ + "last_update": "2020-10-05T17:57:55.411Z", + "stats": { + "configuration": { /* current configuration of TM */ + "timestamp": "2020-10-05T17:56:06.507Z", + "status": "OK", + "value": { + "max_workers": 10, + "poll_interval": 3000, + "request_capacity": 1000, + "max_poll_inactivity_cycles": 10, + "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_running_average_window": 50 + } + }, + "workload": { /* The workload of this deployment */ + "timestamp": "2020-10-05T17:57:06.534Z", + "status": "OK", + "value": { + "count": 6, /* count of tasks in the system */ + "task_types": { /* what tasks are there and what status are they in */ + "actions_telemetry": { + "count": 1, + "status": { + "idle": 1 + } + }, + "alerting_telemetry": { + "count": 1, + "status": { + "idle": 1 + } + }, + "apm-telemetry-task": { + "count": 1, + "status": { + "idle": 1 + } + }, + "endpoint:user-artifact-packager": { + "count": 1, + "status": { + "idle": 1 + } + }, + "lens_telemetry": { + "count": 1, + "status": { + "idle": 1 + } + }, + "session_cleanup": { + "count": 1, + "status": { + "idle": 1 + } + } + }, + + /* Frequency of recurring tasks schedules */ + "schedule": [ + ["60s", 1], /* 1 task, every 60s */ + ["3600s", 3], /* 3 tasks every hour */ + ["720m", 1] + ], + /* There are no overdue tasks in this system at the moment */ + "overdue": 0, + /* This is the schedule density, it shows a histogram of all the polling intervals in the next minute (or, if + pollInterval is configured unusually high it will show a min of 2 refresh intervals into the future, and a max of 50 buckets). 
+ Here we see that on the 3rd polling interval from *now* (which is ~9 seconds from now, as pollInterval is `3s`) there is one task due to run. + We also see that there are 5 due two intervals later, which is fine as we have a max workers of `10` + */ + "estimated_schedule_density": [0, 0, 1, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + } + }, + "runtime": { + "timestamp": "2020-10-05T17:57:55.411Z", + "status": "OK", + "value": { + "polling": { + /* When was the last polling cycle? */ + "last_successful_poll": "2020-10-05T17:57:55.411Z", + /* What is the frequency of polling cycle result? + Here we see 94% of "NoTasksClaimed" and 6% "PoolFilled" */ + "result_frequency_percent_as_number": { + "NoTasksClaimed": 94, + "RanOutOfCapacity": 0, /* This is a legacy result, we might want to rename - it tells us when a polling cycle resulted in claiming more tasks than we had workers for, but the name doesn't make much sense outside of the context of the code */ + "PoolFilled": 6 + } + }, + /* on average, the tasks in this deployment run 1.7s after their scheduled time */ + "drift": { + "mean": 1720, + "median": 2276 + }, + "execution": { + "duration": { + /* on average, the `endpoint:user-artifact-packager` tasks take 15ms to run */ + "endpoint:user-artifact-packager": { + "mean": 15, + "median": 14.5 + }, + "session_cleanup": { + "mean": 28, + "median": 28 + }, + "lens_telemetry": { + "mean": 100, + "median": 100 + }, + "actions_telemetry": { + "mean": 135, + "median": 135 + }, + "alerting_telemetry": { + "mean": 197, + "median": 197 + }, + "apm-telemetry-task": { + "mean": 1347, + "median": 1347 + } + }, + "result_frequency_percent_as_number": { + /* and 100% of `endpoint:user-artifact-packager` have completed in success (within the running average window, so the past 50 runs (by default, configurable by `monitored_stats_running_average_window`)) */ + "endpoint:user-artifact-packager": { + "status": "OK", 
+ "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "session_cleanup": { + /* `error` status as 90% of results are `Failed` */ + "status": "error", + "Success": 5, + "RetryScheduled": 5, + "Failed": 90 + }, + "lens_telemetry": { + "status": "OK", + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "actions_telemetry": { + "status": "OK", + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "alerting_telemetry": { + "status": "OK", + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + }, + "apm-telemetry-task": { + "status": "OK", + "Success": 100, + "RetryScheduled": 0, + "Failed": 0 + } + } + } + } + } + } +} +``` diff --git a/x-pack/plugins/task_manager/server/README.md b/x-pack/plugins/task_manager/server/README.md index fd2409a7db0a..a0b35ad09453 100644 --- a/x-pack/plugins/task_manager/server/README.md +++ b/x-pack/plugins/task_manager/server/README.md @@ -48,6 +48,10 @@ The task_manager can be configured via `taskManager` config options (e.g. `taskM - `override_num_workers`: An object of `taskType: number` that overrides the `num_workers` for tasks - For example: `task_manager.override_num_workers.reporting: 2` would override the number of workers occupied by tasks of type `reporting` - This allows sysadmins to tweak the operational performance of Kibana, allowing more or fewer tasks of a specific type to run simultaneously +- `monitored_aggregated_stats_refresh_rate` - Dictates how often we refresh the "Cold" metrics. Learn More: [./MONITORING](./MONITORING.md) +- `monitored_stats_running_average_window`- Dictates the size of the window used to calculate the running average of various "Hot" stats. Learn More: [./MONITORING](./MONITORING.md) +- `monitored_stats_required_freshness` - Dictates the _required freshness_ of critical "Hot" stats. Learn More: [./MONITORING](./MONITORING.md) +- `monitored_task_execution_thresholds`- Dictates the threshold of failed task executions. 
Learn More: [./MONITORING](./MONITORING.md) ## Task definitions @@ -460,3 +464,9 @@ The task manager's public API is create / delete / list. Updates aren't directly node scripts/functional_tests_server.js --config x-pack/test/plugin_api_integration/config.ts node scripts/functional_test_runner --config x-pack/test/plugin_api_integration/config.ts ``` + +## Monitoring + +Task Manager exposes runtime statistics which enable basic observability into its inner workings and makes it possible to monitor the system from external services. + +Learn More: [./MONITORING](./MONITORING.md) \ No newline at end of file diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index d5bbbe65582f..d2d5ac8f22a1 100644 --- a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -15,6 +15,16 @@ describe('config validation', () => { "max_attempts": 3, "max_poll_inactivity_cycles": 10, "max_workers": 10, + "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_required_freshness": 4000, + "monitored_stats_running_average_window": 50, + "monitored_task_execution_thresholds": Object { + "custom": Object {}, + "default": Object { + "error_threshold": 90, + "warn_threshold": 80, + }, + }, "poll_interval": 3000, "request_capacity": 1000, } @@ -31,4 +41,147 @@ describe('config validation', () => { `"[index]: \\".tasks\\" is an invalid Kibana Task Manager index, as it is already in use by the ElasticSearch Tasks Manager"` ); }); + + test('the required freshness of the monitored stats config must always be less-than-equal to the poll interval', () => { + const config: Record = { + monitored_stats_required_freshness: 100, + }; + expect(() => { + configSchema.validate(config); + }).toThrowErrorMatchingInlineSnapshot( + `"The specified monitored_stats_required_freshness (100) is invalid, as it is below the poll_interval (3000)"` + ); + }); + + test('the default required 
freshness of the monitored stats is poll interval with a slight buffer', () => { + const config: Record = {}; + expect(configSchema.validate(config)).toMatchInlineSnapshot(` + Object { + "enabled": true, + "index": ".kibana_task_manager", + "max_attempts": 3, + "max_poll_inactivity_cycles": 10, + "max_workers": 10, + "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_required_freshness": 4000, + "monitored_stats_running_average_window": 50, + "monitored_task_execution_thresholds": Object { + "custom": Object {}, + "default": Object { + "error_threshold": 90, + "warn_threshold": 80, + }, + }, + "poll_interval": 3000, + "request_capacity": 1000, + } + `); + }); + + test('the custom monitored_task_execution_thresholds can be configured at task type', () => { + const config: Record = { + monitored_task_execution_thresholds: { + custom: { + 'alerting:always-fires': { + error_threshold: 50, + warn_threshold: 30, + }, + }, + }, + }; + expect(configSchema.validate(config)).toMatchInlineSnapshot(` + Object { + "enabled": true, + "index": ".kibana_task_manager", + "max_attempts": 3, + "max_poll_inactivity_cycles": 10, + "max_workers": 10, + "monitored_aggregated_stats_refresh_rate": 60000, + "monitored_stats_required_freshness": 4000, + "monitored_stats_running_average_window": 50, + "monitored_task_execution_thresholds": Object { + "custom": Object { + "alerting:always-fires": Object { + "error_threshold": 50, + "warn_threshold": 30, + }, + }, + "default": Object { + "error_threshold": 90, + "warn_threshold": 80, + }, + }, + "poll_interval": 3000, + "request_capacity": 1000, + } + `); + }); + + test('the monitored_task_execution_thresholds ensures that the default warn_threshold is lt the default error_threshold', () => { + const config: Record = { + monitored_task_execution_thresholds: { + default: { + warn_threshold: 80, + error_threshold: 70, + }, + }, + }; + expect(() => { + configSchema.validate(config); + }).toThrowErrorMatchingInlineSnapshot( + 
`"[monitored_task_execution_thresholds.default]: warn_threshold (80) must be less than, or equal to, error_threshold (70)"` + ); + }); + + test('the monitored_task_execution_thresholds allows the default warn_threshold to equal the default error_threshold', () => { + const config: Record = { + monitored_task_execution_thresholds: { + default: { + warn_threshold: 70, + error_threshold: 70, + }, + }, + }; + expect(() => { + configSchema.validate(config); + }).not.toThrowError(); + }); + + test('the monitored_task_execution_thresholds ensures that the warn_threshold is lte error_threshold on custom thresholds', () => { + const config: Record = { + monitored_task_execution_thresholds: { + custom: { + 'alerting:always-fires': { + error_threshold: 80, + warn_threshold: 90, + }, + }, + }, + }; + expect(() => { + configSchema.validate(config); + }).toThrowErrorMatchingInlineSnapshot( + `"[monitored_task_execution_thresholds.custom.alerting:always-fires]: warn_threshold (90) must be less than, or equal to, error_threshold (80)"` + ); + }); + + test('the monitored_task_execution_thresholds allows a custom error_threshold which is lower than the default warn_threshold', () => { + const config: Record = { + monitored_task_execution_thresholds: { + default: { + warn_threshold: 80, + error_threshold: 90, + }, + custom: { + 'alerting:always-fires': { + error_threshold: 60, + warn_threshold: 50, + }, + }, + }, + }; + expect(() => { + configSchema.validate(config); + }).not.toThrowError(); + }); }); diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index aa78cf3baa96..157f01281836 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -10,44 +10,109 @@ export const DEFAULT_MAX_WORKERS = 10; export const DEFAULT_POLL_INTERVAL = 3000; export const DEFAULT_MAX_POLL_INACTIVITY_CYCLES = 10; -export const configSchema = schema.object({ - enabled: schema.boolean({ defaultValue: 
true }), - /* The maximum number of times a task will be attempted before being abandoned as failed */ - max_attempts: schema.number({ - defaultValue: 3, - min: 1, - }), - /* How often, in milliseconds, the task manager will look for more work. */ - poll_interval: schema.number({ - defaultValue: DEFAULT_POLL_INTERVAL, - min: 100, - }), - /* How many poll interval cycles can work take before it's timed out. */ - max_poll_inactivity_cycles: schema.number({ - defaultValue: DEFAULT_MAX_POLL_INACTIVITY_CYCLES, - min: 1, - }), - /* How many requests can Task Manager buffer before it rejects new requests. */ - request_capacity: schema.number({ - // a nice round contrived number, feel free to change as we learn how it behaves - defaultValue: 1000, - min: 1, - }), - /* The name of the index used to store task information. */ - index: schema.string({ - defaultValue: '.kibana_task_manager', - validate: (val) => { - if (val.toLowerCase() === '.tasks') { - return `"${val}" is an invalid Kibana Task Manager index, as it is already in use by the ElasticSearch Tasks Manager`; +// Monitoring Constants +// =================== +// Refresh aggregated monitored stats at a default rate of once a minute +export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000; +export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50; + +export const taskExecutionFailureThresholdSchema = schema.object( + { + error_threshold: schema.number({ + defaultValue: 90, + min: 0, + }), + warn_threshold: schema.number({ + defaultValue: 80, + min: 0, + }), + }, + { + validate(config) { + if (config.error_threshold < config.warn_threshold) { + return `warn_threshold (${config.warn_threshold}) must be less than, or equal to, error_threshold (${config.error_threshold})`; + } + }, + } +); + +export const configSchema = schema.object( + { + enabled: schema.boolean({ defaultValue: true }), + /* The maximum number of times a task will be attempted before being abandoned as failed */ + max_attempts: 
schema.number({ + defaultValue: 3, + min: 1, + }), + /* How often, in milliseconds, the task manager will look for more work. */ + poll_interval: schema.number({ + defaultValue: DEFAULT_POLL_INTERVAL, + min: 100, + }), + /* How many poll interval cycles can work take before it's timed out. */ + max_poll_inactivity_cycles: schema.number({ + defaultValue: DEFAULT_MAX_POLL_INACTIVITY_CYCLES, + min: 1, + }), + /* How many requests can Task Manager buffer before it rejects new requests. */ + request_capacity: schema.number({ + // a nice round contrived number, feel free to change as we learn how it behaves + defaultValue: 1000, + min: 1, + }), + /* The name of the index used to store task information. */ + index: schema.string({ + defaultValue: '.kibana_task_manager', + validate: (val) => { + if (val.toLowerCase() === '.tasks') { + return `"${val}" is an invalid Kibana Task Manager index, as it is already in use by the ElasticSearch Tasks Manager`; + } + }, + }), + /* The maximum number of tasks that this Kibana instance will run simultaneously. */ + max_workers: schema.number({ + defaultValue: DEFAULT_MAX_WORKERS, + // disable the task manager rather than trying to specify it with 0 workers + min: 1, + }), + /* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */ + monitored_stats_required_freshness: schema.number({ + defaultValue: (config?: unknown) => + ((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000, + min: 100, + }), + /* The rate at which we refresh monitored stats that require aggregation queries against ES. */ + monitored_aggregated_stats_refresh_rate: schema.number({ + defaultValue: DEFAULT_MONITORING_REFRESH_RATE, + /* don't run monitored stat aggregations any faster than once every 5 seconds */ + min: 5000, + }), + /* The size of the running average window for monitored stats. 
*/ + monitored_stats_running_average_window: schema.number({ + defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW, + max: 100, + min: 10, + }), + /* Task Execution result warn & error thresholds. */ + monitored_task_execution_thresholds: schema.object({ + default: taskExecutionFailureThresholdSchema, + custom: schema.recordOf(schema.string(), taskExecutionFailureThresholdSchema, { + defaultValue: {}, + }), + }), + }, + { + validate: (config) => { + if ( + config.monitored_stats_required_freshness && + config.poll_interval && + config.monitored_stats_required_freshness < config.poll_interval + ) { + return `The specified monitored_stats_required_freshness (${config.monitored_stats_required_freshness}) is invalid, as it is below the poll_interval (${config.poll_interval})`; } }, - }), - /* The maximum number of tasks that this Kibana instance will run simultaneously. */ - max_workers: schema.number({ - defaultValue: DEFAULT_MAX_WORKERS, - // disable the task manager rather than trying to specify it with 0 workers - min: 1, - }), -}); + } +); export type TaskManagerConfig = TypeOf; +export type TaskExecutionFailureThreshold = TypeOf; diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts index 11f6ccc88185..01326c73bd68 100644 --- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -30,7 +30,17 @@ describe('managed configuration', () => { max_attempts: 9, poll_interval: 3000, max_poll_inactivity_cycles: 10, + monitored_aggregated_stats_refresh_rate: 60000, + monitored_stats_required_freshness: 4000, + monitored_stats_running_average_window: 50, request_capacity: 1000, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, }); logger = 
context.logger.get('taskManager'); diff --git a/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts b/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts index 6df5b064f279..4de92ffc7703 100644 --- a/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts +++ b/x-pack/plugins/task_manager/server/lib/bulk_operation_buffer.ts @@ -4,11 +4,11 @@ * you may not use this file except in compliance with the Elastic License. */ +import { Logger } from 'src/core/server'; import { map } from 'lodash'; import { Subject, race, from } from 'rxjs'; import { bufferWhen, filter, bufferCount, flatMap, mapTo, first } from 'rxjs/operators'; import { either, Result, asOk, asErr, Ok, Err } from './result_type'; -import { Logger } from '../../../../../src/core/server'; export interface BufferOptions { bufferMaxDuration?: number; diff --git a/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts b/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts index 9e5f4b7c143a..a15682a9d3f3 100644 --- a/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts +++ b/x-pack/plugins/task_manager/server/lib/correct_deprecated_fields.ts @@ -4,8 +4,8 @@ * you may not use this file except in compliance with the Elastic License. 
*/ +import { Logger } from 'src/core/server'; import { TaskInstance, TaskInstanceWithDeprecatedFields } from '../task'; -import { Logger } from '../../../../../src/core/server'; export function ensureDeprecatedFieldsAreCorrected( { id, taskType, interval, schedule, ...taskInstance }: TaskInstanceWithDeprecatedFields, diff --git a/x-pack/plugins/task_manager/server/lib/intervals.test.ts b/x-pack/plugins/task_manager/server/lib/intervals.test.ts index ac28b81eaf49..e79694915f92 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.test.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.test.ts @@ -7,13 +7,13 @@ import _ from 'lodash'; import sinon from 'sinon'; import { - assertValidInterval, + parseIntervalAsSecond, + parseIntervalAsMillisecond, intervalFromNow, intervalFromDate, - minutesFromNow, - minutesFromDate, secondsFromNow, secondsFromDate, + asInterval, } from './intervals'; let fakeTimer: sinon.SinonFakeTimers; @@ -25,32 +25,100 @@ beforeAll(() => { afterAll(() => fakeTimer.restore()); describe('taskIntervals', () => { - describe('assertValidInterval', () => { + describe('parseIntervalAsSecond', () => { test('it accepts intervals in the form `Nm`', () => { - expect(() => assertValidInterval(`${_.random(1, 1000)}m`)).not.toThrow(); + expect(() => parseIntervalAsSecond(`${_.random(1, 1000)}m`)).not.toThrow(); }); test('it accepts intervals in the form `Ns`', () => { - expect(() => assertValidInterval(`${_.random(1, 1000)}s`)).not.toThrow(); + expect(() => parseIntervalAsSecond(`${_.random(1, 1000)}s`)).not.toThrow(); }); test('it rejects 0 based intervals', () => { - expect(() => assertValidInterval('0m')).toThrow( + expect(() => parseIntervalAsSecond('0m')).toThrow( /Invalid interval "0m"\. Intervals must be of the form {number}m. Example: 5m/ ); - expect(() => assertValidInterval('0s')).toThrow( + expect(() => parseIntervalAsSecond('0s')).toThrow( /Invalid interval "0s"\. Intervals must be of the form {number}m. 
Example: 5m/ ); }); test('it rejects intervals are not of the form `Nm` or `Ns`', () => { - expect(() => assertValidInterval(`5m 2s`)).toThrow( + expect(() => parseIntervalAsSecond(`5m 2s`)).toThrow( /Invalid interval "5m 2s"\. Intervals must be of the form {number}m. Example: 5m/ ); - expect(() => assertValidInterval(`hello`)).toThrow( + expect(() => parseIntervalAsSecond(`hello`)).toThrow( /Invalid interval "hello"\. Intervals must be of the form {number}m. Example: 5m/ ); }); + + test('returns an interval as s', () => { + expect(parseIntervalAsSecond('5s')).toEqual(5); + expect(parseIntervalAsSecond('15s')).toEqual(15); + expect(parseIntervalAsSecond('20m')).toEqual(20 * 60); + expect(parseIntervalAsSecond('61m')).toEqual(61 * 60); + expect(parseIntervalAsSecond('90m')).toEqual(90 * 60); + }); + }); + + describe('parseIntervalAsMillisecond', () => { + test('it accepts intervals in the form `Nm`', () => { + expect(() => parseIntervalAsMillisecond(`${_.random(1, 1000)}m`)).not.toThrow(); + }); + + test('it accepts intervals in the form `Ns`', () => { + expect(() => parseIntervalAsMillisecond(`${_.random(1, 1000)}s`)).not.toThrow(); + }); + + test('it rejects 0 based intervals', () => { + expect(() => parseIntervalAsMillisecond('0m')).toThrow( + /Invalid interval "0m"\. Intervals must be of the form {number}m. Example: 5m/ + ); + expect(() => parseIntervalAsMillisecond('0s')).toThrow( + /Invalid interval "0s"\. Intervals must be of the form {number}m. Example: 5m/ + ); + }); + + test('it rejects intervals are not of the form `Nm` or `Ns`', () => { + expect(() => parseIntervalAsMillisecond(`5m 2s`)).toThrow( + /Invalid interval "5m 2s"\. Intervals must be of the form {number}m. Example: 5m/ + ); + expect(() => parseIntervalAsMillisecond(`hello`)).toThrow( + /Invalid interval "hello"\. Intervals must be of the form {number}m. 
Example: 5m/ + ); + }); + + test('returns an interval as ms', () => { + expect(parseIntervalAsMillisecond('5s')).toEqual(5 * 1000); + expect(parseIntervalAsMillisecond('15s')).toEqual(15 * 1000); + expect(parseIntervalAsMillisecond('20m')).toEqual(20 * 60 * 1000); + expect(parseIntervalAsMillisecond('61m')).toEqual(61 * 60 * 1000); + expect(parseIntervalAsMillisecond('90m')).toEqual(90 * 60 * 1000); + }); + }); + + describe('asInterval', () => { + test('returns a ms interval when ms duration can only divide by ms', () => { + expect(asInterval(500)).toEqual('500ms'); + expect(asInterval(1500)).toEqual('1500ms'); + expect(asInterval(1001)).toEqual('1001ms'); + expect(asInterval(2001)).toEqual('2001ms'); + expect(asInterval(61001)).toEqual('61001ms'); + expect(asInterval(90001)).toEqual('90001ms'); + }); + + test('returns a seconds interval when ms duration divides by seconds', () => { + expect(asInterval(1000)).toEqual('1s'); + expect(asInterval(2000)).toEqual('2s'); + expect(asInterval(61000)).toEqual('61s'); + expect(asInterval(99000)).toEqual('99s'); + expect(asInterval(90000)).toEqual('90s'); + }); + + test('returns a minutes interval when ms duration divides by minutes', () => { + expect(asInterval(60000)).toEqual('1m'); + expect(asInterval(120000)).toEqual('2m'); + }); }); describe('intervalFromNow', () => { @@ -125,25 +193,6 @@ describe('taskIntervals', () => { }); }); - describe('minutesFromNow', () => { - test('it returns the current date plus a number of minutes', () => { - const mins = _.random(1, 100); - const expected = Date.now() + mins * 60 * 1000; - const nextRun = minutesFromNow(mins).getTime(); - expect(nextRun).toEqual(expected); - }); - }); - - describe('minutesFromDate', () => { - test('it returns the given date plus a number of minutes', () => { - const originalDate = new Date(2019, 1, 1); - const mins = _.random(1, 100); - const expected = originalDate.valueOf() + mins * 60 * 1000; - const nextRun = minutesFromDate(originalDate, 
mins).getTime(); - expect(expected).toEqual(nextRun); - }); - }); - describe('secondsFromNow', () => { test('it returns the current date plus a number of seconds', () => { const secs = _.random(1, 100); diff --git a/x-pack/plugins/task_manager/server/lib/intervals.ts b/x-pack/plugins/task_manager/server/lib/intervals.ts index 9009be5f7822..a28dfa62a501 100644 --- a/x-pack/plugins/task_manager/server/lib/intervals.ts +++ b/x-pack/plugins/task_manager/server/lib/intervals.ts @@ -4,6 +4,28 @@ * you may not use this file except in compliance with the Elastic License. */ +import { memoize } from 'lodash'; + +export enum IntervalCadence { + Minute = 'm', + Second = 's', +} +const VALID_CADENCE = new Set(Object.values(IntervalCadence)); +const CADENCE_IN_MS: Record = { + [IntervalCadence.Second]: 1000, + [IntervalCadence.Minute]: 60 * 1000, +}; + +function isCadence(cadence: IntervalCadence | string): cadence is IntervalCadence { + return VALID_CADENCE.has(cadence as IntervalCadence); +} + +export function asInterval(ms: number): string { + const secondsRemainder = ms % 1000; + const minutesRemainder = ms % 60000; + return secondsRemainder ? `${ms}ms` : minutesRemainder ? `${ms / 1000}s` : `${ms / 60000}m`; +} + /** * Returns a date that is the specified interval from now. Currently, * only minute-intervals and second-intervals are supported. 
@@ -14,14 +36,7 @@ export function intervalFromNow(interval?: string): Date | undefined { if (interval === undefined) { return; } - - assertValidInterval(interval); - - if (isSeconds(interval)) { - return secondsFromNow(parseInterval(interval)); - } - - return minutesFromNow(parseInterval(interval)); + return secondsFromNow(parseIntervalAsSecond(interval)); } /** @@ -35,37 +50,7 @@ export function intervalFromDate(date: Date, interval?: string): Date | undefine if (interval === undefined) { return; } - - assertValidInterval(interval); - - if (isSeconds(interval)) { - return secondsFromDate(date, parseInterval(interval)); - } - - return minutesFromDate(date, parseInterval(interval)); -} - -/** - * Returns a date that is mins minutes from now. - * - * @param mins The number of mintues from now - */ -export function minutesFromNow(mins: number): Date { - return minutesFromDate(new Date(), mins); -} - -/** - * Returns a date that is mins minutes from given date. - * - * @param date The date to add minutes to - * @param mins The number of mintues from given date - */ -export function minutesFromDate(date: Date, mins: number): Date { - const result = new Date(date.valueOf()); - - result.setMinutes(result.getMinutes() + mins); - - return result; + return secondsFromDate(date, parseIntervalAsSecond(interval)); } /** @@ -85,9 +70,7 @@ export function secondsFromNow(secs: number): Date { */ export function secondsFromDate(date: Date, secs: number): Date { const result = new Date(date.valueOf()); - result.setSeconds(result.getSeconds() + secs); - return result; } @@ -95,29 +78,22 @@ export function secondsFromDate(date: Date, secs: number): Date { * Verifies that the specified interval matches our expected format. 
* * @param {string} interval - An interval such as `5m` or `10s` + * @returns {number} The interval as seconds */ -export function assertValidInterval(interval: string) { - if (isMinutes(interval)) { - return interval; +export const parseIntervalAsSecond = memoize((interval: string): number => { + return Math.round(parseIntervalAsMillisecond(interval) / 1000); +}); + +export const parseIntervalAsMillisecond = memoize((interval: string): number => { + const numericAsStr: string = interval.slice(0, -1); + const numeric: number = parseInt(numericAsStr, 10); + const cadence: IntervalCadence | string = interval.slice(-1); + if (!isCadence(cadence) || isNaN(numeric) || numeric <= 0 || !isNumeric(numericAsStr)) { + throw new Error( + `Invalid interval "${interval}". Intervals must be of the form {number}m. Example: 5m.` + ); } + return numeric * CADENCE_IN_MS[cadence]; +}); - if (isSeconds(interval)) { - return interval; - } - - throw new Error( - `Invalid interval "${interval}". Intervals must be of the form {number}m. Example: 5m.` - ); -} - -function parseInterval(interval: string) { - return parseInt(interval, 10); -} - -function isMinutes(interval: string) { - return /^[1-9][0-9]*m$/.test(interval); -} - -function isSeconds(interval: string) { - return /^[1-9][0-9]*s$/.test(interval); -} +const isNumeric = (numAsStr: string) => /^\d+$/.test(numAsStr); diff --git a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts new file mode 100644 index 000000000000..f97861901b5b --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts @@ -0,0 +1,98 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { Subject } from 'rxjs'; +import { take, bufferCount } from 'rxjs/operators'; +import { createConfigurationAggregator } from './configuration_statistics'; +import { TaskManagerConfig } from '../config'; + +describe('Configuration Statistics Aggregator', () => { + test('merges the static config with the merged configs', async () => { + const configuration: TaskManagerConfig = { + enabled: true, + max_workers: 10, + index: 'foo', + max_attempts: 9, + poll_interval: 6000000, + monitored_stats_required_freshness: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }; + + const managedConfig = { + maxWorkersConfiguration$: new Subject(), + pollIntervalConfiguration$: new Subject(), + }; + + return new Promise(async (resolve, reject) => { + createConfigurationAggregator(configuration, managedConfig) + .pipe(take(3), bufferCount(3)) + .subscribe(([initial, updatedWorkers, updatedInterval]) => { + expect(initial.value).toEqual({ + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }); + + expect(updatedWorkers.value).toEqual({ + max_workers: 8, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }); + + expect(updatedInterval.value).toEqual({ + max_workers: 8, + poll_interval: 3000, + 
max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }); + resolve(); + }, reject); + + managedConfig.maxWorkersConfiguration$.next(8); + + managedConfig.pollIntervalConfiguration$.next(3000); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts new file mode 100644 index 000000000000..22b08bc5c88d --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts @@ -0,0 +1,51 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { combineLatest, of } from 'rxjs'; +import { pick, merge } from 'lodash'; +import { map, startWith } from 'rxjs/operators'; +import { AggregatedStatProvider } from './runtime_statistics_aggregator'; +import { TaskManagerConfig } from '../config'; +import { ManagedConfiguration } from '../lib/create_managed_configuration'; + +const CONFIG_FIELDS_TO_EXPOSE = [ + 'request_capacity', + 'max_poll_inactivity_cycles', + 'monitored_aggregated_stats_refresh_rate', + 'monitored_stats_running_average_window', + 'monitored_task_execution_thresholds', +] as const; + +export type ConfigStat = Pick< + TaskManagerConfig, + 'max_workers' | 'poll_interval' | typeof CONFIG_FIELDS_TO_EXPOSE[number] +>; + +export function createConfigurationAggregator( + config: TaskManagerConfig, + managedConfig: ManagedConfiguration +): AggregatedStatProvider { + return combineLatest([ + of(pick(config, ...CONFIG_FIELDS_TO_EXPOSE)), + managedConfig.pollIntervalConfiguration$.pipe( + startWith(config.poll_interval), + map>((pollInterval) => ({ + poll_interval: pollInterval, + })) + ), + managedConfig.maxWorkersConfiguration$.pipe( + startWith(config.max_workers), + map>((maxWorkers) => ({ + max_workers: maxWorkers, + })) + ), + ]).pipe( + map((configurations) => ({ + key: 'configuration', + value: merge({}, ...configurations), + })) + ); +} diff --git a/x-pack/plugins/task_manager/server/monitoring/index.ts b/x-pack/plugins/task_manager/server/monitoring/index.ts new file mode 100644 index 000000000000..8e71ce2519a7 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/index.ts @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { Logger } from 'src/core/server'; +import { Observable } from 'rxjs'; +import { TaskManagerConfig } from '../config'; +import { + MonitoringStats, + createAggregators, + createMonitoringStatsStream, +} from './monitoring_stats_stream'; +import { TaskStore } from '../task_store'; +import { TaskPollingLifecycle } from '../polling_lifecycle'; +import { ManagedConfiguration } from '../lib/create_managed_configuration'; + +export { + MonitoringStats, + HealthStatus, + RawMonitoringStats, + summarizeMonitoringStats, + createAggregators, + createMonitoringStatsStream, +} from './monitoring_stats_stream'; + +export function createMonitoringStats( + taskPollingLifecycle: TaskPollingLifecycle, + taskStore: TaskStore, + config: TaskManagerConfig, + managedConfig: ManagedConfiguration, + logger: Logger +): Observable { + return createMonitoringStatsStream( + createAggregators(taskPollingLifecycle, taskStore, config, managedConfig, logger), + config + ); +} diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts new file mode 100644 index 000000000000..8479def5deee --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts @@ -0,0 +1,173 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { TaskManagerConfig } from '../config'; +import { of, Subject } from 'rxjs'; +import { take, bufferCount } from 'rxjs/operators'; +import { createMonitoringStatsStream, AggregatedStat } from './monitoring_stats_stream'; +import { JsonValue } from 'src/plugins/kibana_utils/common'; + +beforeEach(() => { + jest.resetAllMocks(); +}); + +describe('createMonitoringStatsStream', () => { + const configuration: TaskManagerConfig = { + enabled: true, + max_workers: 10, + index: 'foo', + max_attempts: 9, + poll_interval: 6000000, + monitored_stats_required_freshness: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }; + + it('returns the initial config used to configure Task Manager', async () => { + return new Promise((resolve) => { + createMonitoringStatsStream(of(), configuration) + .pipe(take(1)) + .subscribe((firstValue) => { + expect(firstValue.stats).toEqual({}); + resolve(); + }); + }); + }); + + it('incrementally updates the stats returned by the endpoint', async () => { + const aggregatedStats$ = new Subject(); + + return new Promise((resolve) => { + createMonitoringStatsStream(aggregatedStats$, configuration) + .pipe(take(3), bufferCount(3)) + .subscribe(([initialValue, secondValue, thirdValue]) => { + expect(initialValue.stats).toMatchObject({ + lastUpdate: expect.any(String), + stats: { + configuration: { + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }, + }, + }, + }); + + expect(secondValue).toMatchObject({ + lastUpdate: 
expect.any(String), + stats: { + newAggregatedStat: { + timestamp: expect.any(String), + value: { + some: { + complex: { + value: 123, + }, + }, + }, + }, + configuration: { + timestamp: expect.any(String), + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }, + }, + }, + }); + + expect(thirdValue).toMatchObject({ + lastUpdate: expect.any(String), + stats: { + newAggregatedStat: { + timestamp: expect.any(String), + value: { + some: { + updated: { + value: 456, + }, + }, + }, + }, + configuration: { + timestamp: expect.any(String), + value: { + max_workers: 10, + poll_interval: 6000000, + max_poll_inactivity_cycles: 10, + request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + }, + }, + }, + }); + }); + + aggregatedStats$.next({ + key: 'newAggregatedStat', + value: { + some: { + complex: { + value: 123, + }, + }, + } as JsonValue, + }); + + aggregatedStats$.next({ + key: 'newAggregatedStat', + value: { + some: { + updated: { + value: 456, + }, + }, + } as JsonValue, + }); + + resolve(); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts new file mode 100644 index 000000000000..374660a257c5 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -0,0 +1,148 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +import { merge, of, Observable } from 'rxjs'; +import { map, scan } from 'rxjs/operators'; +import { set } from '@elastic/safer-lodash-set'; +import { Logger } from 'src/core/server'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { TaskStore } from '../task_store'; +import { TaskPollingLifecycle } from '../polling_lifecycle'; +import { + createWorkloadAggregator, + summarizeWorkloadStat, + WorkloadStat, +} from './workload_statistics'; +import { + createTaskRunAggregator, + summarizeTaskRunStat, + TaskRunStat, + SummarizedTaskRunStat, +} from './task_run_statistics'; +import { ConfigStat, createConfigurationAggregator } from './configuration_statistics'; +import { TaskManagerConfig } from '../config'; +import { AggregatedStatProvider } from './runtime_statistics_aggregator'; +import { ManagedConfiguration } from '../lib/create_managed_configuration'; + +export { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; + +export interface MonitoringStats { + last_update: string; + stats: { + configuration?: MonitoredStat; + workload?: MonitoredStat; + runtime?: MonitoredStat; + }; +} + +export enum HealthStatus { + OK = 'OK', + Warning = 'warn', + Error = 'error', +} + +interface MonitoredStat { + timestamp: string; + value: T; +} +type RawMonitoredStat = MonitoredStat & { + status: HealthStatus; +}; + +export interface RawMonitoringStats { + last_update: string; + stats: { + configuration?: RawMonitoredStat; + workload?: RawMonitoredStat; + runtime?: RawMonitoredStat; + }; +} + +export function createAggregators( + taskPollingLifecycle: TaskPollingLifecycle, + taskStore: TaskStore, + config: TaskManagerConfig, + managedConfig: ManagedConfiguration, + logger: Logger +): AggregatedStatProvider { + return merge( + createConfigurationAggregator(config, managedConfig), + 
createTaskRunAggregator(taskPollingLifecycle, config.monitored_stats_running_average_window), + createWorkloadAggregator( + taskStore, + config.monitored_aggregated_stats_refresh_rate, + config.poll_interval, + logger + ) + ); +} + +export function createMonitoringStatsStream( + provider$: AggregatedStatProvider, + config: TaskManagerConfig +): Observable { + const initialStats = { + last_update: new Date().toISOString(), + stats: {}, + }; + return merge( + // emit the initial stats + of(initialStats), + // emit updated stats whenever a provider updates a specific key on the stats + provider$.pipe( + map(({ key, value }) => { + return { + value: { timestamp: new Date().toISOString(), value }, + key, + }; + }), + scan((monitoringStats: MonitoringStats, { key, value }) => { + // incrementally merge stats as they come in + set(monitoringStats.stats, key, value); + monitoringStats.last_update = new Date().toISOString(); + return monitoringStats; + }, initialStats) + ) + ); +} + +export function summarizeMonitoringStats( + { + // eslint-disable-next-line @typescript-eslint/naming-convention + last_update, + stats: { runtime, workload, configuration }, + }: MonitoringStats, + config: TaskManagerConfig +): RawMonitoringStats { + return { + last_update, + stats: { + ...(configuration + ? { + configuration: { + ...configuration, + status: HealthStatus.OK, + }, + } + : {}), + ...(runtime + ? { + runtime: { + timestamp: runtime.timestamp, + ...summarizeTaskRunStat(runtime.value, config), + }, + } + : {}), + ...(workload + ? 
{ + workload: { + timestamp: workload.timestamp, + ...summarizeWorkloadStat(workload.value), + }, + } + : {}), + }, + }; +} diff --git a/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts b/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts new file mode 100644 index 000000000000..bd2b3845f252 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/runtime_statistics_aggregator.ts @@ -0,0 +1,16 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +import { Observable } from 'rxjs'; +import { JsonValue } from 'src/plugins/kibana_utils/common'; + +export interface AggregatedStat { + key: string; + value: Stat; +} + +export type AggregatedStatProvider = Observable< + AggregatedStat +>; diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts new file mode 100644 index 000000000000..eb8cabd9f3a8 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts @@ -0,0 +1,79 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import uuid from 'uuid'; + +import { + calculateRunningAverage, + calculateFrequency, + createRunningAveragedStat, + createMapOfRunningAveragedStats, +} from './task_run_calcultors'; + +describe('calculateRunningAverage', () => { + test('calculates the running average and median of a window of values', async () => { + expect(calculateRunningAverage([2, 2, 4, 6, 6])).toMatchInlineSnapshot(` + Object { + "p50": 4, + "p90": 6, + "p95": 6, + "p99": 6, + } + `); + }); +}); + +describe('calculateFrequency', () => { + test('calculates the frequency of each terms in the list as a percentage', async () => { + const [term1, term2, term3] = [uuid.v4(), uuid.v4(), uuid.v4()]; + expect( + calculateFrequency([term1, term2, term2, term3, term1, term1, term2, term1, term3]) + ).toEqual({ + [term3]: 22, + [term1]: 44, + [term2]: 33, + }); + }); +}); + +describe('createRunningAveragedStat', () => { + test('create a function which tracks a window of values', async () => { + const queue = createRunningAveragedStat(3); + expect(queue(1)).toEqual([1]); + expect(queue(2)).toEqual([1, 2]); + expect(queue(3)).toEqual([1, 2, 3]); + expect(queue(4)).toEqual([2, 3, 4]); + expect(queue(5)).toEqual([3, 4, 5]); + }); +}); + +describe('createMapOfRunningAveragedStats', () => { + test('create a function which tracks multiple window of values by key', async () => { + const [term1, term2, term3] = [uuid.v4(), uuid.v4(), uuid.v4()]; + const mappedQueues = createMapOfRunningAveragedStats(3); + expect(mappedQueues(term1, 1)).toEqual({ [term1]: [1] }); + expect(mappedQueues(term1, 2)).toEqual({ [term1]: [1, 2] }); + expect(mappedQueues(term2, 3)).toEqual({ [term1]: [1, 2], [term2]: [3] }); + expect(mappedQueues(term3, 4)).toEqual({ [term1]: [1, 2], [term2]: [3], [term3]: [4] }); + expect(mappedQueues(term2, 5)).toEqual({ [term1]: [1, 2], [term2]: [3, 5], [term3]: [4] }); + expect(mappedQueues(term2, 6)).toEqual({ [term1]: [1, 2], [term2]: [3, 5, 6], [term3]: [4] }); + expect(mappedQueues(term1, 
7)).toEqual({ + [term1]: [1, 2, 7], + [term2]: [3, 5, 6], + [term3]: [4], + }); + expect(mappedQueues(term1, 8)).toEqual({ + [term1]: [2, 7, 8], + [term2]: [3, 5, 6], + [term3]: [4], + }); + expect(mappedQueues(term1, 9)).toEqual({ + [term1]: [7, 8, 9], + [term2]: [3, 5, 6], + [term3]: [4], + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts new file mode 100644 index 000000000000..67b77a29b1c7 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts @@ -0,0 +1,65 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import stats from 'stats-lite'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { isUndefined, countBy, mapValues } from 'lodash'; + +export interface AveragedStat extends JsonObject { + p50: number; + p90: number; + p95: number; + p99: number; +} + +export function calculateRunningAverage(values: number[]): AveragedStat { + return { + p50: stats.percentile(values, 0.5), + p90: stats.percentile(values, 0.9), + p95: stats.percentile(values, 0.95), + p99: stats.percentile(values, 0.99), + }; +} + +/** + * Calculate the frequency of each term in a list of terms. + * @param values + */ +export function calculateFrequency(values: T[]): JsonObject { + return values.length + ? 
mapValues(countBy(values), (count) => Math.round((count * 100) / values.length)) + : {}; +} + +/** + * Utility to keep track of a bounded array of values which changes over time + * dropping older values as they slide out of the window we wish to track + */ +export function createRunningAveragedStat(runningAverageWindowSize: number) { + const list = new Array(); + return (value?: T) => { + if (!isUndefined(value)) { + if (list.length === runningAverageWindowSize) { + list.shift(); + } + list.push(value); + } + // clone list to ensure it isn't mutated externally + return [...list]; + }; +} + +export function createMapOfRunningAveragedStats(runningAverageWindowSize: number) { + const mappedQueue: Record T[]> = {}; + const asRecordOfValues = () => mapValues(mappedQueue, (queue) => queue()); + return (key?: string, value?: T) => { + if (!isUndefined(key)) { + mappedQueue[key] = mappedQueue[key] ?? createRunningAveragedStat(runningAverageWindowSize); + mappedQueue[key](value); + } + return asRecordOfValues(); + }; +} diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts new file mode 100644 index 000000000000..a931f0ff7c30 --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.test.ts @@ -0,0 +1,501 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import uuid from 'uuid'; +import { Subject, Observable } from 'rxjs'; +import stats from 'stats-lite'; +import sinon from 'sinon'; +import { take, tap, bufferCount, skip, map } from 'rxjs/operators'; + +import { ConcreteTaskInstance, TaskStatus } from '../task'; +import { asTaskRunEvent, asTaskPollingCycleEvent, TaskTiming } from '../task_events'; +import { asOk } from '../lib/result_type'; +import { TaskLifecycleEvent } from '../polling_lifecycle'; +import { TaskRunResult } from '../task_runner'; +import { + createTaskRunAggregator, + summarizeTaskRunStat, + TaskRunStat, + SummarizedTaskRunStat, +} from './task_run_statistics'; +import { AggregatedStat } from './runtime_statistics_aggregator'; +import { FillPoolResult } from '../lib/fill_pool'; +import { taskPollingLifecycleMock } from '../polling_lifecycle.mock'; +import { configSchema } from '../config'; + +describe('Task Run Statistics', () => { + let fakeTimer: sinon.SinonFakeTimers; + + beforeAll(() => { + fakeTimer = sinon.useFakeTimers(); + }); + + afterAll(() => fakeTimer.restore()); + + test('returns a running average of task drift', async () => { + const runAtDrift = [1000, 2000, 500, 300, 400, 15000, 20000, 200]; + const events$ = new Subject(); + const taskPollingLifecycle = taskPollingLifecycleMock.create({ + events$: events$ as Observable, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskPollingLifecycle, + runningAverageWindowSize + ); + + function expectWindowEqualsUpdate( + taskStat: AggregatedStat, + window: number[] + ) { + expect(taskStat.value.drift).toMatchObject({ + p50: stats.percentile(window, 0.5), + p90: stats.percentile(window, 0.9), + p95: stats.percentile(window, 0.95), + p99: stats.percentile(window, 0.99), + }); + } + + return new Promise((resolve) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 
'summarizeTaskRunStat' to receive summarize stats + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + })), + take(runAtDrift.length), + bufferCount(runAtDrift.length) + ) + .subscribe((taskStats: Array>) => { + expectWindowEqualsUpdate(taskStats[0], runAtDrift.slice(0, 1)); + expectWindowEqualsUpdate(taskStats[1], runAtDrift.slice(0, 2)); + expectWindowEqualsUpdate(taskStats[2], runAtDrift.slice(0, 3)); + expectWindowEqualsUpdate(taskStats[3], runAtDrift.slice(0, 4)); + expectWindowEqualsUpdate(taskStats[4], runAtDrift.slice(0, 5)); + // from the 6th value, begin to drop old values as out window is 5 + expectWindowEqualsUpdate(taskStats[5], runAtDrift.slice(1, 6)); + expectWindowEqualsUpdate(taskStats[6], runAtDrift.slice(2, 7)); + expectWindowEqualsUpdate(taskStats[7], runAtDrift.slice(3, 8)); + resolve(); + }); + + const now = Date.now(); + for (const drift of runAtDrift) { + const start = Math.floor(Math.random() * 1000); + events$.next( + mockTaskRunEvent( + { runAt: runAtMillisecondsAgo(drift + start) }, + { start: runAtMillisecondsAgo(start).getTime(), stop: now } + ) + ); + } + }); + }); + + test('returns a running average of task run duration', async () => { + const runDurations = [1000, 2000, 500, 300, 400, 15000, 20000, 200]; + const runDurationsInReverse = runDurations.reverse(); + const events$ = new Subject(); + const taskPollingLifecycle = taskPollingLifecycleMock.create({ + events$: events$ as Observable, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskPollingLifecycle, + runningAverageWindowSize + ); + + function expectWindowEqualsUpdate( + taskStat: AggregatedStat, + windows: Record + ) { + for (const [type, window] of Object.entries(windows)) { + expect(taskStat.value.execution.duration[type]).toMatchObject({ + p50: stats.percentile(window, 0.5), + p90: stats.percentile(window, 0.9), + p95: stats.percentile(window, 0.95), 
+ p99: stats.percentile(window, 0.99), + }); + } + } + + return new Promise((resolve, reject) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarize stats + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + })), + take(runDurations.length * 2), + bufferCount(runDurations.length * 2) + ) + .subscribe((taskStats: Array>) => { + try { + expectWindowEqualsUpdate(taskStats[0], { 'alerting:test': runDurations.slice(0, 1) }); + expectWindowEqualsUpdate(taskStats[1], { 'alerting:test': runDurations.slice(0, 2) }); + expectWindowEqualsUpdate(taskStats[2], { 'alerting:test': runDurations.slice(0, 3) }); + expectWindowEqualsUpdate(taskStats[3], { 'alerting:test': runDurations.slice(0, 4) }); + expectWindowEqualsUpdate(taskStats[4], { 'alerting:test': runDurations.slice(0, 5) }); + // from the 6th value, begin to drop old values as out window is 5 + expectWindowEqualsUpdate(taskStats[5], { 'alerting:test': runDurations.slice(1, 6) }); + expectWindowEqualsUpdate(taskStats[6], { 'alerting:test': runDurations.slice(2, 7) }); + expectWindowEqualsUpdate(taskStats[7], { 'alerting:test': runDurations.slice(3, 8) }); + expectWindowEqualsUpdate(taskStats[8], { + 'actions:test': runDurations.slice(0, 1), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[9], { + 'actions:test': runDurations.slice(0, 2), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[10], { + 'actions:test': runDurations.slice(0, 3), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[11], { + 'actions:test': runDurations.slice(0, 4), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[12], { + 'actions:test': runDurations.slice(0, 5), + 'alerting:test': 
runDurations.slice(3, 8), + }); + // from the 6th value, begin to drop old values as our window is 5 + expectWindowEqualsUpdate(taskStats[13], { + 'actions:test': runDurations.slice(1, 6), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[14], { + 'actions:test': runDurations.slice(2, 7), + 'alerting:test': runDurations.slice(3, 8), + }); + expectWindowEqualsUpdate(taskStats[15], { + 'actions:test': runDurations.slice(3, 8), + 'alerting:test': runDurations.slice(3, 8), + }); + resolve(); + } catch (e) { + reject(e); + } + }); + + const now = Date.now(); + for (const runDuration of runDurations) { + events$.next( + mockTaskRunEvent( + { taskType: 'alerting:test' }, + { start: runAtMillisecondsAgo(runDuration).getTime(), stop: now } + ) + ); + } + for (const runDuration of runDurationsInReverse) { + events$.next( + mockTaskRunEvent( + { taskType: 'actions:test' }, + { start: runAtMillisecondsAgo(runDuration).getTime(), stop: now } + ) + ); + } + }); + }); + + test('returns the frequency of task run results', async () => { + const events$ = new Subject(); + const taskPollingLifecycle = taskPollingLifecycleMock.create({ + events$: events$ as Observable, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskPollingLifecycle, + runningAverageWindowSize + ); + + return new Promise((resolve, reject) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarized stats + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + })), + take(10), + bufferCount(10) + ) + .subscribe((taskStats: Array>) => { + try { + /** + * At any given time we only keep track of the last X task run results + * In the tests this is configured to a window size of 5 + */ + expect( + taskStats.map( + (taskStat) 
=> + taskStat.value.execution.result_frequency_percent_as_number['alerting:test'] + ) + ).toEqual([ + // Success + { Success: 100, RetryScheduled: 0, Failed: 0, status: 'OK' }, + // Success, Success, + { Success: 100, RetryScheduled: 0, Failed: 0, status: 'OK' }, + // Success, Success, Success + { Success: 100, RetryScheduled: 0, Failed: 0, status: 'OK' }, + // Success, Success, Success, Failed + { Success: 75, RetryScheduled: 0, Failed: 25, status: 'OK' }, + // Success, Success, Success, Failed, Failed + { Success: 60, RetryScheduled: 0, Failed: 40, status: 'OK' }, + // Success, Success, Failed, Failed, Failed + { Success: 40, RetryScheduled: 0, Failed: 60, status: 'OK' }, + // Success, Failed, Failed, Failed, RetryScheduled + { Success: 20, RetryScheduled: 20, Failed: 60, status: 'OK' }, + // Failed, Failed, Failed, RetryScheduled, RetryScheduled + { Success: 0, RetryScheduled: 40, Failed: 60, status: 'OK' }, + // Failed, Failed, RetryScheduled, RetryScheduled, Success + { Success: 20, RetryScheduled: 40, Failed: 40, status: 'OK' }, + // Failed, RetryScheduled, RetryScheduled, Success, Success + { Success: 40, RetryScheduled: 40, Failed: 20, status: 'OK' }, + ]); + resolve(); + } catch (e) { + reject(e); + } + }); + + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.RetryScheduled)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.RetryScheduled)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, 
TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + }); + }); + + test('frequency of task run results return an error health status when failure is above a certain threshold', async () => { + const events$ = new Subject(); + + const taskPollingLifecycle = taskPollingLifecycleMock.create({ + events$: events$ as Observable, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskPollingLifecycle, + runningAverageWindowSize + ); + + return new Promise((resolve, reject) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + skip(1), + // Use 'summarizeTaskRunStat' to receive summarized stats + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat( + value, + getTaskManagerConfig({ + monitored_task_execution_thresholds: { + custom: { + 'alerting:test': { + error_threshold: 59, + warn_threshold: 39, + }, + }, + }, + }) + ).value, + })), + take(10), + bufferCount(10) + ) + .subscribe((taskStats: Array>) => { + try { + /** + * At any given time we only keep track of the last X task run results + * In the tests this is configured to a window size of 5 + */ + expect( + taskStats.map( + (taskStat) => + taskStat.value.execution.result_frequency_percent_as_number['alerting:test'] + ) + ).toEqual([ + // Success + { Success: 100, RetryScheduled: 0, Failed: 0, status: 'OK' }, + // Success, Success, + { Success: 100, RetryScheduled: 0, Failed: 0, status: 'OK' }, + // Success, Success, Success + { Success: 100, RetryScheduled: 0, Failed: 0, status: 'OK' }, + // Success, Success, Success, Failed + { Success: 75, RetryScheduled: 0, Failed: 25, status: 'OK' }, + // Success, Success, Success, Failed, Failed + { Success: 60, RetryScheduled: 0, Failed: 40, status: 'warn' }, + // Success, Success, Failed, Failed, Failed + { Success: 40, RetryScheduled: 0, Failed: 60, status: 
'error' }, + // Success, Failed, Failed, Failed, RetryScheduled + { Success: 20, RetryScheduled: 20, Failed: 60, status: 'error' }, + // Failed, Failed, Failed, RetryScheduled, RetryScheduled + { Success: 0, RetryScheduled: 40, Failed: 60, status: 'error' }, + // Failed, Failed, RetryScheduled, RetryScheduled, Success + { Success: 20, RetryScheduled: 40, Failed: 40, status: 'warn' }, + // Failed, RetryScheduled, RetryScheduled, Success, Success + { Success: 40, RetryScheduled: 40, Failed: 20, status: 'OK' }, + ]); + resolve(); + } catch (e) { + reject(e); + } + }); + + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Failed)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.RetryScheduled)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.RetryScheduled)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + events$.next(mockTaskRunEvent({}, { start: 0, stop: 0 }, TaskRunResult.Success)); + }); + }); + + test('returns polling stats', async () => { + const expectedTimestamp: string[] = []; + const events$ = new Subject(); + const taskPollingLifecycle = taskPollingLifecycleMock.create({ + events$: events$ as Observable, + }); + + const runningAverageWindowSize = 5; + const taskRunAggregator = createTaskRunAggregator( + taskPollingLifecycle, + runningAverageWindowSize + ); + + return new Promise((resolve, reject) => { + taskRunAggregator + .pipe( + // skip initial stat which is just initialized data which + // ensures we don't stall on combineLatest + 
skip(1), + // Use 'summarizeTaskRunStat' to receive summarized stats + map(({ key, value }: AggregatedStat) => ({ + key, + value: summarizeTaskRunStat(value, getTaskManagerConfig()).value, + })), + tap(() => { + expectedTimestamp.push(new Date().toISOString()); + // each event is a second after the previous one + fakeTimer.tick(1000); + }), + take(10), + bufferCount(10) + ) + .subscribe((taskStats: Array>) => { + try { + expect( + taskStats.map((taskStat) => taskStat.value.polling.last_successful_poll) + ).toEqual(expectedTimestamp); + + /** + * At any given time we only keep track of the last X Polling Results + * In the tests this is configured to a window size of 5 + */ + expect( + taskStats.map((taskStat) => taskStat.value.polling.result_frequency_percent_as_number) + ).toEqual([ + // NoTasksClaimed + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed + { NoTasksClaimed: 100, RanOutOfCapacity: 0, PoolFilled: 0 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled + { NoTasksClaimed: 75, RanOutOfCapacity: 0, PoolFilled: 25 }, + // NoTasksClaimed, NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled + { NoTasksClaimed: 60, RanOutOfCapacity: 0, PoolFilled: 40 }, + // NoTasksClaimed, NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled + { NoTasksClaimed: 40, RanOutOfCapacity: 0, PoolFilled: 60 }, + // NoTasksClaimed, PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity + { NoTasksClaimed: 20, RanOutOfCapacity: 20, PoolFilled: 60 }, + // PoolFilled, PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity + { NoTasksClaimed: 0, RanOutOfCapacity: 40, PoolFilled: 60 }, + // PoolFilled, PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed + { NoTasksClaimed: 20, RanOutOfCapacity: 40, PoolFilled: 40 }, + // PoolFilled, RanOutOfCapacity, RanOutOfCapacity, NoTasksClaimed, NoTasksClaimed 
+ { NoTasksClaimed: 40, RanOutOfCapacity: 40, PoolFilled: 20 }, + ]); + resolve(); + } catch (e) { + reject(e); + } + }); + + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.PoolFilled))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.RanOutOfCapacity))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + events$.next(asTaskPollingCycleEvent(asOk(FillPoolResult.NoTasksClaimed))); + }); + }); +}); +/** Returns the Date that lies `ms` milliseconds before the current time. */ +function runAtMillisecondsAgo(ms: number): Date { + return new Date(Date.now() - ms); +} +/** Builds a task-run event around a mock task: `overrides` are applied via mockTaskInstance, `timing` is attached, and `result` is reported (TaskRunResult.Success unless given). */ +const mockTaskRunEvent = ( + overrides: Partial = {}, + timing: TaskTiming, + result: TaskRunResult = TaskRunResult.Success +) => { + const task = mockTaskInstance(overrides); + return asTaskRunEvent(task.id, asOk({ task, result }), timing); +}; +/** Creates a ConcreteTaskInstance with a fresh uuid, Running status and the 'alerting:test' task type; `overrides` are spread last, so they replace any default field. 
*/ +const mockTaskInstance = (overrides: Partial = {}): ConcreteTaskInstance => ({ + id: uuid.v4(), + attempts: 0, + status: TaskStatus.Running, + version: '123', + runAt: new Date(), + scheduledAt: new Date(), + startedAt: new Date(), + retryAt: new Date(Date.now() + 5 * 60 * 1000), + state: {}, + taskType: 'alerting:test', + params: { + alertId: '1', + }, + ownerId: null, + ...overrides, +}); +/** Passes `overrides` through configSchema.validate and returns the result (presumably a config with schema defaults filled in - confirm against ../config). */ +const getTaskManagerConfig = (overrides: unknown = {}) => configSchema.validate(overrides); diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts new file mode 100644 index 000000000000..6dd533177a86 --- /dev/null +++ 
b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -0,0 +1,224 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { combineLatest, Observable } from 'rxjs'; +import { filter, startWith, map } from 'rxjs/operators'; +import { JsonObject } from 'src/plugins/kibana_utils/common'; +import { mapValues } from 'lodash'; +import { AggregatedStatProvider, AggregatedStat } from './runtime_statistics_aggregator'; +import { TaskLifecycleEvent } from '../polling_lifecycle'; +import { + isTaskRunEvent, + isTaskPollingCycleEvent, + TaskRun, + ErroredTask, + RanTask, + TaskTiming, +} from '../task_events'; +import { isOk, Ok, unwrap } from '../lib/result_type'; +import { ConcreteTaskInstance } from '../task'; +import { TaskRunResult } from '../task_runner'; +import { FillPoolResult } from '../lib/fill_pool'; +import { + AveragedStat, + calculateRunningAverage, + calculateFrequency, + createRunningAveragedStat, + createMapOfRunningAveragedStats, +} from './task_run_calcultors'; +import { HealthStatus } from './monitoring_stats_stream'; +import { TaskPollingLifecycle } from '../polling_lifecycle'; +import { TaskExecutionFailureThreshold, TaskManagerConfig } from '../config'; +/** Raw polling stat: the time of the latest successful poll plus the running window of recent poll results. */ +interface FillPoolStat extends JsonObject { + last_successful_poll: string; + result_frequency_percent_as_number: FillPoolResult[]; +} +/** Raw execution stat, keyed by task type (see createTaskRunEventToStat): running windows of run durations and of run results. */ +interface ExecutionStat extends JsonObject { + duration: Record; + result_frequency_percent_as_number: Record; +} +/** Raw runtime stat emitted by createTaskRunAggregator; summarizeTaskRunStat condenses it into a SummarizedTaskRunStat. 
*/ +export interface TaskRunStat extends JsonObject { + drift: number[]; + execution: ExecutionStat; + polling: FillPoolStat | Omit; +} +/** Summarized polling stat: one frequency figure per FillPoolResult (percentages in the accompanying tests). */ +interface FillPoolRawStat extends JsonObject { + last_successful_poll: string; + result_frequency_percent_as_number: { + [FillPoolResult.NoTasksClaimed]: number; + [FillPoolResult.RanOutOfCapacity]: 
number; + [FillPoolResult.PoolFilled]: number; + }; +} +/** One frequency figure per TaskRunResult. */ +interface ResultFrequency extends JsonObject { + [TaskRunResult.Success]: number; + [TaskRunResult.SuccessRescheduled]: number; + [TaskRunResult.RetryScheduled]: number; + [TaskRunResult.Failed]: number; +} +/** ResultFrequency plus the health status derived from the configured failure thresholds. */ +type ResultFrequencySummary = ResultFrequency & { + status: HealthStatus; +}; +/** Shape of the `value` returned by summarizeTaskRunStat. */ +export interface SummarizedTaskRunStat extends JsonObject { + drift: AveragedStat; + execution: { + duration: Record; + result_frequency_percent_as_number: Record; + }; + polling: FillPoolRawStat | Omit; +} +/** + * Builds the 'runtime' stat stream from the events of the polling lifecycle. + * + * Task run events that carry timing feed the drift, duration and result windows; + * successful polling cycle events feed the polling window. Both streams are seeded + * with empty values via startWith so that combineLatest can emit before both kinds + * of event have been seen. + * + * @param taskPollingLifecycle source of TaskLifecycleEvents (its `events` observable is read) + * @param runningAverageWindowSize window size handed to every running-window helper + * @returns a stream of AggregatedStat values whose key is 'runtime' + */ +export function createTaskRunAggregator( + taskPollingLifecycle: TaskPollingLifecycle, + runningAverageWindowSize: number +): AggregatedStatProvider { + const taskRunEventToStat = createTaskRunEventToStat(runningAverageWindowSize); + const taskRunEvents$: Observable> = taskPollingLifecycle.events.pipe( + filter((taskEvent: TaskLifecycleEvent) => isTaskRunEvent(taskEvent) && hasTiming(taskEvent)), + map((taskEvent: TaskLifecycleEvent) => { + const { task, result }: RanTask | ErroredTask = unwrap((taskEvent as TaskRun).event); + return taskRunEventToStat(task, taskEvent.timing!, result); + }) + ); + + const 
resultFrequencyQueue = createRunningAveragedStat(runningAverageWindowSize); + const taskPollingEvents$: Observable> = taskPollingLifecycle.events.pipe( + filter( + (taskEvent: TaskLifecycleEvent) => + isTaskPollingCycleEvent(taskEvent) && isOk(taskEvent.event) + ), + map((taskEvent: TaskLifecycleEvent) => { + return { + polling: { + last_successful_poll: new Date().toISOString(), + result_frequency_percent_as_number: resultFrequencyQueue( + ((taskEvent.event as unknown) as Ok).value + ), + }, + }; + }) + ); + + return combineLatest([ + taskRunEvents$.pipe( + startWith({ drift: [], execution: { duration: {}, result_frequency_percent_as_number: {} } }) + ), + taskPollingEvents$.pipe( + startWith({ + polling: { result_frequency_percent_as_number: [] }, + }) + ), + ]).pipe( + map(([taskRun, polling]: [Omit, Pick]) => { 
+ return { + key: 'runtime', + value: { + ...taskRun, + ...polling, + }, + } as AggregatedStat; + }) + ); +} +/** True when the event carries a truthy `timing` value. */ +function hasTiming(taskEvent: TaskLifecycleEvent) { + return !!taskEvent?.timing; +} +/** Creates the drift, duration and result windows once and returns a function that pushes one task run into them and yields the resulting raw stat. Drift is the run's start time minus the task's `runAt`; duration is stop minus start. */ +function createTaskRunEventToStat(runningAverageWindowSize: number) { + const driftQueue = createRunningAveragedStat(runningAverageWindowSize); + const taskRunDurationQueue = createMapOfRunningAveragedStats(runningAverageWindowSize); + const resultFrequencyQueue = createMapOfRunningAveragedStats( + runningAverageWindowSize + ); + return ( + task: ConcreteTaskInstance, + timing: TaskTiming, + result: TaskRunResult + ): Omit => ({ + drift: driftQueue(timing!.start - task.runAt.getTime()), + execution: { + duration: taskRunDurationQueue(task.taskType, timing!.stop - timing!.start), + result_frequency_percent_as_number: resultFrequencyQueue(task.taskType, result), + }, + }); +} +/** Zeroed defaults spread beneath the calculated frequencies, so results absent from the window still appear in the summary. */ +const DEFAULT_TASK_RUN_FREQUENCIES = { + [TaskRunResult.Success]: 0, + [TaskRunResult.SuccessRescheduled]: 0, + [TaskRunResult.RetryScheduled]: 0, + [TaskRunResult.Failed]: 0, +}; +const DEFAULT_POLLING_FREQUENCIES = { + [FillPoolResult.NoTasksClaimed]: 0, + [FillPoolResult.RanOutOfCapacity]: 0, + [FillPoolResult.PoolFilled]: 0, +}; +/** + * Condenses a raw TaskRunStat into averages and frequencies. + * + * The result frequencies of each task type receive a health status based on that + * type's entry in `config.monitored_task_execution_thresholds.custom`, falling back + * to the `default` thresholds when the type has no custom entry. 
The top-level + * `status` returned here is always HealthStatus.OK. + */ +export function summarizeTaskRunStat( + { + // eslint-disable-next-line @typescript-eslint/naming-convention + polling: { last_successful_poll, result_frequency_percent_as_number: pollingResultFrequency }, + drift, + execution: { duration, result_frequency_percent_as_number: executionResultFrequency }, + }: TaskRunStat, + config: TaskManagerConfig +): { value: SummarizedTaskRunStat; status: HealthStatus } { + return { + value: { + polling: { + ...(last_successful_poll ? 
{ last_successful_poll } : {}), + result_frequency_percent_as_number: { + ...DEFAULT_POLLING_FREQUENCIES, + ...calculateFrequency(pollingResultFrequency as FillPoolResult[]), + }, + }, + drift: calculateRunningAverage(drift), + execution: { + duration: mapValues(duration, (typedDurations) => calculateRunningAverage(typedDurations)), + result_frequency_percent_as_number: mapValues( + executionResultFrequency, + (typedResultFrequencies, taskType) => + summarizeTaskExecutionResultFrequencyStat( + { + ...DEFAULT_TASK_RUN_FREQUENCIES, + ...calculateFrequency(typedResultFrequencies), + }, + config.monitored_task_execution_thresholds.custom[taskType] ?? + config.monitored_task_execution_thresholds.default + ) + ), + }, + }, + status: HealthStatus.OK, + }; +} +/** Adds a health status to a frequency summary: Error when `Failed` exceeds error_threshold, Warning when it only exceeds warn_threshold, OK otherwise. The error check is nested inside the warn check, so Error is only reachable when `Failed` also exceeds warn_threshold. */ +function summarizeTaskExecutionResultFrequencyStat( + resultFrequencySummary: ResultFrequency, + executionErrorThreshold: TaskExecutionFailureThreshold +): ResultFrequencySummary { + return { + ...resultFrequencySummary, + status: + resultFrequencySummary.Failed > executionErrorThreshold.warn_threshold + ? resultFrequencySummary.Failed > executionErrorThreshold.error_threshold + ? HealthStatus.Error + : HealthStatus.Warning + : HealthStatus.OK, + }; +} diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts new file mode 100644 index 000000000000..d9af3307e75c --- /dev/null +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -0,0 +1,773 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import { first, take, bufferCount } from 'rxjs/operators'; +import { loggingSystemMock } from '../../../../../src/core/server/mocks'; +import { + WorkloadAggregation, + createWorkloadAggregator, + padBuckets, + estimateRecurringTaskScheduling, +} from './workload_statistics'; +import { ConcreteTaskInstance } from '../task'; +import { ESSearchResponse } from '../../../apm/typings/elasticsearch'; +import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations'; +import { times } from 'lodash'; +import { taskStoreMock } from '../task_store.mock'; + +type MockESResult = ESSearchResponse< + ConcreteTaskInstance, + { + body: WorkloadAggregation; + } +>; + +describe('Workload Statistics Aggregator', () => { + test('queries the Task Store at a fixed interval for the current workload', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate.mockResolvedValue({ + hits: { + hits: [], + max_score: 0, + total: { value: 0, relation: 'eq' }, + }, + took: 1, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 1, + failed: 0, + }, + aggregations: { + taskType: { + buckets: [], + }, + schedule: { + buckets: [], + }, + idleTasks: { + doc_count: 0, + overdue: { + doc_count: 0, + }, + scheduleDensity: { + buckets: [ + { + key: '2020-10-02T15:18:37.274Z-2020-10-02T15:19:36.274Z', + from: 1.601651917274e12, + from_as_string: '2020-10-02T15:18:37.274Z', + to: 1.601651976274e12, + to_as_string: '2020-10-02T15:19:36.274Z', + doc_count: 0, + histogram: { + buckets: [], + }, + }, + ], + }, + }, + }, + } as MockESResult); + + const workloadAggregator = createWorkloadAggregator( + taskStore, + 10, + 3000, + loggingSystemMock.create().get() + ); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe(() => { + expect(taskStore.aggregate).toHaveBeenCalledWith({ + aggs: { + taskType: { + terms: { field: 'task.taskType' }, + aggs: { + status: { + terms: { field: 'task.status' }, + }, + }, + }, + 
schedule: { + terms: { + field: 'task.schedule.interval', + }, + }, + idleTasks: { + filter: { + term: { 'task.status': 'idle' }, + }, + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [{ from: 'now', to: 'now+1m' }], + }, + aggs: { + histogram: { + date_histogram: { + field: 'task.runAt', + fixed_interval: '3s', + }, + aggs: { + interval: { + terms: { + field: 'task.schedule.interval', + }, + }, + }, + }, + }, + }, + overdue: { + filter: { + range: { + 'task.runAt': { lt: 'now' }, + }, + }, + }, + }, + }, + }, + }); + resolve(); + }); + }); + }); + + const mockAggregatedResult: () => MockESResult = () => + ({ + hits: { + hits: [], + max_score: 0, + total: { value: 4, relation: 'eq' }, + }, + took: 1, + timed_out: false, + _shards: { + total: 1, + successful: 1, + skipped: 1, + failed: 0, + }, + aggregations: { + schedule: { + buckets: [ + { + key: '3600s', + doc_count: 1, + }, + { + key: '60s', + doc_count: 1, + }, + { + key: '720m', + doc_count: 1, + }, + ], + }, + taskType: { + buckets: [ + { + key: 'actions_telemetry', + doc_count: 2, + status: { + buckets: [ + { + key: 'idle', + doc_count: 2, + }, + ], + }, + }, + { + key: 'alerting_telemetry', + doc_count: 1, + status: { + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + { + key: 'session_cleanup', + doc_count: 1, + status: { + buckets: [ + { + key: 'idle', + doc_count: 1, + }, + ], + }, + }, + ], + }, + idleTasks: { + doc_count: 13, + overdue: { + doc_count: 6, + }, + scheduleDensity: { + buckets: [ + mockHistogram(0, 7 * 3000 + 500, 60 * 1000, 3000, [2, 2, 5, 0, 0, 0, 0, 0, 0, 1]), + ], + }, + }, + }, + } as MockESResult); + + test('returns a summary of the workload by task type', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); + + const workloadAggregator = createWorkloadAggregator( + taskStore, + 10, + 3000, + loggingSystemMock.create().get() + ); + + return new Promise((resolve) => { + 
workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + count: 4, + task_types: { + actions_telemetry: { count: 2, status: { idle: 2 } }, + alerting_telemetry: { count: 1, status: { idle: 1 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, + }, + }); + resolve(); + }); + }); + }); + + test('returns a count of the overdue workload', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); + + const workloadAggregator = createWorkloadAggregator( + taskStore, + 10, + 3000, + loggingSystemMock.create().get() + ); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + overdue: 6, + }); + resolve(); + }); + }); + }); + + test('returns a histogram of the upcoming workload for the upcoming minute when refresh rate is high', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); + + const workloadAggregator = createWorkloadAggregator( + taskStore, + 10, + 3000, + loggingSystemMock.create().get() + ); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + expect(result.key).toEqual('workload'); + expect(result.value).toMatchObject({ + // we have intervals every 3s, so we aggregate buckets 3s apart + // in this mock, Elasticsearch found tasks scheduled in 21 (8th bucket), 24, 27 and 48s seconds from now + // 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57 + // [0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 ] + // Above you see each bucket and the number of scheduled tasks we expect to have in them + estimated_schedule_density: [0, 0, 0, 0, 0, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], + }); + resolve(); + }); + }); + }); + + test('returns a 
histogram of the upcoming workload for twice refresh rate when rate is low', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); + + const workloadAggregator = createWorkloadAggregator( + taskStore, + 60 * 1000, + 3000, + loggingSystemMock.create().get() + ); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe(() => { + expect(taskStore.aggregate.mock.calls[0][0]).toMatchObject({ + aggs: { + idleTasks: { + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [ + { + from: 'now', + to: 'now+2m', + }, + ], + }, + }, + }, + }, + }, + }); + resolve(); + }); + }); + }); + + test('returns a histogram of the upcoming workload maxed out at 50 buckets when rate is too low', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); + + const workloadAggregator = createWorkloadAggregator( + taskStore, + 15 * 60 * 1000, + 3000, + loggingSystemMock.create().get() + ); + + return new Promise((resolve) => { + workloadAggregator.pipe(first()).subscribe((result) => { + expect(taskStore.aggregate.mock.calls[0][0]).toMatchObject({ + aggs: { + idleTasks: { + aggs: { + scheduleDensity: { + range: { + field: 'task.runAt', + ranges: [ + { + from: 'now', + // 50 buckets of 3s = 50 * 3 = 150s + to: 'now+150s', + }, + ], + }, + }, + }, + }, + }, + }); + resolve(); + }); + }); + }); + + test('recovers from errors fetching the workload', async () => { + const taskStore = taskStoreMock.create({}); + taskStore.aggregate + .mockResolvedValueOnce( + setTaskTypeCount(mockAggregatedResult(), 'alerting_telemetry', { + idle: 2, + }) + ) + .mockRejectedValueOnce(new Error('Elasticsearch has gone poof')) + .mockResolvedValueOnce( + setTaskTypeCount(mockAggregatedResult(), 'alerting_telemetry', { + idle: 1, + failed: 1, + }) + ); + const logger = loggingSystemMock.create().get(); + const workloadAggregator = 
createWorkloadAggregator(taskStore, 10, 3000, logger); + + return new Promise((resolve, reject) => { + workloadAggregator.pipe(take(2), bufferCount(2)).subscribe((results) => { + expect(results[0].key).toEqual('workload'); + expect(results[0].value).toMatchObject({ + count: 5, + task_types: { + actions_telemetry: { count: 2, status: { idle: 2 } }, + alerting_telemetry: { count: 2, status: { idle: 2 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, + }, + }); + expect(results[1].key).toEqual('workload'); + expect(results[1].value).toMatchObject({ + count: 5, + task_types: { + actions_telemetry: { count: 2, status: { idle: 2 } }, + alerting_telemetry: { count: 2, status: { idle: 1, failed: 1 } }, + session_cleanup: { count: 1, status: { idle: 1 } }, + }, + }); + resolve(); + }, reject); + }); + }); +}); + +describe('estimateRecurringTaskScheduling', () => { + test('flattens out buckets with non recurring tasks', () => { + const now = Date.now(); + const schedule = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: index, + })); + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring task interval equals the interval', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[4].recurring = [[1, '3s']]; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([1, 1, 0, 0, 1, 1, 1, 1, 1, 1]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring task interval is larger than the interval', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = 
times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[4].recurring = [[1, '6s']]; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([1, 1, 0, 0, 1, 0, 1, 0, 1, 0]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring task interval doesnt divide by interval', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(10, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[4].recurring = [[1, '5s']]; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([1, 1, 0, 0, 1, 0, 1, 0, 1, 0]); + }); + + test('estimates the buckets that recurring tasks might repeat in when recurring tasks overlap', () => { + const now = Date.now(); + const schedule: Array<{ + key: number; + nonRecurring: number; + recurring?: Array<[number, string]>; + }> = times(20, (index) => ({ + key: index * 3000 + now, + nonRecurring: 0, + })); + + schedule[0].nonRecurring = 1; + schedule[1].nonRecurring = 1; + schedule[3].recurring = [[1, '3s']]; + schedule[4].recurring = [ + [2, '6s'], + [1, '8s'], + ]; + schedule[5].recurring = [[1, '5s']]; + schedule[6].nonRecurring = 3; + + expect(estimateRecurringTaskScheduling(schedule, 3000)).toEqual([ + 1, + 1, + 0, + 1, + 4, + 2, + 6, + 3, + 3, + 2, + 4, + 2, + 3, + 3, + 3, + 2, + 4, + 2, + 3, + 3, + ]); + }); +}); + +describe('padBuckets', () => { + test('returns zeroed out bucklets when there are no buckets in the histogram', async () => { + expect( + padBuckets(10, 3000, { + key: '2020-10-02T19:47:28.128Z-2020-10-02T19:48:28.128Z', + from: 1601668048128, + from_as_string: '2020-10-02T19:47:28.128Z', + to: 1601668108128, + to_as_string: '2020-10-02T19:48:28.128Z', + doc_count: 0, + histogram: { + buckets: [], + 
}, + }) + ).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + }); + + test('pads buckets with zeros to fill out the entire period of time after detected buckets', async () => { + expect( + padBuckets(10, 3000, { + key: '2020-10-02T19:47:28.128Z-2020-10-02T19:48:28.128Z', + from: 1601668046000, + from_as_string: '2020-10-02T19:47:26.000Z', + to: 1601668076000, + to_as_string: '2020-10-02T19:47:56.000Z', + doc_count: 3, + histogram: { + buckets: [ + { + key_as_string: '2020-10-02T19:47:27.000Z', + key: 1601668047000, + doc_count: 1, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, + }, + { + key_as_string: '2020-10-02T19:47:30.000Z', + key: 1601668050000, + doc_count: 1, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, + }, + { + key_as_string: '2020-10-02T19:47:33.000Z', + key: 1601668053000, + doc_count: 0, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, + }, + { + key_as_string: '2020-10-02T19:47:36.000Z', + key: 1601668056000, + doc_count: 0, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, + }, + { + key_as_string: '2020-10-02T19:47:39.000Z', + key: 1601668059000, + doc_count: 0, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, + }, + { + key_as_string: '2020-10-02T19:47:42.000Z', + key: 1601668062000, + doc_count: 1, + interval: { + sum_other_doc_count: 0, + buckets: [], + }, + }, + ], + }, + }) + ).toEqual([1, 1, 0, 0, 0, 1, 0, 0, 0, 0]); + }); + + test('pads buckets with zeros to fill out the entire period of time before detected buckets', async () => { + expect( + padBuckets(10, 3000, { + key: '2020-10-02T20:39:45.793Z-2020-10-02T20:40:14.793Z', + from: 1601671183000, + from_as_string: '2020-10-02T20:39:43.000Z', + to: 1601671213000, + to_as_string: '2020-10-02T20:40:13.000Z', + doc_count: 2, + histogram: { + buckets: [ + { + key_as_string: '2020-10-02T20:40:09.000Z', + key: 1601671209000, + doc_count: 1, + interval: { buckets: [] }, + }, + { + key_as_string: '2020-10-02T20:40:12.000Z', + key: 1601671212000, + 
doc_count: 1, + interval: { buckets: [] }, + }, + ], + }, + }) + ).toEqual([0, 0, 0, 0, 0, 0, 0, 0, 1, 1]); + }); + + test('pads buckets with zeros to fill out the entire period surounding the detected buckets', async () => { + expect( + padBuckets(20, 3000, { + key: '2020-10-02T20:39:45.793Z-2020-10-02T20:40:14.793Z', + from: 1601671185793, + from_as_string: '2020-10-02T20:39:45.793Z', + to: 1601671245793, + to_as_string: '2020-10-02T20:40:45.793Z', + doc_count: 2, + histogram: { + buckets: [ + { + key_as_string: '2020-10-02T20:40:09.000Z', + key: 1601671209000, + doc_count: 1, + interval: { buckets: [] }, + }, + { + key_as_string: '2020-10-02T20:40:12.000Z', + key: 1601671212000, + doc_count: 1, + interval: { buckets: [] }, + }, + ], + }, + }) + ).toEqual([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + }); +}); + +function setTaskTypeCount( + { aggregations }: MockESResult, + taskType: string, + status: Record +) { + const taskTypes = aggregations!.taskType as AggregationResultOf< + WorkloadAggregation['aggs']['taskType'], + {} + >; + const buckets = [ + ...taskTypes.buckets.filter(({ key }) => key !== taskType), + { + key: taskType, + doc_count: Object.values(status).reduce((sum, count) => sum + count, 0), + status: { + sum_other_doc_count: 0, + buckets: Object.entries(status).map(([key, count]) => ({ + key, + doc_count: count, + })), + }, + }, + ]; + return ({ + hits: { + total: { value: buckets.reduce((sum, bucket) => sum + bucket.doc_count, 0) }, + }, + aggregations: { + ...aggregations, + taskType: { + sum_other_doc_count: 0, + buckets, + }, + }, + } as {}) as MockESResult; +} + +/** * + * This creates a mock histogram as returned by Elasticsearch + * + * @param from lower bound of query + * @param findFrom the timestamp (key) of the first bucket returned + * @param to upper bound of query + * @param interval the duration that each bucket coresponds to + * @param foundBuckets the buckets identified by ES, any buckets missing before or after 
/**
 * This creates a mock histogram as returned by Elasticsearch.
 *
 * `from`, `findFrom` and `to` are offsets in milliseconds which are added to the
 * current time (`Date.now()`) to produce the absolute timestamps in the result.
 *
 * @param from lower bound of query
 * @param findFrom the timestamp (key) of the first bucket returned
 * @param to upper bound of query
 * @param interval the duration that each bucket corresponds to
 * @param foundBuckets the buckets identified by ES. An entry that is not a number
 *  produces no bucket at all: any buckets missing before or after which are still
 *  in the date range are assumed to have 0 results, as ES only returns 0 for
 *  buckets that sit in between buckets which do have results
 * @returns a single range bucket containing a `histogram` sub-aggregation
 */
function mockHistogram(
  from: number,
  findFrom: number,
  to: number,
  interval: number,
  foundBuckets: Array<number | undefined>
) {
  const now = Date.now();
  const fromDate = new Date(now + from);
  const toDate = new Date(now + to);
  return {
    key: `${fromDate.toISOString()}-${toDate.toISOString()}`,
    from: now + from,
    from_as_string: fromDate.toISOString(),
    to: now + to,
    to_as_string: toDate.toISOString(),
    doc_count: foundBuckets.reduce((sum: number, count) => sum + (count ?? 0), 0),
    histogram: {
      buckets: foundBuckets.reduce(
        (histogramBuckets, count, index) => {
          // the index still advances for skipped entries, so bucket keys keep
          // their position relative to `findFrom`
          if (typeof count === 'number') {
            const key = new Date(now + findFrom + index * interval);
            histogramBuckets.push({
              key_as_string: key.toISOString(),
              key: key.getTime(),
              doc_count: count,
              interval: { buckets: [] },
            });
          }
          return histogramBuckets;
        },
        [] as Array<{
          key_as_string: string;
          key: number;
          doc_count: number;
          interval: {
            buckets: Array<{
              key: string;
              doc_count: number;
            }>;
          };
        }>
      ),
    },
  };
}
import { timer } from 'rxjs';
import { mergeMap, map, catchError } from 'rxjs/operators';
import { Logger } from 'src/core/server';
import { JsonObject } from 'src/plugins/kibana_utils/common';
import { keyBy, mapValues } from 'lodash';
import { AggregatedStatProvider } from './runtime_statistics_aggregator';
import { parseIntervalAsSecond, asInterval, parseIntervalAsMillisecond } from '../lib/intervals';
import { AggregationResultOf } from '../../../apm/typings/elasticsearch/aggregations';
import { HealthStatus } from './monitoring_stats_stream';
import { TaskStore } from '../task_store';

// Number of tasks keyed by task status
interface StatusStat extends JsonObject {
  [status: string]: number;
}
// Per task type: the total number of tasks and their breakdown by status
interface TaskTypeStat extends JsonObject {
  [taskType: string]: {
    count: number;
    status: StatusStat;
  };
}

// The summarized workload, as published under the `workload` key
export interface WorkloadStat extends JsonObject {
  // total number of hits reported by the aggregation query
  count: number;
  task_types: TaskTypeStat;
  // [interval, number of tasks scheduled with that interval] pairs
  schedule: Array<[string, number]>;
  // number of idle tasks whose `runAt` is already in the past
  overdue: number;
  // estimated number of tasks due in each upcoming polling interval
  estimated_schedule_density: number[];
}

// Shape of the aggregation request used to compute the workload
export interface WorkloadAggregation {
  aggs: {
    // tasks by type, broken down by status
    taskType: {
      terms: { field: string };
      aggs: {
        status: {
          terms: { field: string };
        };
      };
    };
    // tasks by schedule interval
    schedule: {
      terms: { field: string };
    };
    // sub-aggregations which only consider tasks matching the status filter
    idleTasks: {
      filter: {
        term: { 'task.status': string };
      };
      aggs: {
        // a single date range, broken down into a date histogram, where each
        // histogram bucket is broken down by schedule interval
        scheduleDensity: {
          range: {
            field: string;
            ranges: [{ from: string; to: string }];
          };
          aggs: {
            histogram: {
              date_histogram: {
                field: string;
                fixed_interval: string;
              };
              aggs: {
                interval: {
                  terms: { field: string };
                };
              };
            };
          };
        };
        overdue: {
          filter: {
            range: {
              'task.runAt': { lt: string };
            };
          };
        };
      };
    };
  };
}

// The type of a bucket in the scheduleDensity range aggregation
type ScheduleDensityResult = AggregationResultOf<
  WorkloadAggregation['aggs']['idleTasks']['aggs']['scheduleDensity'],
  {}
>['buckets'][0];
// The type of a bucket in the date histogram nested under a scheduleDensity bucket
type ScheduledIntervals = ScheduleDensityResult['histogram']['buckets'][0];

// Set an upper bound just in case a customer sets a
/**
 * Upper bound on the number of schedule density buckets we compute, in case the
 * configured refresh rate is very high relative to the poll interval.
 */
const MAX_SCHEDULE_DENSITY_BUCKETS = 50;

/**
 * Creates an observable which, on every `refreshInterval`, aggregates the tasks
 * in the task store and emits a `workload` stat summarizing them.
 *
 * Failures (of the query, or of summarizing its result) are logged and the
 * aggregation is resumed on the next `refreshInterval`.
 *
 * @param taskStore the store the aggregation is executed against
 * @param refreshInterval how often (in milliseconds) the workload is refreshed
 * @param pollInterval the polling interval (in milliseconds), which is used as
 *  the size of each bucket in the estimated schedule density
 * @param logger used to report failures
 * @returns an observable of `{ key: 'workload', value: WorkloadStat }`
 */
export function createWorkloadAggregator(
  taskStore: TaskStore,
  refreshInterval: number,
  pollInterval: number,
  logger: Logger
): AggregatedStatProvider {
  // calculate scheduleDensity going two refreshIntervals or 1 minute into the future
  // (the longer of the two)
  const scheduleDensityBuckets = Math.min(
    Math.max(Math.round(60000 / pollInterval), Math.round((refreshInterval * 2) / pollInterval)),
    MAX_SCHEDULE_DENSITY_BUCKETS
  );

  return timer(0, refreshInterval).pipe(
    mergeMap(() =>
      taskStore.aggregate({
        aggs: {
          taskType: {
            terms: { field: 'task.taskType' },
            aggs: {
              status: {
                terms: { field: 'task.status' },
              },
            },
          },
          schedule: {
            terms: { field: 'task.schedule.interval' },
          },
          idleTasks: {
            filter: {
              term: { 'task.status': 'idle' },
            },
            aggs: {
              scheduleDensity: {
                // create a window of upcoming tasks
                range: {
                  field: 'task.runAt',
                  ranges: [
                    {
                      from: `now`,
                      to: `now+${asInterval(scheduleDensityBuckets * pollInterval)}`,
                    },
                  ],
                },
                aggs: {
                  // create histogram of scheduling in the window, with each bucket being a polling interval
                  histogram: {
                    date_histogram: {
                      field: 'task.runAt',
                      fixed_interval: asInterval(pollInterval),
                    },
                    // break down each bucket in the histogram by schedule
                    aggs: {
                      interval: {
                        terms: { field: 'task.schedule.interval' },
                      },
                    },
                  },
                },
              },
              overdue: {
                filter: {
                  range: {
                    'task.runAt': { lt: 'now' },
                  },
                },
              },
            },
          },
        },
      })
    ),
    map((result) => {
      const {
        aggregations,
        hits: {
          total: { value: count },
        },
      } = result;

      if (
        !(
          aggregations?.taskType &&
          aggregations?.schedule &&
          aggregations?.idleTasks?.overdue &&
          aggregations?.idleTasks?.scheduleDensity
        )
      ) {
        throw new Error(`Invalid workload: ${JSON.stringify(result)}`);
      }

      const taskTypes = aggregations.taskType.buckets;
      const schedules = aggregations.schedule.buckets;

      const {
        overdue: { doc_count: overdue },
        // `scheduleDensity` is left undefined if no range bucket was returned,
        // which `padBuckets` treats as an empty schedule
        scheduleDensity: { buckets: [scheduleDensity] = [] } = {},
      } = aggregations.idleTasks;

      const summary: WorkloadStat = {
        count,
        task_types: mapValues(keyBy(taskTypes, 'key'), ({ doc_count: docCount, status }) => {
          return {
            count: docCount,
            status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'),
          };
        }),
        // ordered from the shortest interval to the longest
        schedule: schedules
          .sort(
            (scheduleLeft, scheduleRight) =>
              parseIntervalAsSecond(scheduleLeft.key as string) -
              parseIntervalAsSecond(scheduleRight.key as string)
          )
          .map((schedule) => [schedule.key as string, schedule.doc_count]),
        overdue,
        estimated_schedule_density: padBuckets(
          scheduleDensityBuckets,
          pollInterval,
          scheduleDensity
        ),
      };
      return {
        key: 'workload',
        value: summary,
      };
    }),
    catchError((ex: Error, caught) => {
      logger.error(`[WorkloadAggregator]: ${ex}`);
      // continue to pull values from the same observable, but only after the next
      // refreshInterval has passed: resubscribing immediately would restart
      // `timer(0, ...)` and re-run the aggregation straight away, which turns a
      // persistent failure into a tight retry loop
      return timer(refreshInterval).pipe(mergeMap(() => caught));
    })
  );
}

/**
 * The number of tasks scheduled to run in a single bucket (identified by `key`),
 * split into tasks with no schedule (`nonRecurring`) and `[count, interval]`
 * pairs for tasks which recur on a schedule (`recurring`).
 */
interface IntervalTaskCountTouple {
  nonRecurring?: number;
  recurring?: Array<[number, string]>;
  key: number;
}

/**
 * Converts the `scheduleDensity` range bucket returned by Elasticsearch into a
 * list with the estimated number of tasks per polling interval.
 *
 * Elasticsearch only returns the histogram buckets between the first and last
 * bucket that contain results, so empty buckets are added before and after them
 * to cover the range from `from` to `to`.
 *
 * @param scheduleDensityBuckets the number of buckets to return (all zeros) when
 *  there is nothing scheduled within the range
 * @param pollInterval the duration (in milliseconds) of each bucket
 * @param scheduleDensity the range bucket returned by Elasticsearch, which might
 *  be missing when the aggregation returned no buckets
 * @returns the estimated number of tasks in each bucket
 */
export function padBuckets(
  scheduleDensityBuckets: number,
  pollInterval: number,
  scheduleDensity: ScheduleDensityResult
): number[] {
  // `scheduleDensity` itself is guarded as the caller hands us `undefined` when
  // the range aggregation has no buckets
  if (scheduleDensity?.from && scheduleDensity?.to && scheduleDensity?.histogram?.buckets?.length) {
    const { histogram, from, to } = scheduleDensity;
    const firstBucket = histogram.buckets[0].key;
    const lastBucket = histogram.buckets[histogram.buckets.length - 1].key;

    const bucketsToPadBeforeFirstBucket = calculateBucketsBetween(firstBucket, from, pollInterval);
    const bucketsToPadAfterLast = calculateBucketsBetween(
      lastBucket + pollInterval,
      to,
      pollInterval
    );

    return estimateRecurringTaskScheduling(
      [
        ...bucketsToPadBeforeFirstBucket,
        ...histogram.buckets.map(countByIntervalInBucket),
        ...bucketsToPadAfterLast,
      ],
      pollInterval
    );
  }
  return new Array(scheduleDensityBuckets).fill(0);
}

/**
 * Splits the tasks in a histogram bucket into recurring tasks (one entry per
 * schedule interval found in the bucket) and non recurring tasks (whatever is
 * left of the bucket's `doc_count` once the recurring tasks are subtracted).
 */
function countByIntervalInBucket(bucket: ScheduledIntervals): IntervalTaskCountTouple {
  if (bucket.doc_count === 0) {
    return { nonRecurring: 0, key: bucket.key };
  }
  const recurring: Array<[number, string]> = [];
  let nonRecurring = bucket.doc_count;
  for (const intervalBucket of bucket.interval.buckets) {
    recurring.push([intervalBucket.doc_count, intervalBucket.key as string]);
    nonRecurring -= intervalBucket.doc_count;
  }

  return { nonRecurring, recurring, key: bucket.key };
}

/**
 * Lists the buckets between `from` and `to`, one every `interval` (rounded up to
 * a multiple of `bucketInterval`).
 *
 * When `from` precedes `to` the keys start at `from` and `to` is a non-inclusive
 * upper bound. When `from` does not precede `to` the buckets are calculated
 * backwards in time and returned in ascending order of key.
 *
 * @throws Error if buckets have to be generated but the step between them is not
 *  positive, as the calculation could never complete
 */
function calculateBucketsBetween(
  from: number,
  to: number,
  interval: number,
  bucketInterval: number = interval
): Array<{ key: number }> {
  const calcForwardInTime = from < to;

  // as task interval might not divide by the pollInterval (aka the bucket interval)
  // we have to adjust for the "drift" that occurs when estimating when the next
  // bucket the task might actually get scheduled in
  const actualInterval = Math.ceil(interval / bucketInterval) * bucketInterval;

  const buckets: Array<{ key: number }> = [];
  // going backwards in time is calculated by walking forward over negated keys
  const toBound = calcForwardInTime ? to : -(to + actualInterval);
  let fromBound = calcForwardInTime ? from : -from;

  if (fromBound < toBound && actualInterval <= 0) {
    // a step which does not advance would keep adding buckets forever
    throw new Error(
      `Unable to calculate buckets with a non-positive interval: ${actualInterval}`
    );
  }

  while (fromBound < toBound) {
    buckets.push({ key: fromBound });
    fromBound += actualInterval;
  }

  return calcForwardInTime
    ? buckets
    : buckets.reverse().map((bucket) => {
        bucket.key = Math.abs(bucket.key);
        return bucket;
      });
}

/**
 * Estimates how many tasks will run in each bucket by projecting every recurring
 * task forward, adding its count to the bucket it was found in and to each later
 * bucket it is expected to recur in.
 *
 * Note that the `nonRecurring` counts of the entries in `scheduleDensity` are
 * updated in place while the estimation is calculated.
 *
 * @param scheduleDensity the tasks found in each bucket, ordered by key
 * @param pollInterval the duration (in milliseconds) of each bucket
 * @returns the estimated number of tasks in each bucket (empty if there are no
 *  buckets)
 */
export function estimateRecurringTaskScheduling(
  scheduleDensity: IntervalTaskCountTouple[],
  pollInterval: number
) {
  if (scheduleDensity.length === 0) {
    return [];
  }
  const lastKey = scheduleDensity[scheduleDensity.length - 1].key;

  return scheduleDensity.map((bucket, currentBucketIndex) => {
    for (const [count, interval] of bucket.recurring ?? []) {
      for (const recurrance of calculateBucketsBetween(
        bucket.key,
        // `calculateBucketsBetween` uses the `to` as a non-inclusive upper bound
        // but lastKey is a bucket we wish to include
        lastKey + pollInterval,
        parseIntervalAsMillisecond(interval),
        pollInterval
      )) {
        const recurranceBucketIndex =
          currentBucketIndex + Math.ceil((recurrance.key - bucket.key) / pollInterval);

        if (recurranceBucketIndex < scheduleDensity.length) {
          scheduleDensity[recurranceBucketIndex].nonRecurring =
            count + (scheduleDensity[recurranceBucketIndex].nonRecurring ?? 0);
        }
      }
    }
    return bucket.nonRecurring ?? 0;
  });
}

/**
 * Wraps the workload stats with a health status, which is always `OK`.
 */
export function summarizeWorkloadStat(
  workloadStats: WorkloadStat
): { value: WorkloadStat; status: HealthStatus } {
  return {
    value: workloadStats,
    status: HealthStatus.OK,
  };
}
TaskManagerPlugin(pluginInitializerContext); diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 0381698e6fb7..0e7abb817490 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -4,7 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ import { PluginInitializerContext, Plugin, CoreSetup, Logger, CoreStart } from 'src/core/server'; -import { first } from 'rxjs/operators'; +import { combineLatest, Subject } from 'rxjs'; +import { first, map } from 'rxjs/operators'; import { TaskDefinition } from './task'; import { TaskPollingLifecycle } from './polling_lifecycle'; import { TaskManagerConfig } from './config'; @@ -14,6 +15,8 @@ import { TaskTypeDictionary } from './task_type_dictionary'; import { FetchResult, SearchOpts, TaskStore } from './task_store'; import { createManagedConfiguration } from './lib/create_managed_configuration'; import { TaskScheduling } from './task_scheduling'; +import { healthRoute } from './routes'; +import { createMonitoringStats, MonitoringStats } from './monitoring'; export type TaskManagerSetupContract = { addMiddleware: (middleware: Middleware) => void } & Pick< TaskTypeDictionary, @@ -34,6 +37,7 @@ export class TaskManagerPlugin private logger: Logger; private definitions: TaskTypeDictionary; private middleware: Middleware = createInitialMiddleware(); + private monitoringStats$ = new Subject(); constructor(private readonly initContext: PluginInitializerContext) { this.initContext = initContext; @@ -41,13 +45,13 @@ export class TaskManagerPlugin this.definitions = new TaskTypeDictionary(this.logger); } - public async setup({ savedObjects }: CoreSetup): Promise { + public async setup(core: CoreSetup): Promise { this.config = await this.initContext.config .create() .pipe(first()) .toPromise(); - setupSavedObjects(savedObjects, this.config); + setupSavedObjects(core.savedObjects, this.config); 
this.taskManagerId = this.initContext.env.instanceUuid; if (!this.taskManagerId) { @@ -59,6 +63,26 @@ export class TaskManagerPlugin this.logger.info(`TaskManager is identified by the Kibana UUID: ${this.taskManagerId}`); } + // Routes + const router = core.http.createRouter(); + const serviceStatus$ = healthRoute( + router, + this.monitoringStats$, + this.logger, + this.taskManagerId, + this.config! + ); + + core.getStartServices().then(async () => { + core.status.set( + combineLatest([core.status.derivedStatus$, serviceStatus$]).pipe( + map(([derivedStatus, serviceStatus]) => + serviceStatus.level > derivedStatus.level ? serviceStatus : derivedStatus + ) + ) + ); + }); + return { addMiddleware: (middleware: Middleware) => { this.assertStillInSetup('add Middleware'); @@ -84,7 +108,7 @@ export class TaskManagerPlugin taskManagerId: `kibana:${this.taskManagerId!}`, }); - const { maxWorkersConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + const managedConfiguration = createManagedConfiguration({ logger: this.logger, errors$: taskStore.errors$, startingMaxWorkers: this.config!.max_workers, @@ -97,11 +121,18 @@ export class TaskManagerPlugin logger: this.logger, taskStore, middleware: this.middleware, - maxWorkersConfiguration$, - pollIntervalConfiguration$, + ...managedConfiguration, }); this.taskPollingLifecycle = taskPollingLifecycle; + createMonitoringStats( + taskPollingLifecycle, + taskStore, + this.config!, + managedConfiguration, + this.logger + ).subscribe((stat) => this.monitoringStats$.next(stat)); + const taskScheduling = new TaskScheduling({ logger: this.logger, taskStore, diff --git a/x-pack/plugins/task_manager/server/polling/task_poller.test.ts b/x-pack/plugins/task_manager/server/polling/task_poller.test.ts index 956c8b05f386..f5f1667312d7 100644 --- a/x-pack/plugins/task_manager/server/polling/task_poller.test.ts +++ b/x-pack/plugins/task_manager/server/polling/task_poller.test.ts @@ -9,7 +9,8 @@ import { Subject, of, 
BehaviorSubject } from 'rxjs'; import { Option, none, some } from 'fp-ts/lib/Option'; import { createTaskPoller, PollingError, PollingErrorType } from './task_poller'; import { fakeSchedulers } from 'rxjs-marbles/jest'; -import { sleep, resolvable, Resolvable, mockLogger } from '../test_utils'; +import { sleep, resolvable, Resolvable } from '../test_utils'; +import { loggingSystemMock } from '../../../../../src/core/server/mocks'; import { asOk, asErr } from '../lib/result_type'; describe('TaskPoller', () => { @@ -24,7 +25,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, getCapacity: () => 1, @@ -59,7 +60,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$, bufferCapacity, getCapacity: () => 1, @@ -101,7 +102,7 @@ describe('TaskPoller', () => { let hasCapacity = true; createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -160,7 +161,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -206,7 +207,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -251,7 +252,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => true); const pollRequests$ = new Subject>(); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), 
bufferCapacity, work, @@ -288,7 +289,7 @@ describe('TaskPoller', () => { const handler = jest.fn(); const pollRequests$ = new Subject>(); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work: async (...args) => { @@ -339,7 +340,7 @@ describe('TaskPoller', () => { type ResolvableTupple = [string, PromiseLike & Resolvable]; const pollRequests$ = new Subject>(); createTaskPoller<[string, Resolvable], string[]>({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work: async (...resolvables) => { @@ -399,7 +400,7 @@ describe('TaskPoller', () => { const handler = jest.fn(); const pollRequests$ = new Subject>(); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work: async (...args) => { @@ -440,7 +441,7 @@ describe('TaskPoller', () => { return callCount; }); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work, @@ -483,7 +484,7 @@ describe('TaskPoller', () => { const work = jest.fn(async () => {}); const pollRequests$ = new Subject>(); createTaskPoller({ - logger: mockLogger(), + logger: loggingSystemMock.create().get(), pollInterval$: of(pollInterval), bufferCapacity, work, diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts index 29c8e836303f..5f2e774177fd 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts @@ -28,6 +28,16 @@ describe('TaskPollingLifecycle', () => { poll_interval: 6000000, max_poll_inactivity_cycles: 10, request_capacity: 1000, + monitored_aggregated_stats_refresh_rate: 5000, + monitored_stats_required_freshness: 5000, + 
monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, }, taskStore: mockTaskStore, logger: taskManagerLogger, diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.ts index 8a506cca699d..ba19cb63fffa 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.ts @@ -9,6 +9,7 @@ import { performance } from 'perf_hooks'; import { pipe } from 'fp-ts/lib/pipeable'; import { Option, some, map as mapOptional } from 'fp-ts/lib/Option'; +import { tap } from 'rxjs/operators'; import { Logger } from '../../../../src/core/server'; import { Result, asErr, mapErr } from './lib/result_type'; @@ -21,6 +22,8 @@ import { TaskClaim, TaskRunRequest, asTaskRunRequestEvent, + TaskPollingCycle, + asTaskPollingCycleEvent, } from './task_events'; import { fillPool, FillPoolResult } from './lib/fill_pool'; import { Middleware } from './lib/middleware'; @@ -47,7 +50,12 @@ export type TaskPollingLifecycleOpts = { middleware: Middleware; } & ManagedConfiguration; -export type TaskLifecycleEvent = TaskMarkRunning | TaskRun | TaskClaim | TaskRunRequest; +export type TaskLifecycleEvent = + | TaskMarkRunning + | TaskRun + | TaskClaim + | TaskRunRequest + | TaskPollingCycle; /** * The public interface into the task manager system. 
@@ -181,17 +189,23 @@ export class TaskPollingLifecycle { */ public start() { if (!this.isStarted) { - this.pollingSubscription = this.poller$.subscribe( - mapErr((error: PollingError) => { - if (error.type === PollingErrorType.RequestCapacityReached) { - pipe( - error.data, - mapOptional((id) => this.emitEvent(asTaskRunRequestEvent(id, asErr(error)))) - ); - } - this.logger.error(error.message); - }) - ); + this.pollingSubscription = this.poller$ + .pipe( + tap( + mapErr((error: PollingError) => { + if (error.type === PollingErrorType.RequestCapacityReached) { + pipe( + error.data, + mapOptional((id) => this.emitEvent(asTaskRunRequestEvent(id, asErr(error)))) + ); + } + this.logger.error(error.message); + }) + ) + ) + .subscribe((event: Result>) => { + this.emitEvent(asTaskPollingCycleEvent(event)); + }); } } diff --git a/x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts b/x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts new file mode 100644 index 000000000000..c9f4de25afaf --- /dev/null +++ b/x-pack/plugins/task_manager/server/routes/_mock_handler_arguments.ts @@ -0,0 +1,33 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
import { RequestHandlerContext, KibanaRequest, KibanaResponseFactory } from 'kibana/server';
import { identity } from 'lodash';
import { httpServerMock } from '../../../../../src/core/server/mocks';

/**
 * Creates the `[context, request, response]` arguments expected by a route
 * handler, for use in tests.
 *
 * The first argument is ignored and the context is always an empty object.
 *
 * NOTE(review): the element type of `res` was lost from this listing; it is
 * presumably a narrower "method keys of KibanaResponseFactory" helper - confirm
 * against the original file. `keyof KibanaResponseFactory` accepts the same keys.
 *
 * @param req the value to use as the request
 * @param res the response factory methods to mock (see `mockResponseFactory`)
 */
export function mockHandlerArguments(
  {}: {},
  req: unknown,
  res?: Array<keyof KibanaResponseFactory>
): [RequestHandlerContext, KibanaRequest, KibanaResponseFactory] {
  return [
    ({} as unknown) as RequestHandlerContext,
    req as KibanaRequest,
    mockResponseFactory(res),
  ];
}

/**
 * Creates a mocked response factory in which each of the methods listed in
 * `resToMock` is replaced by a mock which returns its argument as is, so that a
 * test can assert on whatever the handler passed to it.
 *
 * Keys which do not exist on the factory are ignored.
 */
export const mockResponseFactory = (resToMock: Array<keyof KibanaResponseFactory> = []) => {
  const factory: jest.Mocked<KibanaResponseFactory> = httpServerMock.createResponseFactory();
  resToMock.forEach((key: string) => {
    if (key in factory) {
      Object.defineProperty(factory, key, {
        value: jest.fn(identity),
      });
    }
  });
  return (factory as unknown) as KibanaResponseFactory;
};
import { Observable, of, Subject } from 'rxjs';
import { take } from 'rxjs/operators';
import { merge } from 'lodash';
import uuid from 'uuid';
import { httpServiceMock } from 'src/core/server/mocks';
import { healthRoute } from './health';
import { mockHandlerArguments } from './_mock_handler_arguments';
import { sleep } from '../test_utils';
import { loggingSystemMock } from '../../../../../src/core/server/mocks';
import { Logger } from '../../../../../src/core/server';
import { MonitoringStats, summarizeMonitoringStats } from '../monitoring';
import { ServiceStatusLevels } from 'src/core/server';
import { configSchema, TaskManagerConfig } from '../config';

describe('healthRoute', () => {
  beforeEach(() => {
    jest.resetAllMocks();
  });

  it('registers the route', async () => {
    const router = httpServiceMock.createRouter();

    const logger = loggingSystemMock.create().get();
    healthRoute(router, of(), logger, uuid.v4(), getTaskManagerConfig());

    const [config] = router.get.mock.calls[0];

    expect(config.path).toMatchInlineSnapshot(`"/api/task_manager/_health"`);
  });

  it('logs the Task Manager stats at a fixed interval', async () => {
    const router = httpServiceMock.createRouter();
    const logger = loggingSystemMock.create().get();

    // the stats are created a few milliseconds apart so that their timestamps differ
    const mockStat = mockHealthStats();
    await sleep(10);
    const skippedMockStat = mockHealthStats();
    await sleep(10);
    const nextMockStat = mockHealthStats();

    const stats$ = new Subject<MonitoringStats>();

    const id = uuid.v4();
    healthRoute(
      router,
      stats$,
      logger,
      id,
      getTaskManagerConfig({
        monitored_stats_required_freshness: 1000,
        monitored_aggregated_stats_refresh_rate: 60000,
      })
    );

    // the second stat is emitted within the required freshness (1000ms) of the
    // first, while the third is emitted after it
    stats$.next(mockStat);
    await sleep(500);
    stats$.next(skippedMockStat);
    await sleep(600);
    stats$.next(nextMockStat);

    const firstDebug = JSON.parse(
      (logger as jest.Mocked<Logger>).debug.mock.calls[0][0].replace('Latest Monitored Stats: ', '')
    );
    expect(firstDebug).toMatchObject({
      id,
      timestamp: expect.any(String),
      status: expect.any(String),
      ...summarizeMonitoringStats(mockStat, getTaskManagerConfig({})),
    });

    const secondDebug = JSON.parse(
      (logger as jest.Mocked<Logger>).debug.mock.calls[1][0].replace('Latest Monitored Stats: ', '')
    );
    expect(secondDebug).not.toMatchObject({
      id,
      timestamp: expect.any(String),
      status: expect.any(String),
      ...summarizeMonitoringStats(skippedMockStat, getTaskManagerConfig({})),
    });
    expect(secondDebug).toMatchObject({
      id,
      timestamp: expect.any(String),
      status: expect.any(String),
      ...summarizeMonitoringStats(nextMockStat, getTaskManagerConfig({})),
    });

    expect(logger.debug).toHaveBeenCalledTimes(2);
  });

  it('returns a error status if the overall stats have not been updated within the required hot freshness', async () => {
    const router = httpServiceMock.createRouter();

    const stats$ = new Subject<MonitoringStats>();

    const serviceStatus$ = healthRoute(
      router,
      stats$,
      loggingSystemMock.create().get(),
      uuid.v4(),
      getTaskManagerConfig({
        monitored_stats_required_freshness: 1000,
        monitored_aggregated_stats_refresh_rate: 60000,
      })
    );

    // subscribe before any stats are emitted, so that the first status is captured
    const serviceStatus = getLatest(serviceStatus$);

    const [, handler] = router.get.mock.calls[0];

    const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']);

    await sleep(0);

    // last updated 1500ms ago, which is staler than the required 1000ms
    stats$.next(
      mockHealthStats({
        last_update: new Date(Date.now() - 1500).toISOString(),
      })
    );

    expect(await handler(context, req, res)).toMatchObject({
      body: {
        status: 'error',
        ...summarizeMonitoringStats(
          mockHealthStats({
            last_update: expect.any(String),
            stats: {
              configuration: {
                timestamp: expect.any(String),
              },
              workload: {
                timestamp: expect.any(String),
              },
              runtime: {
                timestamp: expect.any(String),
                value: {
                  polling: {
                    last_successful_poll: expect.any(String),
                  },
                },
              },
            },
          }),
          getTaskManagerConfig({})
        ),
      },
    });

    expect(await serviceStatus).toMatchObject({
      level: ServiceStatusLevels.unavailable,
      summary: 'Task Manager is unavailable',
      meta: {
        status: 'error',
        ...summarizeMonitoringStats(
          mockHealthStats({
            last_update: expect.any(String),
            stats: {
              configuration: {
                timestamp: expect.any(String),
              },
              workload: {
                timestamp: expect.any(String),
              },
              runtime: {
                timestamp: expect.any(String),
                value: {
                  polling: {
                    last_successful_poll: expect.any(String),
                  },
                },
              },
            },
          }),
          getTaskManagerConfig({})
        ),
      },
    });
  });

  it('returns a error status if the workload stats have not been updated within the required cold freshness', async () => {
    const router = httpServiceMock.createRouter();

    const stats$ = new Subject<MonitoringStats>();

    healthRoute(
      router,
      stats$,
      loggingSystemMock.create().get(),
      uuid.v4(),
      getTaskManagerConfig({
        monitored_stats_required_freshness: 5000,
        monitored_aggregated_stats_refresh_rate: 60000,
      })
    );

    await sleep(0);

    // the workload was last updated 120s ago, while it is refreshed every 60s
    const lastUpdateOfWorkload = new Date(Date.now() - 120000).toISOString();
    stats$.next(
      mockHealthStats({
        stats: {
          workload: {
            timestamp: lastUpdateOfWorkload,
          },
        },
      })
    );

    const [, handler] = router.get.mock.calls[0];

    const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']);

    await sleep(2000);

    expect(await handler(context, req, res)).toMatchObject({
      body: {
        status: 'error',
        ...summarizeMonitoringStats(
          mockHealthStats({
            last_update: expect.any(String),
            stats: {
              configuration: {
                timestamp: expect.any(String),
              },
              workload: {
                timestamp: expect.any(String),
              },
              runtime: {
                timestamp: expect.any(String),
                value: {
                  polling: {
                    last_successful_poll: expect.any(String),
                  },
                },
              },
            },
          }),
          getTaskManagerConfig()
        ),
      },
    });
  });

  it('returns a error status if the poller hasnt polled within the required hot freshness', async () => {
    const router = httpServiceMock.createRouter();

    const stats$ = new Subject<MonitoringStats>();
    healthRoute(
      router,
      stats$,
      loggingSystemMock.create().get(),
      uuid.v4(),
      getTaskManagerConfig({
        monitored_stats_required_freshness: 1000,
        monitored_aggregated_stats_refresh_rate: 60000,
      })
    );

    await sleep(0);

    // the last successful poll was 2000ms ago, which is staler than the required 1000ms
    // eslint-disable-next-line @typescript-eslint/naming-convention
    const last_successful_poll = new Date(Date.now() - 2000).toISOString();
    stats$.next(
      mockHealthStats({
        stats: {
          runtime: {
            value: {
              polling: {
                last_successful_poll,
              },
            },
          },
        },
      })
    );

    const [, handler] = router.get.mock.calls[0];

    const [context, req, res] = mockHandlerArguments({}, {}, ['ok', 'internalError']);

    expect(await handler(context, req, res)).toMatchObject({
      body: {
        status: 'error',
        ...summarizeMonitoringStats(
          mockHealthStats({
            last_update: expect.any(String),
            stats: {
              configuration: {
                timestamp: expect.any(String),
              },
              workload: {
                timestamp: expect.any(String),
              },
              runtime: {
                timestamp: expect.any(String),
                value: {
                  polling: {
                    last_successful_poll,
                  },
                },
              },
            },
          }),
          getTaskManagerConfig()
        ),
      },
    });
  });
});

/**
 * Creates mocked monitoring stats in which every timestamp is the current time.
 *
 * @param overrides values which are deep merged over the defaults
 */
function mockHealthStats(overrides = {}) {
  return (merge(
    {
      last_update: new Date().toISOString(),
      stats: {
        configuration: {
          timestamp: new Date().toISOString(),
          value: {
            value: {
              max_workers: 10,
              poll_interval: 6000000,
              max_poll_inactivity_cycles: 10,
              request_capacity: 1000,
              monitored_aggregated_stats_refresh_rate: 5000,
              monitored_stats_running_average_window: 50,
              monitored_task_execution_thresholds: {
                default: {
                  error_threshold: 90,
                  warn_threshold: 80,
                },
                custom: {},
              },
            },
          },
        },
        workload: {
          timestamp: new Date().toISOString(),
          value: {
            count: 4,
            taskTypes: {
              actions_telemetry: { count: 2, status: { idle: 2 } },
              alerting_telemetry: { count: 1, status: { idle: 1 } },
              session_cleanup: { count: 1, status: { idle: 1 } },
            },
            schedule: {},
            overdue: 0,
            estimatedScheduleDensity: [],
          },
        },
        runtime: {
          timestamp: new Date().toISOString(),
          value: {
            drift: [1000, 60000],
            execution: {
              duration: [],
              result_frequency_percent_as_number: [],
            },
            polling: {
              last_successful_poll: new Date().toISOString(),
              result_frequency_percent_as_number: [
                'NoTasksClaimed',
                'NoTasksClaimed',
                'NoTasksClaimed',
              ],
            },
          },
        },
      },
    },
    overrides
  ) as unknown) as MonitoringStats;
}

/**
 * Resolves with the next value emitted by the stream.
 */
async function getLatest<T>(stream$: Observable<T>) {
  return new Promise<T>((resolve) => stream$.pipe(take(1)).subscribe((stats) => resolve(stats)));
}

/**
 * Creates a validated Task Manager configuration with the given overrides.
 */
const getTaskManagerConfig = (overrides: Partial<TaskManagerConfig> = {}) =>
  configSchema.validate(
    overrides.monitored_stats_required_freshness
      ? {
          // use `monitored_stats_required_freshness` as poll interval otherwise we might
          // fail validation as it must be greater than the poll interval
          poll_interval: overrides.monitored_stats_required_freshness,
          ...overrides,
        }
      : overrides
  );
+ */ + +import { + IRouter, + RequestHandlerContext, + KibanaRequest, + IKibanaResponse, + KibanaResponseFactory, +} from 'kibana/server'; +import { Observable, Subject } from 'rxjs'; +import { tap, map } from 'rxjs/operators'; +import { throttleTime } from 'rxjs/operators'; +import { isString } from 'lodash'; +import { JsonValue } from 'src/plugins/kibana_utils/common'; +import { Logger, ServiceStatus, ServiceStatusLevels } from '../../../../../src/core/server'; +import { + MonitoringStats, + summarizeMonitoringStats, + HealthStatus, + RawMonitoringStats, +} from '../monitoring'; +import { TaskManagerConfig } from '../config'; + +type MonitoredHealth = RawMonitoringStats & { id: string; status: HealthStatus; timestamp: string }; + +const LEVEL_SUMMARY = { + [ServiceStatusLevels.available.toString()]: 'Task Manager is healthy', + [ServiceStatusLevels.degraded.toString()]: 'Task Manager is unhealthy', + [ServiceStatusLevels.unavailable.toString()]: 'Task Manager is unavailable', +}; + +export function healthRoute( + router: IRouter, + monitoringStats$: Observable, + logger: Logger, + taskManagerId: string, + config: TaskManagerConfig +): Observable { + // if "hot" health stats are any more stale than monitored_stats_required_freshness (pollInterval +1s buffer by default) + // consider the system unhealthy + const requiredHotStatsFreshness: number = config.monitored_stats_required_freshness; + + // if "cold" health stats are any more stale than the configured refresh (+ a buffer), consider the system unhealthy + const requiredColdStatsFreshness: number = config.monitored_aggregated_stats_refresh_rate * 1.5; + + function calculateStatus(monitoredStats: MonitoringStats): MonitoredHealth { + const now = Date.now(); + const timestamp = new Date(now).toISOString(); + const summarizedStats = summarizeMonitoringStats(monitoredStats, config); + + /** + * If the monitored stats aren't fresh, return a red status + */ + const healthStatus = + hasStatus(summarizedStats.stats, 
HealthStatus.Error) || + hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness) || + hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness) + ? HealthStatus.Error + : hasStatus(summarizedStats.stats, HealthStatus.Warning) + ? HealthStatus.Warning + : HealthStatus.OK; + return { id: taskManagerId, timestamp, status: healthStatus, ...summarizedStats }; + } + + const serviceStatus$: Subject = new Subject(); + + /* keep track of last health summary, as we'll return that to the next call to _health */ + let lastMonitoredStats: MonitoringStats | null = null; + + /* Log Task Manager stats as a Debug log line at a fixed interval */ + monitoringStats$ + .pipe( + throttleTime(requiredHotStatsFreshness), + tap((stats) => { + lastMonitoredStats = stats; + }), + // Only calculate the summarized stats (calculates all running averages and evaluates state) + // when needed by throttling down to the requiredHotStatsFreshness + map((stats) => withServiceStatus(calculateStatus(stats))) + ) + .subscribe(([monitoredHealth, serviceStatus]) => { + serviceStatus$.next(serviceStatus); + logger.debug(`Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`); + }); + + router.get( + { + path: '/api/task_manager/_health', + validate: false, + }, + async function ( + context: RequestHandlerContext, + req: KibanaRequest, + res: KibanaResponseFactory + ): Promise { + return res.ok({ + body: lastMonitoredStats + ? calculateStatus(lastMonitoredStats) + : { id: taskManagerId, timestamp: new Date().toISOString(), status: HealthStatus.Error }, + }); + } + ); + return serviceStatus$; +} + +export function withServiceStatus( + monitoredHealth: MonitoredHealth +): [MonitoredHealth, ServiceStatus] { + const level = + monitoredHealth.status === HealthStatus.OK + ? ServiceStatusLevels.available + : monitoredHealth.status === HealthStatus.Warning + ? 
ServiceStatusLevels.degraded + : ServiceStatusLevels.unavailable; + return [ + monitoredHealth, + { + level, + summary: LEVEL_SUMMARY[level.toString()], + meta: monitoredHealth, + }, + ]; +} + +/** + * If certain "hot" stats are not fresh, then the _health api should return a Red status + * @param monitoringStats The monitored stats + * @param now The time to compare against + * @param requiredFreshness How fresh should these stats be + */ +function hasExpiredHotTimestamps( + monitoringStats: RawMonitoringStats, + now: number, + requiredFreshness: number +): boolean { + return ( + now - + getOldestTimestamp( + monitoringStats.last_update, + monitoringStats.stats.runtime?.value.polling.last_successful_poll + ) > + requiredFreshness + ); +} + +function hasExpiredColdTimestamps( + monitoringStats: RawMonitoringStats, + now: number, + requiredFreshness: number +): boolean { + return now - getOldestTimestamp(monitoringStats.stats.workload?.timestamp) > requiredFreshness; +} + +function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean { + return Object.values(stats) + .map((stat) => stat?.status === status) + .includes(true); +} + +function getOldestTimestamp(...timestamps: Array): number { + const validTimestamps = timestamps + .map((timestamp) => (isString(timestamp) ? Date.parse(timestamp) : NaN)) + .filter((timestamp) => !isNaN(timestamp)); + return validTimestamps.length ? 
Math.min(...validTimestamps) : 0; +} diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.js b/x-pack/plugins/task_manager/server/routes/index.ts similarity index 54% rename from x-pack/test/plugin_api_integration/test_suites/task_manager/index.js rename to x-pack/plugins/task_manager/server/routes/index.ts index c3efe56c80e8..4fa1aa6cb7a9 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.js +++ b/x-pack/plugins/task_manager/server/routes/index.ts @@ -4,9 +4,4 @@ * you may not use this file except in compliance with the Elastic License. */ -export default function ({ loadTestFile }) { - describe('task_manager', function taskManagerSuite() { - this.tags('ciGroup4'); - loadTestFile(require.resolve('./task_manager_integration')); - }); -} +export { healthRoute } from './health'; diff --git a/x-pack/plugins/task_manager/server/task_events.ts b/x-pack/plugins/task_manager/server/task_events.ts index e1dd85f868cd..b011d435e28d 100644 --- a/x-pack/plugins/task_manager/server/task_events.ts +++ b/x-pack/plugins/task_manager/server/task_events.ts @@ -9,63 +9,109 @@ import { Option } from 'fp-ts/lib/Option'; import { ConcreteTaskInstance } from './task'; import { Result, Err } from './lib/result_type'; +import { FillPoolResult } from './lib/fill_pool'; +import { PollingError } from './polling'; +import { TaskRunResult } from './task_runner'; export enum TaskEventType { TASK_CLAIM = 'TASK_CLAIM', TASK_MARK_RUNNING = 'TASK_MARK_RUNNING', TASK_RUN = 'TASK_RUN', TASK_RUN_REQUEST = 'TASK_RUN_REQUEST', + TASK_POLLING_CYCLE = 'TASK_POLLING_CYCLE', +} + +export interface TaskTiming { + start: number; + stop: number; +} + +export function startTaskTimer(): () => TaskTiming { + const start = Date.now(); + return () => ({ start, stop: Date.now() }); } export interface TaskEvent { - id: string; + id?: string; + timing?: TaskTiming; type: TaskEventType; event: Result; } +export interface RanTask { + task: ConcreteTaskInstance; + 
result: TaskRunResult; +} +export type ErroredTask = RanTask & { + error: Error; +}; + export type TaskMarkRunning = TaskEvent; -export type TaskRun = TaskEvent; +export type TaskRun = TaskEvent; export type TaskClaim = TaskEvent>; export type TaskRunRequest = TaskEvent; +export type TaskPollingCycle = TaskEvent>; export function asTaskMarkRunningEvent( id: string, - event: Result + event: Result, + timing?: TaskTiming ): TaskMarkRunning { return { id, type: TaskEventType.TASK_MARK_RUNNING, event, + timing, }; } -export function asTaskRunEvent(id: string, event: Result): TaskRun { +export function asTaskRunEvent( + id: string, + event: Result, + timing?: TaskTiming +): TaskRun { return { id, type: TaskEventType.TASK_RUN, event, + timing, }; } export function asTaskClaimEvent( id: string, - event: Result> + event: Result>, + timing?: TaskTiming ): TaskClaim { return { id, type: TaskEventType.TASK_CLAIM, event, + timing, }; } export function asTaskRunRequestEvent( id: string, // we only emit a TaskRunRequest event when it fails - event: Err + event: Err, + timing?: TaskTiming ): TaskRunRequest { return { id, type: TaskEventType.TASK_RUN_REQUEST, event, + timing, + }; +} + +export function asTaskPollingCycleEvent( + event: Result>, + timing?: TaskTiming +): TaskPollingCycle { + return { + type: TaskEventType.TASK_POLLING_CYCLE, + event, + timing, }; } @@ -85,3 +131,8 @@ export function isTaskRunRequestEvent( ): taskEvent is TaskRunRequest { return taskEvent.type === TaskEventType.TASK_RUN_REQUEST; } +export function isTaskPollingCycleEvent( + taskEvent: TaskEvent +): taskEvent is TaskPollingCycle { + return taskEvent.type === TaskEventType.TASK_POLLING_CYCLE; +} diff --git a/x-pack/plugins/task_manager/server/task_pool.test.ts b/x-pack/plugins/task_manager/server/task_pool.test.ts index 12b731b2b78a..a174af71ef18 100644 --- a/x-pack/plugins/task_manager/server/task_pool.test.ts +++ b/x-pack/plugins/task_manager/server/task_pool.test.ts @@ -7,7 +7,9 @@ import sinon 
from 'sinon'; import { of, Subject } from 'rxjs'; import { TaskPool, TaskPoolRunResult } from './task_pool'; -import { mockLogger, resolvable, sleep } from './test_utils'; +import { resolvable, sleep } from './test_utils'; +import { loggingSystemMock } from '../../../../src/core/server/mocks'; +import { Logger } from '../../../../src/core/server'; import { asOk } from './lib/result_type'; import { SavedObjectsErrorHelpers } from '../../../../src/core/server'; import moment from 'moment'; @@ -16,7 +18,7 @@ describe('TaskPool', () => { test('occupiedWorkers are a sum of running tasks', async () => { const pool = new TaskPool({ maxWorkers$: of(200), - logger: mockLogger(), + logger: loggingSystemMock.create().get(), }); const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); @@ -28,7 +30,7 @@ describe('TaskPool', () => { test('availableWorkers are a function of total_capacity - occupiedWorkers', async () => { const pool = new TaskPool({ maxWorkers$: of(10), - logger: mockLogger(), + logger: loggingSystemMock.create().get(), }); const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); @@ -41,7 +43,7 @@ describe('TaskPool', () => { const maxWorkers$ = new Subject(); const pool = new TaskPool({ maxWorkers$, - logger: mockLogger(), + logger: loggingSystemMock.create().get(), }); expect(pool.availableWorkers).toEqual(0); @@ -52,7 +54,7 @@ describe('TaskPool', () => { test('does not run tasks that are beyond its available capacity', async () => { const pool = new TaskPool({ maxWorkers$: of(2), - logger: mockLogger(), + logger: loggingSystemMock.create().get(), }); const shouldRun = mockRun(); @@ -71,7 +73,7 @@ describe('TaskPool', () => { }); test('should log when marking a Task as running fails', async () => { - const logger = mockLogger(); + const logger = loggingSystemMock.create().get(); const pool = new TaskPool({ maxWorkers$: of(2), logger, @@ -84,7 +86,7 @@ describe('TaskPool', () => { const result = 
await pool.run([mockTask(), taskFailedToMarkAsRunning, mockTask()]); - expect(logger.error.mock.calls[0]).toMatchInlineSnapshot(` + expect((logger as jest.Mocked).error.mock.calls[0]).toMatchInlineSnapshot(` Array [ "Failed to mark Task TaskType \\"shooooo\\" as running: Mark Task as running has failed miserably", ] @@ -94,7 +96,7 @@ describe('TaskPool', () => { }); test('should log when running a Task fails', async () => { - const logger = mockLogger(); + const logger = loggingSystemMock.create().get(); const pool = new TaskPool({ maxWorkers$: of(3), logger, @@ -107,7 +109,7 @@ describe('TaskPool', () => { const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); - expect(logger.warn.mock.calls[0]).toMatchInlineSnapshot(` + expect((logger as jest.Mocked).warn.mock.calls[0]).toMatchInlineSnapshot(` Array [ "Task TaskType \\"shooooo\\" failed in attempt to run: Run Task has failed miserably", ] @@ -117,7 +119,7 @@ describe('TaskPool', () => { }); test('should not log when running a Task fails due to the Task SO having been deleted while in flight', async () => { - const logger = mockLogger(); + const logger = loggingSystemMock.create().get(); const pool = new TaskPool({ maxWorkers$: of(3), logger, @@ -139,7 +141,7 @@ describe('TaskPool', () => { }); test('Running a task which fails still takes up capacity', async () => { - const logger = mockLogger(); + const logger = loggingSystemMock.create().get(); const pool = new TaskPool({ maxWorkers$: of(1), logger, @@ -159,7 +161,7 @@ describe('TaskPool', () => { test('clears up capacity when a task completes', async () => { const pool = new TaskPool({ maxWorkers$: of(1), - logger: mockLogger(), + logger: loggingSystemMock.create().get(), }); const firstWork = resolvable(); @@ -202,7 +204,7 @@ describe('TaskPool', () => { }); test('run cancels expired tasks prior to running new tasks', async () => { - const logger = mockLogger(); + const logger = loggingSystemMock.create().get(); const pool = new TaskPool({ 
maxWorkers$: of(2), logger, @@ -259,7 +261,7 @@ describe('TaskPool', () => { }); test('logs if cancellation errors', async () => { - const logger = mockLogger(); + const logger = loggingSystemMock.create().get(); const pool = new TaskPool({ logger, maxWorkers$: of(20), @@ -290,7 +292,7 @@ describe('TaskPool', () => { // Allow the task to cancel... await cancelled; - expect(logger.error.mock.calls[0][0]).toMatchInlineSnapshot( + expect((logger as jest.Mocked).error.mock.calls[0][0]).toMatchInlineSnapshot( `"Failed to cancel task \\"shooooo!\\": Error: Dern!"` ); }); diff --git a/x-pack/plugins/task_manager/server/task_runner.test.ts b/x-pack/plugins/task_manager/server/task_runner.test.ts index 8fb1df444c60..676eeedf08f5 100644 --- a/x-pack/plugins/task_manager/server/task_runner.test.ts +++ b/x-pack/plugins/task_manager/server/task_runner.test.ts @@ -6,16 +6,18 @@ import _ from 'lodash'; import sinon from 'sinon'; -import { minutesFromNow } from './lib/intervals'; +import { secondsFromNow } from './lib/intervals'; import { asOk, asErr } from './lib/result_type'; -import { TaskEvent, asTaskRunEvent, asTaskMarkRunningEvent } from './task_events'; +import { TaskManagerRunner, TaskRunResult } from './task_runner'; +import { TaskEvent, asTaskRunEvent, asTaskMarkRunningEvent, TaskRun } from './task_events'; import { ConcreteTaskInstance, TaskStatus, TaskDefinition, RunResult } from './task'; -import { TaskManagerRunner } from './task_runner'; import { SavedObjectsErrorHelpers } from '../../../../src/core/server'; import moment from 'moment'; import { TaskTypeDictionary } from './task_type_dictionary'; import { mockLogger } from './test_utils'; +const minutesFromNow = (mins: number): Date => secondsFromNow(mins * 60); + let fakeTimer: sinon.SinonFakeTimers; beforeAll(() => { @@ -812,7 +814,9 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asOk(instance))); + 
expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) + ); }); test('emits TaskEvent when a recurring task is run successfully', async () => { @@ -839,14 +843,16 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asOk(instance))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming(asTaskRunEvent(id, asOk({ task: instance, result: TaskRunResult.Success }))) + ); }); test('emits TaskEvent when a task run throws an error', async () => { const id = _.random(1, 20).toString(); const error = new Error('Dangit!'); const onTaskEvent = jest.fn(); - const { runner } = testOpts({ + const { runner, instance } = testOpts({ onTaskEvent, instance: { id, @@ -864,7 +870,11 @@ describe('TaskManagerRunner', () => { }); await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asErr(error))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent(id, asErr({ error, task: instance, result: TaskRunResult.RetryScheduled })) + ) + ); expect(onTaskEvent).toHaveBeenCalledTimes(1); }); @@ -872,7 +882,7 @@ describe('TaskManagerRunner', () => { const id = _.random(1, 20).toString(); const error = new Error('Dangit!'); const onTaskEvent = jest.fn(); - const { runner } = testOpts({ + const { runner, instance } = testOpts({ onTaskEvent, instance: { id, @@ -893,7 +903,11 @@ describe('TaskManagerRunner', () => { await runner.run(); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asErr(error))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent(id, asErr({ error, task: instance, result: TaskRunResult.RetryScheduled })) + ) + ); expect(onTaskEvent).toHaveBeenCalledTimes(1); }); @@ -901,7 +915,7 @@ describe('TaskManagerRunner', () => { const id = _.random(1, 20).toString(); const error = new Error('Dangit!'); const onTaskEvent = jest.fn(); - const { 
runner, store } = testOpts({ + const { runner, store, instance: originalInstance } = testOpts({ onTaskEvent, instance: { id, @@ -925,7 +939,18 @@ describe('TaskManagerRunner', () => { const instance = store.update.args[0][0]; expect(instance.status).toBe('failed'); - expect(onTaskEvent).toHaveBeenCalledWith(asTaskRunEvent(id, asErr(error))); + expect(onTaskEvent).toHaveBeenCalledWith( + withAnyTiming( + asTaskRunEvent( + id, + asErr({ + error, + task: originalInstance, + result: TaskRunResult.Failed, + }) + ) + ) + ); expect(onTaskEvent).toHaveBeenCalledTimes(1); }); }); @@ -936,6 +961,13 @@ describe('TaskManagerRunner', () => { onTaskEvent?: (event: TaskEvent) => void; } + function withAnyTiming(taskRun: TaskRun) { + return { + ...taskRun, + timing: { start: expect.any(Number), stop: expect.any(Number) }, + }; + } + function testOpts(opts: TestOpts) { const callCluster = sinon.stub(); const createTaskRunner = sinon.stub(); diff --git a/x-pack/plugins/task_manager/server/task_runner.ts b/x-pack/plugins/task_manager/server/task_runner.ts index 24a487e36602..45e4cb300057 100644 --- a/x-pack/plugins/task_manager/server/task_runner.ts +++ b/x-pack/plugins/task_manager/server/task_runner.ts @@ -10,16 +10,23 @@ * rescheduling, middleware application, etc. 
*/ +import { Logger } from 'src/core/server'; import apm from 'elastic-apm-node'; import { performance } from 'perf_hooks'; import Joi from 'joi'; import { identity, defaults, flow } from 'lodash'; -import { Logger } from '../../../../src/core/server'; -import { asOk, asErr, mapErr, eitherAsync, unwrap, mapOk, Result } from './lib/result_type'; -import { TaskRun, TaskMarkRunning, asTaskRunEvent, asTaskMarkRunningEvent } from './task_events'; -import { intervalFromDate, intervalFromNow } from './lib/intervals'; import { Middleware } from './lib/middleware'; +import { asOk, asErr, mapErr, eitherAsync, unwrap, isOk, mapOk, Result } from './lib/result_type'; +import { + TaskRun, + TaskMarkRunning, + asTaskRunEvent, + asTaskMarkRunningEvent, + startTaskTimer, + TaskTiming, +} from './task_events'; +import { intervalFromDate, intervalFromNow } from './lib/intervals'; import { CancelFunction, CancellableTask, @@ -63,6 +70,21 @@ type Opts = { onTaskEvent?: (event: TaskRun | TaskMarkRunning) => void; } & Pick; +export enum TaskRunResult { + // Task completed successfully + Success = 'Success', + // Recurring Task completed successfully + SuccessRescheduled = 'Success', + // // Task completed successfully after a retry + // SuccessfulRetry = 'SuccessfulRetry', + // // Recurring Task completed successfully after a retry + // SuccessfulRetryRescheduled = 'SuccessfulRetry', + // Task has failed and a retry has been scheduled + RetryScheduled = 'RetryScheduled', + // Task has failed + Failed = 'Failed', +} + /** * Runs a background task, ensures that errors are properly handled, * allows for cancellation. 
@@ -172,6 +194,7 @@ export class TaskManagerRunner implements TaskRunner { taskInstance: this.instance, }); + const stopTaskTimer = startTaskTimer(); const apmTrans = apm.startTransaction( `taskManager run ${this.instance.taskType}`, 'taskManager' @@ -181,13 +204,16 @@ export class TaskManagerRunner implements TaskRunner { const result = await this.task.run(); const validatedResult = this.validateResult(result); if (apmTrans) apmTrans.end('success'); - return this.processResult(validatedResult); + return this.processResult(validatedResult, stopTaskTimer()); } catch (err) { this.logger.error(`Task ${this} failed: ${err}`); // in error scenario, we can not get the RunResult // re-use modifiedContext's state, which is correct as of beforeRun if (apmTrans) apmTrans.end('error'); - return this.processResult(asErr({ error: err, state: modifiedContext.taskInstance.state })); + return this.processResult( + asErr({ error: err, state: modifiedContext.taskInstance.state }), + stopTaskTimer() + ); } } @@ -337,8 +363,9 @@ export class TaskManagerRunner implements TaskRunner { private async processResultForRecurringTask( result: Result - ): Promise { - const fieldUpdates = flow( + ): Promise { + const hasTaskRunFailed = isOk(result); + const fieldUpdates: Partial & Pick = flow( // if running the task has failed ,try to correct by scheduling a retry in the near future mapErr(this.rescheduleFailedRun), // if retrying is possible (new runAt) or this is an recurring task - reschedule @@ -357,7 +384,7 @@ export class TaskManagerRunner implements TaskRunner { await this.bufferedTaskStore.update( defaults( { - ...(fieldUpdates as Partial), + ...fieldUpdates, // reset fields that track the lifecycle of the concluded `task run` startedAt: null, retryAt: null, @@ -366,9 +393,15 @@ export class TaskManagerRunner implements TaskRunner { this.instance ) ); + + return fieldUpdates.status === TaskStatus.Failed + ? TaskRunResult.Failed + : hasTaskRunFailed + ? 
TaskRunResult.SuccessRescheduled + : TaskRunResult.RetryScheduled; } - private async processResultWhenDone(): Promise { + private async processResultWhenDone(): Promise { // not a recurring task: clean up by removing the task instance from store try { await this.bufferedTaskStore.remove(this.instance.id); @@ -379,24 +412,38 @@ export class TaskManagerRunner implements TaskRunner { throw err; } } + return TaskRunResult.Success; } private async processResult( - result: Result + result: Result, + taskTiming: TaskTiming ): Promise> { + const task = this.instance; await eitherAsync( result, async ({ runAt }: SuccessfulRunResult) => { - if (runAt || this.instance.schedule) { - await this.processResultForRecurringTask(result); - } else { - await this.processResultWhenDone(); - } - this.onTaskEvent(asTaskRunEvent(this.id, asOk(this.instance))); + this.onTaskEvent( + asTaskRunEvent( + this.id, + asOk({ + task, + result: await (runAt || task.schedule + ? this.processResultForRecurringTask(result) + : this.processResultWhenDone()), + }), + taskTiming + ) + ); }, async ({ error }: FailedRunResult) => { - await this.processResultForRecurringTask(result); - this.onTaskEvent(asTaskRunEvent(this.id, asErr(error))); + this.onTaskEvent( + asTaskRunEvent( + this.id, + asErr({ task, result: await this.processResultForRecurringTask(result), error }), + taskTiming + ) + ); } ); return result; diff --git a/x-pack/plugins/task_manager/server/task_scheduling.test.ts b/x-pack/plugins/task_manager/server/task_scheduling.test.ts index 1f7f9250d901..8d660f57ab87 100644 --- a/x-pack/plugins/task_manager/server/task_scheduling.test.ts +++ b/x-pack/plugins/task_manager/server/task_scheduling.test.ts @@ -21,6 +21,7 @@ import { asErr, asOk } from './lib/result_type'; import { ConcreteTaskInstance, TaskLifecycleResult, TaskStatus } from './task'; import { createInitialMiddleware } from './lib/middleware'; import { taskStoreMock } from './task_store.mock'; +import { TaskRunResult } from 
'./task_runner'; import { mockLogger } from './test_utils'; describe('TaskScheduling', () => { @@ -113,7 +114,7 @@ describe('TaskScheduling', () => { const result = taskScheduling.runNow(id); const task = { id } as ConcreteTaskInstance; - events$.next(asTaskRunEvent(id, asOk(task))); + events$.next(asTaskRunEvent(id, asOk({ task, result: TaskRunResult.Success }))); return expect(result).resolves.toEqual({ id }); }); @@ -132,7 +133,16 @@ describe('TaskScheduling', () => { const task = { id } as ConcreteTaskInstance; events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskMarkRunningEvent(id, asOk(task))); - events$.next(asTaskRunEvent(id, asErr(new Error('some thing gone wrong')))); + events$.next( + asTaskRunEvent( + id, + asErr({ + task, + error: new Error('some thing gone wrong'), + result: TaskRunResult.Failed, + }) + ) + ); return expect(result).rejects.toMatchInlineSnapshot( `[Error: Failed to run task "01ddff11-e88a-4d13-bc4e-256164e755e2": Error: some thing gone wrong]` @@ -306,10 +316,20 @@ describe('TaskScheduling', () => { const otherTask = { id: differentTask } as ConcreteTaskInstance; events$.next(asTaskClaimEvent(id, asOk(task))); events$.next(asTaskClaimEvent(differentTask, asOk(otherTask))); + events$.next( + asTaskRunEvent(differentTask, asOk({ task: otherTask, result: TaskRunResult.Success })) + ); - events$.next(asTaskRunEvent(differentTask, asOk(task))); - - events$.next(asTaskRunEvent(id, asErr(new Error('some thing gone wrong')))); + events$.next( + asTaskRunEvent( + id, + asErr({ + task, + error: new Error('some thing gone wrong'), + result: TaskRunResult.Failed, + }) + ) + ); return expect(result).rejects.toMatchInlineSnapshot( `[Error: Failed to run task "01ddff11-e88a-4d13-bc4e-256164e755e2": Error: some thing gone wrong]` diff --git a/x-pack/plugins/task_manager/server/task_scheduling.ts b/x-pack/plugins/task_manager/server/task_scheduling.ts index 00f7d853d711..9806ada386e4 100644 --- 
a/x-pack/plugins/task_manager/server/task_scheduling.ts +++ b/x-pack/plugins/task_manager/server/task_scheduling.ts @@ -10,7 +10,13 @@ import { Option, map as mapOptional, getOrElse } from 'fp-ts/lib/Option'; import { Logger } from '../../../../src/core/server'; import { asOk, either, map, mapErr, promiseResult } from './lib/result_type'; -import { isTaskRunEvent, isTaskClaimEvent, isTaskRunRequestEvent } from './task_events'; +import { + isTaskRunEvent, + isTaskClaimEvent, + isTaskRunRequestEvent, + RanTask, + ErroredTask, +} from './task_events'; import { Middleware } from './lib/middleware'; import { ConcreteTaskInstance, @@ -23,6 +29,7 @@ import { import { TaskStore } from './task_store'; import { ensureDeprecatedFieldsAreCorrected } from './lib/correct_deprecated_fields'; import { TaskLifecycleEvent, TaskPollingLifecycle } from './polling_lifecycle'; +import { FillPoolResult } from './lib/fill_pool'; const VERSION_CONFLICT_STATUS = 409; @@ -118,16 +125,19 @@ export class TaskScheduling { return reject(await this.identifyTaskFailureReason(taskId, error)); }, taskEvent.event); } else { - either>( + either< + RanTask | ConcreteTaskInstance | FillPoolResult, + Error | ErroredTask | Option + >( taskEvent.event, - (taskInstance: ConcreteTaskInstance) => { + (taskInstance: RanTask | ConcreteTaskInstance | FillPoolResult) => { // resolve if the task has run sucessfully if (isTaskRunEvent(taskEvent)) { subscription.unsubscribe(); - resolve({ id: taskInstance.id }); + resolve({ id: (taskInstance as RanTask).task.id }); } }, - async (error: Error | Option) => { + async (errorResult: Error | ErroredTask | Option) => { // reject if any error event takes place for the requested task subscription.unsubscribe(); return reject( @@ -135,7 +145,9 @@ export class TaskScheduling { `Failed to run task "${taskId}": ${ isTaskRunRequestEvent(taskEvent) ? `Task Manager is at capacity, please try again later` - : error + : isTaskRunEvent(taskEvent) + ? 
`${(errorResult as ErroredTask).error}` + : `${errorResult}` }` ) ); diff --git a/x-pack/plugins/task_manager/server/task_store.mock.ts b/x-pack/plugins/task_manager/server/task_store.mock.ts index 9b82a3e3ee7a..7af1b9ef5f11 100644 --- a/x-pack/plugins/task_manager/server/task_store.mock.ts +++ b/x-pack/plugins/task_manager/server/task_store.mock.ts @@ -31,6 +31,7 @@ export const taskStoreMock = { get: jest.fn(), getLifecycle: jest.fn(), fetch: jest.fn(), + aggregate: jest.fn(), maxAttempts, index, taskManagerId, diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index 1d49fbc2ecc7..8c0d7764e009 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -56,6 +56,8 @@ import { } from './queries/mark_available_tasks_as_claimed'; import { TaskTypeDictionary } from './task_type_dictionary'; +import { ESSearchResponse, ESSearchBody } from '../../apm/typings/elasticsearch'; + export interface StoreOpts { esClient: ElasticsearchClient; index: string; @@ -74,6 +76,9 @@ export interface SearchOpts { search_after?: unknown[]; } +export type AggregationOpts = Pick, 'aggs'> & + Pick; + export interface UpdateByQuerySearchOpts extends SearchOpts { script?: object; } @@ -494,6 +499,25 @@ export class TaskStore { } } + public async aggregate({ + aggs, + query, + size = 0, + }: TSearchRequest): Promise> { + const { body } = await this.esClient.search< + ESSearchResponse + >({ + index: this.index, + ignore_unavailable: true, + body: ensureAggregationOnlyReturnsTaskObjects({ + query, + aggs, + size, + }), + }); + return body; + } + private async updateByQuery( opts: UpdateByQuerySearchOpts = {}, // eslint-disable-next-line @typescript-eslint/naming-convention @@ -579,6 +603,22 @@ function ensureQueryOnlyReturnsTaskObjects(opts: SearchOpts): SearchOpts { }; } +function ensureAggregationOnlyReturnsTaskObjects(opts: AggregationOpts): AggregationOpts { + const 
originalQuery = opts.query; + const filterToOnlyTasks = { + bool: { + filter: [{ term: { type: 'task' } }], + }, + }; + const query = originalQuery + ? { bool: { must: [filterToOnlyTasks, originalQuery] } } + : filterToOnlyTasks; + return { + ...opts, + query, + }; +} + function isSavedObjectsUpdateResponse( result: SavedObjectsUpdateResponse | Error ): result is SavedObjectsUpdateResponse { diff --git a/x-pack/test/plugin_api_integration/config.ts b/x-pack/test/plugin_api_integration/config.ts index b89ed6ad550a..30a361ea2a37 100644 --- a/x-pack/test/plugin_api_integration/config.ts +++ b/x-pack/test/plugin_api_integration/config.ts @@ -43,6 +43,7 @@ export default async function ({ readConfigFile }: FtrConfigProviderContext) { '--xpack.eventLog.enabled=true', '--xpack.eventLog.logEntries=true', '--xpack.eventLog.indexEntries=true', + '--xpack.task_manager.monitored_aggregated_stats_refresh_rate=5000', ...plugins.map( (pluginDir) => `--plugin-path=${path.resolve(__dirname, 'plugins', pluginDir)}` ), diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts new file mode 100644 index 000000000000..9b02b5857367 --- /dev/null +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -0,0 +1,203 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +import expect from '@kbn/expect'; +import url from 'url'; +import { keyBy, mapValues } from 'lodash'; +import supertestAsPromised from 'supertest-as-promised'; +import { FtrProviderContext } from '../../ftr_provider_context'; +import { ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; + +interface MonitoringStats { + last_update: string; + status: string; + stats: { + configuration: { + timestamp: string; + value: Record; + }; + workload: { + timestamp: string; + value: { + count: number; + task_types: Record; + schedule: Array<[string, number]>; + overdue: number; + estimated_schedule_density: number[]; + }; + }; + runtime: { + timestamp: string; + value: { + drift: Record; + execution: { + duration: Record>; + result_frequency_percent_as_number: Record>; + }; + polling: { + last_successful_poll: string; + result_frequency_percent_as_number: Record; + }; + }; + }; + }; +} + +export default function ({ getService }: FtrProviderContext) { + const config = getService('config'); + const retry = getService('retry'); + const supertest = supertestAsPromised(url.format(config.get('servers.kibana'))); + + function getHealthRequest() { + return supertest.get('/api/task_manager/_health').set('kbn-xsrf', 'foo'); + } + + function getHealth(): Promise { + return getHealthRequest() + .expect(200) + .then((response) => response.body); + } + + function scheduleTask(task: Partial): Promise { + return supertest + .post('/api/sample_tasks/schedule') + .set('kbn-xsrf', 'xxx') + .send({ task }) + .expect(200) + .then((response: { body: ConcreteTaskInstance }) => response.body); + } + + const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); + + const monitoredAggregatedStatsRefreshRate = 5000; + + describe('health', () => { + it('should return basic configuration of task manager', async () => { + const health = await getHealth(); + expect(health.status).to.eql('OK'); + expect(health.stats.configuration.value).to.eql({ + 
poll_interval: 3000, + max_poll_inactivity_cycles: 10, + monitored_aggregated_stats_refresh_rate: monitoredAggregatedStatsRefreshRate, + monitored_stats_running_average_window: 50, + monitored_task_execution_thresholds: { + custom: {}, + default: { + error_threshold: 90, + warn_threshold: 80, + }, + }, + request_capacity: 1000, + max_workers: 10, + }); + }); + + it('should return the task manager workload', async () => { + const health = await getHealth(); + const { + status, + stats: { workload }, + } = health; + + expect(status).to.eql('OK'); + + const sumSampleTaskInWorkload = + (workload.value.task_types as { + sampleTask?: { count: number }; + }).sampleTask?.count ?? 0; + const scheduledWorkload = (mapValues( + keyBy(workload.value.schedule as Array<[string, number]>, ([interval, count]) => interval), + ([, count]) => count + ) as unknown) as { '37m': number | undefined; '37s': number | undefined }; + + await scheduleTask({ + taskType: 'sampleTask', + schedule: { interval: '37s' }, + }); + + await scheduleTask({ + taskType: 'sampleTask', + schedule: { interval: '37m' }, + }); + + await retry.try(async () => { + // workload is configured to refresh every 5s in FTs + await delay(monitoredAggregatedStatsRefreshRate); + + const workloadAfterScheduling = (await getHealth()).stats.workload.value; + + expect( + (workloadAfterScheduling.task_types as { sampleTask: { count: number } }).sampleTask.count + ).to.eql(sumSampleTaskInWorkload + 2); + + const schedulesWorkloadAfterScheduling = (mapValues( + keyBy( + workloadAfterScheduling.schedule as Array<[string, number]>, + ([interval]) => interval + ), + ([, count]) => count + ) as unknown) as { + '37m': number; + '37s': number; + }; + expect(schedulesWorkloadAfterScheduling['37s']).to.eql(1 + (scheduledWorkload['37s'] ?? 0)); + expect(schedulesWorkloadAfterScheduling['37m']).to.eql(1 + (scheduledWorkload['37m'] ?? 
0)); + }); + }); + + it('should return a breakdown of idleTasks in the task manager workload', async () => { + const { + workload: { value: workload }, + } = (await getHealth()).stats; + + expect(typeof workload.overdue).to.eql('number'); + + expect(Array.isArray(workload.estimated_schedule_density)).to.eql(true); + + // test run with the default poll_interval of 3s and a monitored_aggregated_stats_refresh_rate of 5s, + // so we expect the estimated_schedule_density to span a minute (which means 20 buckets, as 60s / 3s = 20) + expect(workload.estimated_schedule_density.length).to.eql(20); + }); + + it('should return the task manager runtime stats', async () => { + await scheduleTask({ + taskType: 'sampleTask', + schedule: { interval: '5s' }, + }); + + const { + runtime: { + value: { drift, polling, execution }, + }, + } = (await getHealth()).stats; + + expect(isNaN(Date.parse(polling.last_successful_poll as string))).to.eql(false); + expect(typeof polling.result_frequency_percent_as_number.NoTasksClaimed).to.eql('number'); + expect(typeof polling.result_frequency_percent_as_number.RanOutOfCapacity).to.eql('number'); + expect(typeof polling.result_frequency_percent_as_number.PoolFilled).to.eql('number'); + + expect(typeof drift.p50).to.eql('number'); + expect(typeof drift.p90).to.eql('number'); + expect(typeof drift.p95).to.eql('number'); + expect(typeof drift.p99).to.eql('number'); + + expect(typeof execution.duration.sampleTask.p50).to.eql('number'); + expect(typeof execution.duration.sampleTask.p90).to.eql('number'); + expect(typeof execution.duration.sampleTask.p95).to.eql('number'); + expect(typeof execution.duration.sampleTask.p99).to.eql('number'); + + expect(typeof execution.result_frequency_percent_as_number.sampleTask.Success).to.eql( + 'number' + ); + expect(typeof execution.result_frequency_percent_as_number.sampleTask.RetryScheduled).to.eql( + 'number' + ); + expect(typeof execution.result_frequency_percent_as_number.sampleTask.Failed).to.eql( + 
'number' + ); + }); + }); +} diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts new file mode 100644 index 000000000000..b542bff3a4aa --- /dev/null +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/index.ts @@ -0,0 +1,15 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +import { FtrProviderContext } from '../../ftr_provider_context'; + +export default function ({ loadTestFile }: FtrProviderContext) { + describe('task_manager', function taskManagerSuite() { + this.tags('ciGroup2'); + loadTestFile(require.resolve('./health_route')); + loadTestFile(require.resolve('./task_management')); + }); +} diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_manager_integration.js b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts similarity index 82% rename from x-pack/test/plugin_api_integration/test_suites/task_manager/task_manager_integration.js rename to x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts index 560e4dd87daa..1fd313c1ac43 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/task_manager_integration.js +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/task_management.ts @@ -8,19 +8,48 @@ import _ from 'lodash'; import expect from '@kbn/expect'; import url from 'url'; import supertestAsPromised from 'supertest-as-promised'; +import { FtrProviderContext } from '../../ftr_provider_context'; +import TaskManagerMapping from '../../../../plugins/task_manager/server/saved_objects/mappings.json'; +import { + DEFAULT_MAX_WORKERS, + DEFAULT_POLL_INTERVAL, +} from '../../../../plugins/task_manager/server/config'; +import { 
ConcreteTaskInstance } from '../../../../plugins/task_manager/server'; const { task: { properties: taskManagerIndexMapping }, -} = require('../../../../plugins/task_manager/server/saved_objects/mappings.json'); +} = TaskManagerMapping; -const { - DEFAULT_MAX_WORKERS, - DEFAULT_POLL_INTERVAL, -} = require('../../../../plugins/task_manager/server/config.ts'); +const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); +export interface RawDoc { + _id: string; + _source: any; + _type?: string; +} +export interface SearchResults { + hits: { + hits: RawDoc[]; + }; +} -export default function ({ getService }) { +type DeprecatedConcreteTaskInstance = Omit & { + interval: string; +}; + +type SerializedConcreteTaskInstance = Omit< + ConcreteTaskInstance, + 'state' | 'params' | 'scheduledAt' | 'startedAt' | 'retryAt' | 'runAt' +> & { + state: State; + params: Params; + scheduledAt: string; + startedAt: string | null; + retryAt: string | null; + runAt: string; +}; + +export default function ({ getService }: FtrProviderContext) { const es = getService('legacyEs'); const log = getService('log'); const retry = getService('retry'); @@ -53,14 +82,18 @@ export default function ({ getService }) { } }); - function currentTasks() { + function currentTasks(): Promise<{ + docs: Array>; + }> { return supertest .get('/api/sample_tasks') .expect(200) .then((response) => response.body); } - function currentTask(task) { + function currentTask( + task: string + ): Promise> { return supertest .get(`/api/sample_tasks/task/${task}`) .send({ task }) @@ -69,32 +102,30 @@ export default function ({ getService }) { } function ensureTasksIndexRefreshed() { - return supertest - .get(`/api/ensure_tasks_index_refreshed`) - .send({}) - .expect(200) - .then((response) => response.body); + return supertest.get(`/api/ensure_tasks_index_refreshed`).send({}).expect(200); } - function historyDocs(taskId) { + 
function historyDocs(taskId?: string): Promise { return es .search({ index: testHistoryIndex, q: taskId ? `taskId:${taskId}` : 'type:task', }) - .then((result) => result.hits.hits); + .then((result: SearchResults) => result.hits.hits); } - function scheduleTask(task) { + function scheduleTask( + task: Partial + ): Promise { return supertest .post('/api/sample_tasks/schedule') .set('kbn-xsrf', 'xxx') .send({ task }) .expect(200) - .then((response) => response.body); + .then((response: { body: SerializedConcreteTaskInstance }) => response.body); } - function runTaskNow(task) { + function runTaskNow(task: { id: string }) { return supertest .post('/api/sample_tasks/run_now') .set('kbn-xsrf', 'xxx') @@ -103,16 +134,16 @@ export default function ({ getService }) { .then((response) => response.body); } - function scheduleTaskIfNotExists(task) { + function scheduleTaskIfNotExists(task: Partial) { return supertest .post('/api/sample_tasks/ensure_scheduled') .set('kbn-xsrf', 'xxx') .send({ task }) .expect(200) - .then((response) => response.body); + .then((response: { body: ConcreteTaskInstance }) => response.body); } - function releaseTasksWaitingForEventToComplete(event) { + function releaseTasksWaitingForEventToComplete(event: string) { return supertest .post('/api/sample_tasks/event') .set('kbn-xsrf', 'xxx') @@ -120,11 +151,17 @@ export default function ({ getService }) { .expect(200); } - function getTaskById(tasks, id) { + function getTaskById( + tasks: Array>, + id: string + ) { return tasks.filter((task) => task.id === id)[0]; } - async function provideParamsToTasksWaitingForParams(taskId, data = {}) { + async function provideParamsToTasksWaitingForParams( + taskId: string, + data: Record = {} + ) { // wait for task to start running and stall on waitForParams await retry.try(async () => { const tasks = (await currentTasks()).docs; @@ -151,7 +188,7 @@ export default function ({ getService }) { await retry.try(async () => { expect((await 
historyDocs()).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; log.debug(`Task found: ${task.id}`); log.debug(`Task status: ${task.status}`); log.debug(`Task state: ${JSON.stringify(task.state, null, 2)}`); @@ -236,7 +273,7 @@ export default function ({ getService }) { await retry.try(async () => { expect((await historyDocs(originalTask.id)).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; expect(task.attempts).to.eql(0); expect(task.state.count).to.eql(count + 1); @@ -257,7 +294,7 @@ export default function ({ getService }) { await retry.try(async () => { expect((await historyDocs()).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; expect(task.attempts).to.eql(0); expect(task.state.count).to.eql(1); @@ -278,7 +315,7 @@ export default function ({ getService }) { await retry.try(async () => { expect((await historyDocs()).length).to.eql(1); - const [task] = (await currentTasks()).docs; + const [task] = (await currentTasks<{ count: number }>()).docs; expect(task.attempts).to.eql(0); expect(task.state.count).to.eql(1); @@ -299,7 +336,7 @@ export default function ({ getService }) { 1 ); - const [task] = (await currentTasks()).docs.filter( + const [task] = (await currentTasks<{ count: number }>()).docs.filter( (taskDoc) => taskDoc.id === originalTask.id ); @@ -322,7 +359,7 @@ export default function ({ getService }) { .length ).to.eql(2); - const [task] = (await currentTasks()).docs.filter( + const [task] = (await currentTasks<{ count: number }>()).docs.filter( (taskDoc) => taskDoc.id === originalTask.id ); expect(task.state.count).to.eql(2); @@ -343,7 +380,7 @@ export default function ({ getService }) { const docs = await historyDocs(originalTask.id); expect(docs.length).to.eql(1); - const task = await 
currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(1); @@ -393,16 +430,16 @@ export default function ({ getService }) { expect(await runNowResult).to.eql({ id: originalTask.id }); await retry.try(async () => { - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(2); }); // drain tasks, othrwise they'll keep Task Manager stalled await retry.try(async () => { await releaseTasksWaitingForEventToComplete('releaseTheOthers'); - const tasks = (await currentTasks()).docs.filter( - (task) => task.params.originalParams.waitForEvent === 'releaseTheOthers' - ); + const tasks = ( + await currentTasks<{}, { originalParams: { waitForEvent: string } }>() + ).docs.filter((task) => task.params.originalParams.waitForEvent === 'releaseTheOthers'); expect(tasks.length).to.eql(0); }); }); @@ -420,7 +457,7 @@ export default function ({ getService }) { 1 ); - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(1); expect(task.status).to.eql('idle'); @@ -437,7 +474,7 @@ export default function ({ getService }) { expect(successfulRunNowResult).to.eql({ id: originalTask.id }); await retry.try(async () => { - const task = await currentTask(originalTask.id); + const task = await currentTask<{ count: number }>(originalTask.id); expect(task.state.count).to.eql(2); expect(task.status).to.eql('idle'); }); @@ -515,7 +552,7 @@ export default function ({ getService }) { // finish first run by emitting 'runNowHasBeenAttempted' event await releaseTasksWaitingForEventToComplete('runNowHasBeenAttempted'); await retry.try(async () => { - const tasks = (await currentTasks()).docs; + const tasks = (await currentTasks<{ count: number }>()).docs; expect(getTaskById(tasks, longRunningTask.id).state.count).to.eql(1); const task = await 
currentTask(longRunningTask.id); @@ -564,12 +601,14 @@ export default function ({ getService }) { expect(await runNowResultWithExpectedFailure).to.eql({ id: taskThatFailsBeforeRunNow.id }); }); - async function expectReschedule(originalRunAt, currentTask, expectedDiff) { + async function expectReschedule( + originalRunAt: number, + task: SerializedConcreteTaskInstance, + expectedDiff: number + ) { const buffer = 10000; - expect(Date.parse(currentTask.runAt) - originalRunAt).to.be.greaterThan( - expectedDiff - buffer - ); - expect(Date.parse(currentTask.runAt) - originalRunAt).to.be.lessThan(expectedDiff + buffer); + expect(Date.parse(task.runAt) - originalRunAt).to.be.greaterThan(expectedDiff - buffer); + expect(Date.parse(task.runAt) - originalRunAt).to.be.lessThan(expectedDiff + buffer); } it('should run tasks in parallel, allowing for long running tasks along side faster tasks', async () => { @@ -594,14 +633,14 @@ export default function ({ getService }) { }); await retry.try(async () => { - const tasks = (await currentTasks()).docs; + const tasks = (await currentTasks<{ count: number }>()).docs; expect(getTaskById(tasks, fastTask.id).state.count).to.eql(2); }); await releaseTasksWaitingForEventToComplete('rescheduleHasHappened'); await retry.try(async () => { - const tasks = (await currentTasks()).docs; + const tasks = (await currentTasks<{ count: number }>()).docs; expect(getTaskById(tasks, fastTask.id).state.count).to.greaterThan(2); expect(getTaskById(tasks, longRunningTask.id).state.count).to.eql(1);