Revert "[Response Ops][Task Manager] Expose SLI metrics in HTTP API (#162178)" (#163639)

This reverts commit 582d97d.

## Summary

Summarize your PR. If it involves visual changes, include a screenshot or GIF.

### Checklist

Delete any items that are not applicable to this PR.

- [ ] Any text added follows [EUI's writing guidelines](https://elastic.github.io/eui/#/guidelines/writing), uses sentence case text and includes [i18n support](https://github.com/elastic/kibana/blob/main/packages/kbn-i18n/README.md)
- [ ] [Documentation](https://www.elastic.co/guide/en/kibana/master/development-documentation.html) was added for features that require explanation or tutorials
- [ ] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios
- [ ] Any UI touched in this PR is usable by keyboard only (learn more about [keyboard accessibility](https://webaim.org/techniques/keyboard/))
- [ ] Any UI touched in this PR does not create any new axe failures (run axe in browser: [FF](https://addons.mozilla.org/en-US/firefox/addon/axe-devtools/), [Chrome](https://chrome.google.com/webstore/detail/axe-web-accessibility-tes/lhdoppojpmngadmnindnejefpokejbdd?hl=en-US))
- [ ] If a plugin configuration key changed, check if it needs to be allowlisted in the cloud and added to the [docker list](https://github.com/elastic/kibana/blob/main/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker)
- [ ] This renders correctly on smaller devices using a responsive layout. (You can test this [in your browser](https://www.browserstack.com/guide/responsive-testing-on-local-server))
- [ ] This was checked for [cross-browser compatibility](https://www.elastic.co/support/matrix#matrix_browsers)

### Risk Matrix

Delete this section if it is not applicable to this PR.

Before closing this PR, invite QA, stakeholders, and other developers to identify risks that should be tested prior to the change/feature release.
When forming the risk matrix, consider some of the following examples and how they may potentially impact the change:

| Risk | Probability | Severity | Mitigation/Notes |
|---------------------------|-------------|----------|-------------------------|
| Multiple Spaces—unexpected behavior in non-default Kibana Space. | Low | High | Integration tests will verify that all features are still supported in non-default Kibana Space and when user switches between spaces. |
| Multiple nodes—Elasticsearch polling might have race conditions when multiple Kibana nodes are polling for the same tasks. | High | Low | Tasks are idempotent, so executing them multiple times will not result in logical error, but will degrade performance. To test for this case we add plenty of unit tests around this logic and document a manual testing procedure. |
| Code should gracefully handle cases when feature X or plugin Y are disabled. | Medium | High | Unit tests will verify that any feature flag or plugin combination still results in our service being operational. |
| [See more potential risk examples](https://github.com/elastic/kibana/blob/main/RISK_MATRIX.mdx) |

### For maintainers

- [ ] This was checked for breaking API changes and was [labeled appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)
elastic · Aug 10, 2023 · f61bb80 · f61bb80
1 parent 0bbc3e5
commit f61bb80
Show file tree

Hide file tree

Showing 39 changed files with 86 additions and 2,469 deletions.
diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts
@@ -23,7 +23,6 @@ describe('config validation', () => {
         },
         "max_attempts": 3,
         "max_workers": 10,
-        "metrics_reset_interval": 30000,
         "monitored_aggregated_stats_refresh_rate": 60000,
         "monitored_stats_health_verbose_log": Object {
           "enabled": false,
@@ -82,7 +81,6 @@ describe('config validation', () => {
         },
         "max_attempts": 3,
         "max_workers": 10,
-        "metrics_reset_interval": 30000,
         "monitored_aggregated_stats_refresh_rate": 60000,
         "monitored_stats_health_verbose_log": Object {
           "enabled": false,
@@ -139,7 +137,6 @@ describe('config validation', () => {
         },
         "max_attempts": 3,
         "max_workers": 10,
-        "metrics_reset_interval": 30000,
         "monitored_aggregated_stats_refresh_rate": 60000,
         "monitored_stats_health_verbose_log": Object {
           "enabled": false,

diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts
@@ -20,8 +20,6 @@ export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
 export const DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW = 50;
 export const DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS = 60;
 
-export const DEFAULT_METRICS_RESET_INTERVAL = 30 * 1000; // 30 seconds
-
 // At the default poll interval of 3sec, this averages over the last 15sec.
 export const DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW = 5;
 
@@ -54,63 +52,53 @@ const eventLoopDelaySchema = schema.object({
 });
 
 const requeueInvalidTasksConfig = schema.object({
-  delay: schema.number({ defaultValue: 3000, min: 0 }),
   enabled: schema.boolean({ defaultValue: false }),
+  delay: schema.number({ defaultValue: 3000, min: 0 }),
   max_attempts: schema.number({ defaultValue: 100, min: 1, max: 500 }),
 });
 
 export const configSchema = schema.object(
   {
-    allow_reading_invalid_state: schema.boolean({ defaultValue: true }),
-    ephemeral_tasks: schema.object({
-      enabled: schema.boolean({ defaultValue: false }),
-      /* How many requests can Task Manager buffer before it rejects new requests. */
-      request_capacity: schema.number({
-        // a nice round contrived number, feel free to change as we learn how it behaves
-        defaultValue: 10,
-        min: 1,
-        max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
-      }),
-    }),
-    event_loop_delay: eventLoopDelaySchema,
     /* The maximum number of times a task will be attempted before being abandoned as failed */
     max_attempts: schema.number({
       defaultValue: 3,
       min: 1,
     }),
+    /* How often, in milliseconds, the task manager will look for more work. */
+    poll_interval: schema.number({
+      defaultValue: DEFAULT_POLL_INTERVAL,
+      min: 100,
+    }),
+    /* How many requests can Task Manager buffer before it rejects new requests. */
+    request_capacity: schema.number({
+      // a nice round contrived number, feel free to change as we learn how it behaves
+      defaultValue: 1000,
+      min: 1,
+    }),
     /* The maximum number of tasks that this Kibana instance will run simultaneously. */
     max_workers: schema.number({
       defaultValue: DEFAULT_MAX_WORKERS,
       // disable the task manager rather than trying to specify it with 0 workers
       min: 1,
     }),
-    /* The interval at which monotonically increasing metrics counters will reset */
-    metrics_reset_interval: schema.number({
-      defaultValue: DEFAULT_METRICS_RESET_INTERVAL,
-      min: 10 * 1000, // minimum 10 seconds
-    }),
-    /* The rate at which we refresh monitored stats that require aggregation queries against ES. */
-    monitored_aggregated_stats_refresh_rate: schema.number({
-      defaultValue: DEFAULT_MONITORING_REFRESH_RATE,
-      /* don't run monitored stat aggregations any faster than once every 5 seconds */
-      min: 5000,
-    }),
-    monitored_stats_health_verbose_log: schema.object({
-      enabled: schema.boolean({ defaultValue: false }),
-      level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
-        defaultValue: 'debug',
-      }),
-      /* The amount of seconds we allow a task to delay before printing a warning server log */
-      warn_delayed_task_start_in_seconds: schema.number({
-        defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
-      }),
+    /* The threshold percenatge for workers experiencing version conflicts for shifting the polling interval. */
+    version_conflict_threshold: schema.number({
+      defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
+      min: 50,
+      max: 100,
     }),
     /* The rate at which we emit fresh monitored stats. By default we'll use the poll_interval (+ a slight buffer) */
     monitored_stats_required_freshness: schema.number({
       defaultValue: (config?: unknown) =>
         ((config as { poll_interval: number })?.poll_interval ?? DEFAULT_POLL_INTERVAL) + 1000,
       min: 100,
     }),
+    /* The rate at which we refresh monitored stats that require aggregation queries against ES. */
+    monitored_aggregated_stats_refresh_rate: schema.number({
+      defaultValue: DEFAULT_MONITORING_REFRESH_RATE,
+      /* don't run monitored stat aggregations any faster than once every 5 seconds */
+      min: 5000,
+    }),
     /* The size of the running average window for monitored stats. */
     monitored_stats_running_average_window: schema.number({
       defaultValue: DEFAULT_MONITORING_STATS_RUNNING_AVERAGE_WINDOW,
@@ -119,39 +107,44 @@ export const configSchema = schema.object(
     }),
     /* Task Execution result warn & error thresholds. */
     monitored_task_execution_thresholds: schema.object({
+      default: taskExecutionFailureThresholdSchema,
       custom: schema.recordOf(schema.string(), taskExecutionFailureThresholdSchema, {
         defaultValue: {},
       }),
-      default: taskExecutionFailureThresholdSchema,
-    }),
-    /* How often, in milliseconds, the task manager will look for more work. */
-    poll_interval: schema.number({
-      defaultValue: DEFAULT_POLL_INTERVAL,
-      min: 100,
-    }),
-    /* How many requests can Task Manager buffer before it rejects new requests. */
-    request_capacity: schema.number({
-      // a nice round contrived number, feel free to change as we learn how it behaves
-      defaultValue: 1000,
-      min: 1,
     }),
-    requeue_invalid_tasks: requeueInvalidTasksConfig,
-    /* These are not designed to be used by most users. Please use caution when changing these */
-    unsafe: schema.object({
-      authenticate_background_task_utilization: schema.boolean({ defaultValue: true }),
-      exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
+    monitored_stats_health_verbose_log: schema.object({
+      enabled: schema.boolean({ defaultValue: false }),
+      level: schema.oneOf([schema.literal('debug'), schema.literal('info')], {
+        defaultValue: 'debug',
+      }),
+      /* The amount of seconds we allow a task to delay before printing a warning server log */
+      warn_delayed_task_start_in_seconds: schema.number({
+        defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
+      }),
     }),
-    /* The threshold percenatge for workers experiencing version conflicts for shifting the polling interval. */
-    version_conflict_threshold: schema.number({
-      defaultValue: DEFAULT_VERSION_CONFLICT_THRESHOLD,
-      min: 50,
-      max: 100,
+    ephemeral_tasks: schema.object({
+      enabled: schema.boolean({ defaultValue: false }),
+      /* How many requests can Task Manager buffer before it rejects new requests. */
+      request_capacity: schema.number({
+        // a nice round contrived number, feel free to change as we learn how it behaves
+        defaultValue: 10,
+        min: 1,
+        max: DEFAULT_MAX_EPHEMERAL_REQUEST_CAPACITY,
+      }),
     }),
+    event_loop_delay: eventLoopDelaySchema,
     worker_utilization_running_average_window: schema.number({
       defaultValue: DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW,
       max: 100,
       min: 1,
     }),
+    /* These are not designed to be used by most users. Please use caution when changing these */
+    unsafe: schema.object({
+      exclude_task_types: schema.arrayOf(schema.string(), { defaultValue: [] }),
+      authenticate_background_task_utilization: schema.boolean({ defaultValue: true }),
+    }),
+    requeue_invalid_tasks: requeueInvalidTasksConfig,
+    allow_reading_invalid_state: schema.boolean({ defaultValue: true }),
   },
   {
     validate: (config) => {

diff --git a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts
@@ -84,7 +84,6 @@ describe('EphemeralTaskLifecycle', () => {
           delay: 3000,
           max_attempts: 20,
         },
-        metrics_reset_interval: 3000,
         ...config,
       },
       elasticsearchAndSOAvailability$,

diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts
@@ -79,7 +79,6 @@ describe('managed configuration', () => {
         delay: 3000,
         max_attempts: 20,
       },
-      metrics_reset_interval: 3000,
     });
     logger = context.logger.get('taskManager');