[Task Manager] Log at different levels based on the state #101751
Changes from 13 commits
@@ -18,6 +18,7 @@ export const DEFAULT_VERSION_CONFLICT_THRESHOLD = 80;
// Refresh aggregated monitored stats at a default rate of once a minute
export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50;
export const DEFAULT_MONITORING_STATS_WARN_DRIFT_IN_SECONDS = 60;

export const taskExecutionFailureThresholdSchema = schema.object(
  {

@@ -109,6 +110,9 @@ export const configSchema = schema.object(
        defaultValue: {},
      }),
    }),
    monitored_stats_warn_drift_in_seconds: schema.number({
Review thread on the new setting:

Do we want to doc this here? https://www.elastic.co/guide/en/kibana/current/task-manager-settings-kb.html I don't think we doc all the config options, but maybe this is one that we do? That also implies we need to add this to the cloud whitelist and to the docker whitelist. I'm also wondering if we want to add anything to the health monitoring doc - https://www.elastic.co/guide/en/kibana/current/task-manager-health-monitoring.html - specifically, whether users will be able to search the docs for the message this produces, to try to understand it. I feel like we need to define "drift", because we don't seem to.

I added it there and also renamed it to avoid needing to define a new term for users! I'm not sure how to add things to the docker/cloud whitelist - is that a file in the Kibana repo?

The docker whitelist is in this repo, and is listed in the PR template (at least when I create new PRs) - https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker. The cloud whitelist is in a private repo; I'll DM you the info.
      defaultValue: DEFAULT_MONITORING_STATS_WARN_DRIFT_IN_SECONDS,
    }),
  },
  {
    validate: (config) => {
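As a quick illustration of how the new option is consumed, the sketch below validates an override through the `configSchema` shown above, mirroring what the tests later in this PR do. It is only a usage sketch: the relative import path and the override value are assumptions for illustration, not part of the change.

// Usage sketch (not part of the PR): overriding the new setting via the config schema.
// The import path is assumed; the tests below call configSchema.validate the same way.
import { configSchema } from './config';

const config = configSchema.validate({
  monitored_stats_warn_drift_in_seconds: 120, // warn once p99 drift reaches two minutes
});

// When omitted, the value falls back to DEFAULT_MONITORING_STATS_WARN_DRIFT_IN_SECONDS (60).
console.log(config.monitored_stats_warn_drift_in_seconds); // 120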
New file (unit tests for logHealthMetrics):
@@ -0,0 +1,207 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */
import { merge } from 'lodash';
import { loggingSystemMock } from 'src/core/server/mocks';
import { configSchema, TaskManagerConfig } from '../config';
import { HealthStatus } from '../monitoring';
import { TaskPersistence } from '../monitoring/task_run_statistics';
import { MonitoredHealth } from '../routes/health';
import { logHealthMetrics } from './log_health_metrics';
import { Logger } from '../../../../../src/core/server';

describe('logHealthMetrics', () => {
  it('should log as debug if the status is OK', () => {
    const logger = loggingSystemMock.create().get();
    const config = getTaskManagerConfig({
      monitored_stats_warn_drift_in_seconds: 60,
    });
    const health = getMockMonitoredHealth();

    logHealthMetrics(health, logger, config);

    const firstDebug = JSON.parse(
      (logger as jest.Mocked<Logger>).debug.mock.calls[0][0].replace('Latest Monitored Stats: ', '')
    );
    expect(firstDebug).toMatchObject(health);
  });

  it('should log as warn if the status is Warn', () => {
    const logger = loggingSystemMock.create().get();
    const config = getTaskManagerConfig({
      monitored_stats_warn_drift_in_seconds: 60,
    });
    const health = getMockMonitoredHealth({
      status: HealthStatus.Warning,
    });

    logHealthMetrics(health, logger, config);

    const logMessage = JSON.parse(
      ((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).replace(
        'Latest Monitored Stats (warning status): ',
        ''
      )
    );
    expect(logMessage).toMatchObject(health);
  });

  it('should log as error if the status is Error', () => {
    const logger = loggingSystemMock.create().get();
    const config = getTaskManagerConfig({
      monitored_stats_warn_drift_in_seconds: 60,
    });
    const health = getMockMonitoredHealth({
      status: HealthStatus.Error,
    });

    logHealthMetrics(health, logger, config);

    const logMessage = JSON.parse(
      ((logger as jest.Mocked<Logger>).error.mock.calls[0][0] as string).replace(
        'Latest Monitored Stats (error status): ',
        ''
      )
    );
    expect(logMessage).toMatchObject(health);
  });

  it('should log as warn if the drift exceeds the threshold', () => {
    const logger = loggingSystemMock.create().get();
    const config = getTaskManagerConfig({
      monitored_stats_warn_drift_in_seconds: 60,
    });
    const health = getMockMonitoredHealth({
      stats: {
        runtime: {
          value: {
            drift: {
              p99: 60000,
            },
          },
        },
      },
    });

    logHealthMetrics(health, logger, config);

    const logMessage = JSON.parse(
      ((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).replace(
        `Latest Monitored Stats (Detected drift of 60s): `,
        ''
      )
    );
    expect(logMessage).toMatchObject(health);
  });
});

function getMockMonitoredHealth(overrides = {}): MonitoredHealth {
  const stub: MonitoredHealth = {
    id: '1',
    status: HealthStatus.OK,
    timestamp: new Date().toISOString(),
    last_update: new Date().toISOString(),
    stats: {
      configuration: {
        timestamp: new Date().toISOString(),
        status: HealthStatus.OK,
        value: {
          max_workers: 10,
          poll_interval: 3000,
          max_poll_inactivity_cycles: 10,
          request_capacity: 1000,
          monitored_aggregated_stats_refresh_rate: 5000,
          monitored_stats_running_average_window: 50,
          monitored_task_execution_thresholds: {
            default: {
              error_threshold: 90,
              warn_threshold: 80,
            },
            custom: {},
          },
        },
      },
      workload: {
        timestamp: new Date().toISOString(),
        status: HealthStatus.OK,
        value: {
          count: 4,
          task_types: {
            actions_telemetry: { count: 2, status: { idle: 2 } },
            alerting_telemetry: { count: 1, status: { idle: 1 } },
            session_cleanup: { count: 1, status: { idle: 1 } },
          },
          schedule: [],
          overdue: 0,
          overdue_non_recurring: 0,
          estimatedScheduleDensity: [],
          non_recurring: 20,
          owner_ids: 2,
          estimated_schedule_density: [],
          capacity_requirments: {
            per_minute: 150,
            per_hour: 360,
            per_day: 820,
          },
        },
      },
      runtime: {
        timestamp: new Date().toISOString(),
        status: HealthStatus.OK,
        value: {
          drift: {
            p50: 1000,
            p90: 2000,
            p95: 2500,
            p99: 3000,
          },
          drift_by_type: {},
          load: {
            p50: 1000,
            p90: 2000,
            p95: 2500,
            p99: 3000,
          },
          execution: {
            duration: {},
            duration_by_persistence: {},
            persistence: {
              [TaskPersistence.Recurring]: 10,
              [TaskPersistence.NonRecurring]: 10,
              [TaskPersistence.Ephemeral]: 10,
            },
            result_frequency_percent_as_number: {},
          },
          polling: {
            last_successful_poll: new Date().toISOString(),
            duration: [500, 400, 3000],
            claim_conflicts: [0, 100, 75],
            claim_mismatches: [0, 100, 75],
            result_frequency_percent_as_number: [
              'NoTasksClaimed',
              'NoTasksClaimed',
              'NoTasksClaimed',
            ],
          },
        },
      },
    },
  };
  return (merge(stub, overrides) as unknown) as MonitoredHealth;
}

function getTaskManagerConfig(overrides: Partial<TaskManagerConfig> = {}) {
  return configSchema.validate(
    overrides.monitored_stats_required_freshness
      ? {
          // use `monitored_stats_required_freshness` as poll interval otherwise we might
          // fail validation as it must be greater than the poll interval
          poll_interval: overrides.monitored_stats_required_freshness,
          ...overrides,
        }
      : overrides
  );
}
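One subtlety in `getTaskManagerConfig` above: as its inline comment notes, `monitored_stats_required_freshness` must be greater than the poll interval, which is why the helper reuses it as `poll_interval`. Below is a hedged illustration of that constraint, assuming the schema's `validate` callback rejects a freshness value below `poll_interval` exactly as the comment states; it is not a test from the PR.

// Hedged illustration of the constraint noted in getTaskManagerConfig above.
// Assumption: configSchema's validate() rejects a monitored_stats_required_freshness
// that is below poll_interval; the exact error message is not asserted here.
expect(() =>
  configSchema.validate({
    poll_interval: 5000,
    monitored_stats_required_freshness: 4000,
  })
).toThrow();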
New file (the logHealthMetrics helper):
@@ -0,0 +1,44 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

import { Logger } from '../../../../../src/core/server';
import { HealthStatus } from '../monitoring';
import { TaskManagerConfig } from '../config';
import { MonitoredHealth } from '../routes/health';

export function logHealthMetrics(
  monitoredHealth: MonitoredHealth,
  logger: Logger,
  config: TaskManagerConfig
) {
  let contextMessage;

  let logAsWarn = monitoredHealth.status === HealthStatus.Warning;
  const logAsError = monitoredHealth.status === HealthStatus.Error;
  const driftInSeconds = (monitoredHealth.stats.runtime?.value.drift.p99 ?? 0) / 1000;

  if (driftInSeconds >= config.monitored_stats_warn_drift_in_seconds) {
    contextMessage = `Detected drift of ${driftInSeconds}s`;
    logAsWarn = true;
  }

  if (logAsError) {
    logger.error(
      `Latest Monitored Stats (${contextMessage ?? `error status`}): ${JSON.stringify(
        monitoredHealth
      )}`
    );
  } else if (logAsWarn) {
    logger.warn(
      `Latest Monitored Stats (${contextMessage ?? `warning status`}): ${JSON.stringify(
        monitoredHealth
      )}`
    );
Review thread (inline, on the warn branch above):

When testing locally, I started a fresh Kibana and got this "warning status" log message right away. After investigating, I found out that capacity estimation caused the health status to be "warning" (see below). It sounds like a bug in the capacity estimation API, exposed more severely by this PR? The second log I saw was "error status" while all the sub-elements were "OK" 🤔 These don't seem related to this PR but they definitely expose a couple of potential bugs. cc @gmmorris (Collapsed in the original thread: "Health output on start-up" and "Second health log".)

@mikecote I'm not sure. Perhaps we don't consider capacity estimation as a factor for the logging here? I'm assuming capacity estimation is more of a planning reference rather than "I need to know these metrics when something isn't working right".

On a fresh ES/Kibana launch, I saw this on startup (log output collapsed in the original). After adding an alert with actions and getting everything firing, then killing Kibana and restarting, I saw the exact same message. Nothing like Mike's ^^^ though.

My guess here is the stats are stale, which triggers a warning/error state.

You're totally right, and as this PR states, it should only log when runtime, configuration, and workload are not "ok". But it seems to still log when everything is "ok" except capacity estimation. It may be because this code looks at the overall status, when it should look within each section.

I agree, that makes sense. I think, in a future PR, we should consider changing this logic and this logic so they are represented in the overall status. If I change this code to just look at the overall status for each bucket, then we'll need to move around the freshness checks to ensure those are properly accounted for when determining overall status.

@mikecote I just pushed up some changes around this. LMK.

Thanks, I ran it locally and the startup error logs no longer showed. 👍
  } else {
    logger.debug(`Latest Monitored Stats: ${JSON.stringify(monitoredHealth)}`);
  }
}
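The inline thread above suggests deriving the log level from the individual sections (configuration, workload, runtime) rather than from the overall status, so that capacity estimation alone does not escalate the log level. The sketch below only illustrates that idea, under the assumption that each section exposes a `status` field as in the mock earlier in this diff; it is not the code that was ultimately pushed.

// Hypothetical sketch: derive the log level from the individual stat sections,
// so capacity estimation alone does not escalate the log level.
// Assumes the HealthStatus and MonitoredHealth types imported in this file.
function highestSectionStatus(monitoredHealth: MonitoredHealth): HealthStatus {
  const statuses = [
    monitoredHealth.stats.configuration?.status,
    monitoredHealth.stats.workload?.status,
    monitoredHealth.stats.runtime?.status,
  ];
  if (statuses.includes(HealthStatus.Error)) {
    return HealthStatus.Error;
  }
  if (statuses.includes(HealthStatus.Warning)) {
    return HealthStatus.Warning;
  }
  return HealthStatus.OK;
}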
Review thread on the default value:

I don't have a great sense for a good value here - presumably 60 seconds means it missed 20 (potential) poll requests, which seems like plenty to have a warning on. At the same time, it feels kinda short, like this could happen every now and again. I feel like 60 seconds is workable if it's a warning and not an error: it isn't super-scary if you see it every now and again, and it's obvious that something's wrong if you see it a lot.

Neither do I. I feel we should attempt to be less conservative here, as the point of this change is to ensure these logs are available when there is an issue with TM/Alerting. As long as the messaging is clear on how to avoid the logs, we should be all set.
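For context on the "20 (potential) poll requests" figure above: it assumes the 3 second poll interval used as `poll_interval: 3000` in the mocked configuration earlier in this diff (treated here as an assumption about the default). A quick back-of-the-envelope check:

// Rough arithmetic behind the comment above (values are assumptions taken from this diff).
const assumedPollIntervalMs = 3000; // poll_interval from the mocked configuration
const warnDriftSeconds = 60; // DEFAULT_MONITORING_STATS_WARN_DRIFT_IN_SECONDS
const missedPollOpportunities = (warnDriftSeconds * 1000) / assumedPollIntervalMs;
console.log(missedPollOpportunities); // 20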