diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java index d95b02d36b7..b7386dbd49a 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsConfig.java @@ -54,6 +54,12 @@ public class VeniceMetricsConfig { */ public static final String OTEL_VENICE_METRICS_EXPORT_TO_ENDPOINT = "otel.venice.metrics.export.to.endpoint"; + /** + * Export interval in seconds for OpenTelemetry metrics + */ + public static final String OTEL_VENICE_METRICS_EXPORT_INTERVAL_IN_SECONDS = + "otel.venice.metrics.export.interval.in.seconds"; + /** * Config Map to add custom dimensions to the metrics: Can be used for system dimensions * amongst other custom dimensions
@@ -130,6 +136,7 @@ public class VeniceMetricsConfig { * 2. {@link VeniceOpenTelemetryMetricsRepository.LogBasedMetricExporter} for debug purposes */ private final boolean exportOtelMetricsToEndpoint; + private final int exportOtelMetricsIntervalInSeconds; private final boolean exportOtelMetricsToLog; /** Custom dimensions */ @@ -165,6 +172,7 @@ private VeniceMetricsConfig(Builder builder) { this.metricEntities = builder.metricEntities; this.emitOTelMetrics = builder.emitOtelMetrics; this.exportOtelMetricsToEndpoint = builder.exportOtelMetricsToEndpoint; + this.exportOtelMetricsIntervalInSeconds = builder.exportOtelMetricsIntervalInSeconds; this.otelCustomDimensionsMap = builder.otelCustomDimensionsMap; this.otelExportProtocol = builder.otelExportProtocol; this.otelEndpoint = builder.otelEndpoint; @@ -184,6 +192,7 @@ public static class Builder { private Collection metricEntities = new ArrayList<>(); private boolean emitOtelMetrics = false; private boolean exportOtelMetricsToEndpoint = false; + private int exportOtelMetricsIntervalInSeconds = 60; private Map otelCustomDimensionsMap = new HashMap<>(); private String otelExportProtocol = OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF; private String otelEndpoint = null; @@ -222,6 +231,11 @@ public Builder setExportOtelMetricsToEndpoint(boolean exportOtelMetricsToEndpoin return this; } + public Builder setExportOtelMetricsIntervalInSeconds(int exportOtelMetricsIntervalInSeconds) { + this.exportOtelMetricsIntervalInSeconds = exportOtelMetricsIntervalInSeconds; + return this; + } + public Builder setOtelExportProtocol(String otelExportProtocol) { this.otelExportProtocol = otelExportProtocol; return this; @@ -289,6 +303,10 @@ public Builder extractAndSetOtelConfigs(Map configs) { setExportOtelMetricsToEndpoint(Boolean.parseBoolean(configValue)); } + if ((configValue = configs.get(OTEL_VENICE_METRICS_EXPORT_INTERVAL_IN_SECONDS)) != null) { + setExportOtelMetricsIntervalInSeconds(Integer.parseInt(configValue)); + } + /** * custom dimensions are passed as key=value pairs separated by '='
* Multiple dimensions are separated by ',' @@ -426,6 +444,10 @@ public boolean exportOtelMetricsToEndpoint() { return exportOtelMetricsToEndpoint; } + public int getExportOtelMetricsIntervalInSeconds() { + return exportOtelMetricsIntervalInSeconds; + } + public Map getOtelCustomDimensionsMap() { return otelCustomDimensionsMap; } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java index 711d41cdb56..d329f4d56ed 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java @@ -19,7 +19,6 @@ import io.opentelemetry.sdk.common.CompletableResultCode; import io.opentelemetry.sdk.metrics.Aggregation; import io.opentelemetry.sdk.metrics.InstrumentSelector; -import io.opentelemetry.sdk.metrics.InstrumentSelectorBuilder; import io.opentelemetry.sdk.metrics.InstrumentType; import io.opentelemetry.sdk.metrics.SdkMeterProvider; import io.opentelemetry.sdk.metrics.SdkMeterProviderBuilder; @@ -34,6 +33,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -97,19 +97,20 @@ private void setExponentialHistogramAggregation(SdkMeterProviderBuilder builder, } } - // Build an InstrumentSelector with multiple setName calls for all Exponential Histogram metrics - InstrumentSelectorBuilder selectorBuilder = InstrumentSelector.builder().setType(InstrumentType.HISTOGRAM); - metricNames.forEach(selectorBuilder::setName); - - // Register a single view with all metric names included in the InstrumentSelector - builder.registerView( - selectorBuilder.build(), - View.builder() - .setAggregation( - Aggregation.base2ExponentialBucketHistogram( - metricsConfig.getOtelExponentialHistogramMaxBuckets(), - metricsConfig.getOtelExponentialHistogramMaxScale())) - .build()); + // Register views for all MetricType.HISTOGRAM metrics to be aggregated/exported as exponential histograms + for (String metricName: metricNames) { + InstrumentSelector selector = + InstrumentSelector.builder().setType(InstrumentType.HISTOGRAM).setName(metricName).build(); + + builder.registerView( + selector, + View.builder() + .setAggregation( + Aggregation.base2ExponentialBucketHistogram( + metricsConfig.getOtelExponentialHistogramMaxBuckets(), + metricsConfig.getOtelExponentialHistogramMaxScale())) + .build()); + } } public VeniceOpenTelemetryMetricsRepository(VeniceMetricsConfig metricsConfig) { @@ -129,11 +130,17 @@ public VeniceOpenTelemetryMetricsRepository(VeniceMetricsConfig metricsConfig) { SdkMeterProviderBuilder builder = SdkMeterProvider.builder(); if (metricsConfig.exportOtelMetricsToEndpoint()) { MetricExporter httpExporter = getOtlpHttpMetricExporter(metricsConfig); - builder.registerMetricReader(PeriodicMetricReader.builder(httpExporter).build()); + builder.registerMetricReader( + PeriodicMetricReader.builder(httpExporter) + .setInterval(metricsConfig.getExportOtelMetricsIntervalInSeconds(), TimeUnit.SECONDS) + .build()); } if (metricsConfig.exportOtelMetricsToLog()) { // internal to test: Disabled by default - builder.registerMetricReader(PeriodicMetricReader.builder(new LogBasedMetricExporter(metricsConfig)).build()); + builder.registerMetricReader( + PeriodicMetricReader.builder(new LogBasedMetricExporter(metricsConfig)) + .setInterval(metricsConfig.getExportOtelMetricsIntervalInSeconds(), TimeUnit.SECONDS) + .build()); } if (metricsConfig.useOtelExponentialHistogram()) { diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcome.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcome.java deleted file mode 100644 index 84b6ce30c5c..00000000000 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcome.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.linkedin.venice.stats.dimensions; - -public enum RequestValidationOutcome { - VALID, INVALID_KEY_COUNT_LIMIT_EXCEEDED; - - private final String outcome; - - RequestValidationOutcome() { - this.outcome = name().toLowerCase(); - } - - public String getOutcome() { - return this.outcome; - } -} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java index 54737cc5342..c9f361d8f71 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java @@ -21,9 +21,6 @@ public enum VeniceMetricsDimensions { /** {@link HttpResponseStatusCodeCategory} ie. 1xx, 2xx, etc */ HTTP_RESPONSE_STATUS_CODE_CATEGORY("http.response.status_code_category"), - /** {@link RequestValidationOutcome#outcome} */ - VENICE_REQUEST_VALIDATION_OUTCOME("venice.request.validation_outcome"), - /** {@link VeniceResponseStatusCategory} */ VENICE_RESPONSE_STATUS_CODE_CATEGORY("venice.response.status_code_category"), diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java index 761c30cfdfe..b493e41c905 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategory.java @@ -8,7 +8,7 @@ * to account for all healthy requests. This dimensions makes it easier to make Venice specific aggregations. */ public enum VeniceResponseStatusCategory { - HEALTHY, UNHEALTHY, TARDY, THROTTLED, BAD_REQUEST; + SUCCESS, FAIL; private final String category; diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java index 69aabb428e3..117430f30e7 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java @@ -6,57 +6,48 @@ import io.opentelemetry.api.metrics.LongCounter; import io.tehuti.metrics.MeasurableStat; import io.tehuti.metrics.Sensor; -import io.tehuti.utils.RedundantLogFilter; -import java.util.HashMap; +import java.util.Collections; import java.util.List; -import java.util.Map; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; /** - * Operational state of a metric. It holds
- * 1. {@link MetricEntity} - * 2. 1 Otel Instrument and - * 3. multiple tehuti Sensors for this Otel Metric + * Operational state of a metric. It holds:
+ * 1. A {@link MetricEntity}
+ * 2. One OpenTelemetry (Otel) Instrument
+ * 3. Zero or one (out of zero for new metrics or more for existing metrics) Tehuti sensors for this Otel Metric.
+ * + * One Otel instrument can cover multiple Tehuti sensors through the use of dimensions. Ideally, this class should represent a one-to-many + * mapping between an Otel instrument and Tehuti sensors. However, to simplify lookup during runtime, this class holds one Otel instrument + * and one Tehuti sensor. If an Otel instrument corresponds to multiple Tehuti sensors, there will be multiple {@link MetricEntityState} + * objects, each containing the same Otel instrument but different Tehuti sensors. */ public class MetricEntityState { - private static final Logger LOGGER = LogManager.getLogger(MetricEntityState.class); - private static final RedundantLogFilter REDUNDANT_LOG_FILTER = RedundantLogFilter.getRedundantLogFilter(); private final MetricEntity metricEntity; /** Otel metric */ private Object otelMetric = null; - /** Map of tehuti names and sensors: 1 Otel metric can cover multiple Tehuti sensors */ - private Map tehutiSensors = null; + /** Respective tehuti metric */ + private Sensor tehutiSensor = null; public MetricEntityState(MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository) { - this.metricEntity = metricEntity; - setOtelMetric(otelRepository.createInstrument(this.metricEntity)); + this(metricEntity, otelRepository, null, null, Collections.EMPTY_LIST); } public MetricEntityState( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, - TehutiSensorRegistrationFunction registerTehutiSensor, - Map> tehutiMetricInput) { + TehutiSensorRegistrationFunction registerTehutiSensorFn, + TehutiMetricNameEnum tehutiMetricNameEnum, + List tehutiMetricStats) { this.metricEntity = metricEntity; - createMetric(otelRepository, tehutiMetricInput, registerTehutiSensor); + createMetric(otelRepository, tehutiMetricNameEnum, tehutiMetricStats, registerTehutiSensorFn); } public void setOtelMetric(Object otelMetric) { this.otelMetric = otelMetric; } - /** - * Add Tehuti {@link Sensor} to tehutiSensors map and throw exception if sensor with same name already exists - */ - public void addTehutiSensors(TehutiMetricNameEnum name, Sensor tehutiSensor) { - if (tehutiSensors == null) { - tehutiSensors = new HashMap<>(); - } - if (tehutiSensors.put(name, tehutiSensor) != null) { - throw new IllegalArgumentException("Sensor with name '" + name + "' already exists."); - } + public void setTehutiSensor(Sensor tehutiSensor) { + this.tehutiSensor = tehutiSensor; } /** @@ -69,18 +60,18 @@ public interface TehutiSensorRegistrationFunction { public void createMetric( VeniceOpenTelemetryMetricsRepository otelRepository, - Map> tehutiMetricInput, - TehutiSensorRegistrationFunction registerTehutiSensor) { + TehutiMetricNameEnum tehutiMetricNameEnum, + List tehutiMetricStats, + TehutiSensorRegistrationFunction registerTehutiSensorFn) { // Otel metric: otelRepository will be null if otel is not enabled if (otelRepository != null) { setOtelMetric(otelRepository.createInstrument(this.metricEntity)); } // tehuti metric - for (Map.Entry> entry: tehutiMetricInput.entrySet()) { - addTehutiSensors( - entry.getKey(), - registerTehutiSensor - .register(entry.getKey().getMetricName(), entry.getValue().toArray(new MeasurableStat[0]))); + if (tehutiMetricStats != null && !tehutiMetricStats.isEmpty()) { + setTehutiSensor( + registerTehutiSensorFn + .register(tehutiMetricNameEnum.getMetricName(), tehutiMetricStats.toArray(new MeasurableStat[0]))); } } @@ -105,38 +96,29 @@ void recordOtelMetric(double value, Attributes otelDimensions) { } } - void recordTehutiMetric(TehutiMetricNameEnum tehutiMetricNameEnum, double value) { - if (tehutiSensors != null) { - Sensor sensor = tehutiSensors.get(tehutiMetricNameEnum); - if (sensor != null) { - sensor.record(value); - } else { - // Log using Redundant log filters to catch any bad tehutiMetricNameEnum is passed in - String errorLog = "Tehuti Sensor with name '" + tehutiMetricNameEnum + "' not found."; - if (!REDUNDANT_LOG_FILTER.isRedundantLog(errorLog)) { - LOGGER.error(errorLog); - } - } + void recordTehutiMetric(double value) { + if (tehutiSensor != null) { + tehutiSensor.record(value); } } - public void record(TehutiMetricNameEnum tehutiMetricNameEnum, long value, Attributes otelDimensions) { + public void record(long value, Attributes otelDimensions) { recordOtelMetric(value, otelDimensions); - recordTehutiMetric(tehutiMetricNameEnum, value); + recordTehutiMetric(value); } - public void record(TehutiMetricNameEnum tehutiMetricNameEnum, double value, Attributes otelDimensions) { + public void record(double value, Attributes otelDimensions) { recordOtelMetric(value, otelDimensions); - recordTehutiMetric(tehutiMetricNameEnum, value); + recordTehutiMetric(value); } /** used only for testing */ - Map getTehutiSensors() { - return tehutiSensors; + Sensor getTehutiSensor() { + return tehutiSensor; } /** used only for testing */ - static RedundantLogFilter getRedundantLogFilter() { - return REDUNDANT_LOG_FILTER; + Object getOtelMetric() { + return otelMetric; } } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java index 31906b070da..7f5199976a0 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsConfigTest.java @@ -39,6 +39,7 @@ public void testDefaultValuesWithBasicConfig() { assertEquals(config.getMetricPrefix(), "service"); assertFalse(config.emitOtelMetrics()); assertFalse(config.exportOtelMetricsToEndpoint()); + assertEquals(config.getExportOtelMetricsIntervalInSeconds(), 60); assertEquals(config.getOtelExportProtocol(), OtlpConfigUtil.PROTOCOL_HTTP_PROTOBUF); assertEquals(config.getOtelEndpoint(), null); assertTrue(config.getOtelHeaders().isEmpty()); @@ -63,6 +64,7 @@ public void testCustomValues() { .setMetricPrefix("TestPrefix") .setTehutiMetricConfig(metricConfig) .extractAndSetOtelConfigs(otelConfigs) + .setExportOtelMetricsIntervalInSeconds(10) .build(); assertEquals(config.getServiceName(), "TestService"); @@ -70,6 +72,7 @@ public void testCustomValues() { assertTrue(config.emitOtelMetrics()); assertTrue(config.exportOtelMetricsToLog()); assertEquals(config.getTehutiMetricConfig(), metricConfig); + assertEquals(config.getExportOtelMetricsIntervalInSeconds(), 10); } @Test(expectedExceptions = IllegalArgumentException.class) diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java index 1512a5db8e7..a4349f438a9 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java @@ -38,6 +38,7 @@ public void setUp() { when(mockMetricsConfig.getServiceName()).thenReturn("test_service"); when(mockMetricsConfig.exportOtelMetricsToEndpoint()).thenReturn(true); when(mockMetricsConfig.getOtelEndpoint()).thenReturn("http://localhost:4318"); + when(mockMetricsConfig.getExportOtelMetricsIntervalInSeconds()).thenReturn(60); metricsRepository = new VeniceOpenTelemetryMetricsRepository(mockMetricsConfig); } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcomeTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcomeTest.java deleted file mode 100644 index f144850be73..00000000000 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/RequestValidationOutcomeTest.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.linkedin.venice.stats.dimensions; - -import static org.testng.Assert.assertEquals; - -import org.testng.annotations.Test; - - -public class RequestValidationOutcomeTest { - @Test - public void testVeniceRequestValidationOutcome() { - for (RequestValidationOutcome outcome: RequestValidationOutcome.values()) { - switch (outcome) { - case VALID: - assertEquals(outcome.getOutcome(), "valid"); - break; - case INVALID_KEY_COUNT_LIMIT_EXCEEDED: - assertEquals(outcome.getOutcome(), "invalid_key_count_limit_exceeded"); - break; - default: - throw new IllegalArgumentException("Unknown outcome: " + outcome); - } - } - } -} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java index 13a9eb97aa3..56a5ab48f6c 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java @@ -28,9 +28,6 @@ public void testGetDimensionNameInSnakeCase() { case HTTP_RESPONSE_STATUS_CODE_CATEGORY: assertEquals(dimension.getDimensionName(format), "http.response.status_code_category"); break; - case VENICE_REQUEST_VALIDATION_OUTCOME: - assertEquals(dimension.getDimensionName(format), "venice.request.validation_outcome"); - break; case VENICE_RESPONSE_STATUS_CODE_CATEGORY: assertEquals(dimension.getDimensionName(format), "venice.response.status_code_category"); break; @@ -66,9 +63,6 @@ public void testGetDimensionNameInCamelCase() { case HTTP_RESPONSE_STATUS_CODE_CATEGORY: assertEquals(dimension.getDimensionName(format), "http.response.statusCodeCategory"); break; - case VENICE_REQUEST_VALIDATION_OUTCOME: - assertEquals(dimension.getDimensionName(format), "venice.request.validationOutcome"); - break; case VENICE_RESPONSE_STATUS_CODE_CATEGORY: assertEquals(dimension.getDimensionName(format), "venice.response.statusCodeCategory"); break; @@ -104,9 +98,6 @@ public void testGetDimensionNameInPascalCase() { case HTTP_RESPONSE_STATUS_CODE_CATEGORY: assertEquals(dimension.getDimensionName(format), "Http.Response.StatusCodeCategory"); break; - case VENICE_REQUEST_VALIDATION_OUTCOME: - assertEquals(dimension.getDimensionName(format), "Venice.Request.ValidationOutcome"); - break; case VENICE_RESPONSE_STATUS_CODE_CATEGORY: assertEquals(dimension.getDimensionName(format), "Venice.Response.StatusCodeCategory"); break; diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java index 22272d35763..940b8eaeced 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceResponseStatusCategoryTest.java @@ -10,20 +10,11 @@ public class VeniceResponseStatusCategoryTest { public void testVeniceResponseStatusCategory() { for (VeniceResponseStatusCategory responseStatusCategory: VeniceResponseStatusCategory.values()) { switch (responseStatusCategory) { - case HEALTHY: - assertEquals(responseStatusCategory.getCategory(), "healthy"); + case SUCCESS: + assertEquals(responseStatusCategory.getCategory(), "success"); break; - case UNHEALTHY: - assertEquals(responseStatusCategory.getCategory(), "unhealthy"); - break; - case TARDY: - assertEquals(responseStatusCategory.getCategory(), "tardy"); - break; - case THROTTLED: - assertEquals(responseStatusCategory.getCategory(), "throttled"); - break; - case BAD_REQUEST: - assertEquals(responseStatusCategory.getCategory(), "bad_request"); + case FAIL: + assertEquals(responseStatusCategory.getCategory(), "fail"); break; default: throw new IllegalArgumentException("Unknown response status category: " + responseStatusCategory); diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java index 93aeea3749e..955f64edffe 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java @@ -1,22 +1,19 @@ package com.linkedin.venice.stats.metrics; +import static com.linkedin.venice.stats.metrics.MetricType.HISTOGRAM; +import static java.util.Collections.singletonList; +import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.api.metrics.DoubleHistogram; import io.opentelemetry.api.metrics.LongCounter; -import io.tehuti.metrics.MeasurableStat; import io.tehuti.metrics.Sensor; -import io.tehuti.utils.RedundantLogFilter; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import io.tehuti.metrics.stats.Count; import org.testng.Assert; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -29,7 +26,7 @@ public class MetricEntityStateTest { private Sensor mockSensor; private enum TestTehutiMetricNameEnum implements TehutiMetricNameEnum { - TEST_METRIC_1, TEST_METRIC_2; + TEST_METRIC; private final String metricName; @@ -47,6 +44,7 @@ public String getMetricName() { public void setUp() { mockOtelRepository = mock(VeniceOpenTelemetryMetricsRepository.class); mockMetricEntity = mock(MetricEntity.class); + doReturn(HISTOGRAM).when(mockMetricEntity).getMetricType(); sensorRegistrationFunction = (name, stats) -> mock(Sensor.class); mockSensor = mock(Sensor.class); } @@ -57,36 +55,28 @@ public void testCreateMetricWithOtelEnabled() { LongCounter longCounter = mock(LongCounter.class); when(mockOtelRepository.createInstrument(mockMetricEntity)).thenReturn(longCounter); - Map> tehutiMetricInput = new HashMap<>(); - MetricEntityState metricEntityState = - new MetricEntityState(mockMetricEntity, mockOtelRepository, sensorRegistrationFunction, tehutiMetricInput); - - Assert.assertNotNull(metricEntityState); - Assert.assertNull(metricEntityState.getTehutiSensors()); // No Tehuti sensors added - } - - @Test - public void testAddTehutiSensorsSuccessfully() { - MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); - metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC_1, mockSensor); - - Assert.assertNotNull(metricEntityState.getTehutiSensors()); - assertTrue(metricEntityState.getTehutiSensors().containsKey(TestTehutiMetricNameEnum.TEST_METRIC_1)); - } - - @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*Sensor with name 'TEST_METRIC_1' already exists.*") - public void testAddTehutiSensorThrowsExceptionOnDuplicate() { + // without tehuti sensor MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); - metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC_1, mockSensor); - - // Adding the same sensor name again should throw an exception - metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC_1, mockSensor); + Assert.assertNotNull(metricEntityState); + Assert.assertNotNull(metricEntityState.getOtelMetric()); + Assert.assertNull(metricEntityState.getTehutiSensor()); // No Tehuti sensors added + + // with tehuti sensor + metricEntityState = new MetricEntityState( + mockMetricEntity, + mockOtelRepository, + sensorRegistrationFunction, + TestTehutiMetricNameEnum.TEST_METRIC, + singletonList(new Count())); + Assert.assertNotNull(metricEntityState); + Assert.assertNotNull(metricEntityState.getOtelMetric()); + Assert.assertNotNull(metricEntityState.getTehutiSensor()); } @Test public void testRecordOtelMetricHistogram() { DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); - when(mockMetricEntity.getMetricType()).thenReturn(MetricType.HISTOGRAM); + when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); metricEntityState.setOtelMetric(doubleHistogram); @@ -114,40 +104,34 @@ public void testRecordOtelMetricCounter() { @Test public void testRecordTehutiMetric() { MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); - metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC_1, mockSensor); - - metricEntityState.recordTehutiMetric(TestTehutiMetricNameEnum.TEST_METRIC_1, 15.0); - + metricEntityState.setTehutiSensor(mockSensor); + metricEntityState.recordTehutiMetric(15.0); verify(mockSensor, times(1)).record(15.0); } @Test public void testRecordMetricsWithBothOtelAndTehuti() { DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); - when(mockMetricEntity.getMetricType()).thenReturn(MetricType.HISTOGRAM); + when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); MetricEntityState metricEntityState = new MetricEntityState(mockMetricEntity, mockOtelRepository); - RedundantLogFilter logFilter = metricEntityState.getRedundantLogFilter(); metricEntityState.setOtelMetric(doubleHistogram); - metricEntityState.addTehutiSensors(TestTehutiMetricNameEnum.TEST_METRIC_1, mockSensor); + metricEntityState.setTehutiSensor(mockSensor); Attributes attributes = Attributes.builder().put("key", "value").build(); - // case 1: check using valid Tehuti metric that was added to metricEntityState - metricEntityState.record(TestTehutiMetricNameEnum.TEST_METRIC_1, 20.0, attributes); + // called 0 times + verify(doubleHistogram, times(0)).record(20.0, attributes); + verify(mockSensor, times(0)).record(20.0); + + // called 1 time + metricEntityState.record(20.0, attributes); verify(doubleHistogram, times(1)).record(20.0, attributes); verify(mockSensor, times(1)).record(20.0); - assertFalse(logFilter.isRedundantLog("Tehuti Sensor with name 'TEST_METRIC_1' not found.", false)); - // case 2: check using a Tehuti metric that was not added to metricEntityState and verify it called - // REDUNDANT_LOG_FILTER - metricEntityState.record(TestTehutiMetricNameEnum.TEST_METRIC_2, 20.0, attributes); - // otel metric should be called for the second time + // called 2 times + metricEntityState.record(20.0, attributes); verify(doubleHistogram, times(2)).record(20.0, attributes); - // Tehuti metric should be not called for the second time as we passed in an invalid metric name - verify(mockSensor, times(1)).record(20.0); - // This should have invoked the log filter for TEST_METRIC_2 - assertFalse(logFilter.isRedundantLog("Tehuti Sensor with name 'TEST_METRIC_1' not found.", false)); - assertTrue(logFilter.isRedundantLog("Tehuti Sensor with name 'TEST_METRIC_2' not found.", false)); + verify(mockSensor, times(2)).record(20.0); } } diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java index 7fc3deddf06..b5665a041bf 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java @@ -320,6 +320,7 @@ public VenicePath parseResourceUri(String uri, BasicFullHttpRequest fullHttpRequ } AggRouterHttpRequestStats aggRouterHttpRequestStats = routerStats.getStatsByType(requestType); + if (!requestType.equals(SINGLE_GET)) { /** * Here we only track key num for non single-get request, since single-get request will be always 1. @@ -344,6 +345,7 @@ public VenicePath parseResourceUri(String uri, BasicFullHttpRequest fullHttpRequ routerStats.getStatsByType(keyCountLimitException.getRequestType()) .recordBadRequestKeyCount( keyCountLimitException.getStoreName(), + responseStatus, keyCountLimitException.getRequestKeyCount()); } /** diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java index 40dce6958c1..c3369fcfeb3 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceResponseAggregator.java @@ -250,20 +250,20 @@ public FullHttpResponse buildResponse( // here... double latency = LatencyUtils.convertNSToMS(timeValue.getRawValue(TimeUnit.NANOSECONDS)); stats.recordLatency(storeName, latency); + int keyNum = venicePath.getPartitionKeys().size(); if (HEALTHY_STATUSES.contains(httpResponseStatus)) { - routerStats.getStatsByType(RequestType.SINGLE_GET) - .recordReadQuotaUsage(storeName, venicePath.getPartitionKeys().size()); + routerStats.getStatsByType(RequestType.SINGLE_GET).recordReadQuotaUsage(storeName, keyNum); if (isFastRequest(latency, requestType)) { - stats.recordHealthyRequest(storeName, latency, httpResponseStatus); + stats.recordHealthyRequest(storeName, latency, httpResponseStatus, keyNum); } else { - stats.recordTardyRequest(storeName, latency, httpResponseStatus); + stats.recordTardyRequest(storeName, latency, httpResponseStatus, keyNum); } } else if (httpResponseStatus.equals(TOO_MANY_REQUESTS)) { LOGGER.debug("request is rejected by storage node because quota is exceeded"); - stats.recordThrottledRequest(storeName, latency, httpResponseStatus); + stats.recordThrottledRequest(storeName, latency, httpResponseStatus, keyNum); } else { LOGGER.debug("Unhealthy request detected, latency: {}ms, response status: {}", latency, httpResponseStatus); - stats.recordUnhealthyRequest(storeName, latency, httpResponseStatus); + stats.recordUnhealthyRequest(storeName, latency, httpResponseStatus, keyNum); } } timeValue = allMetrics.get(ROUTER_RESPONSE_WAIT_TIME); diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java index 3f9dc822871..81c4d3ad0e0 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStats.java @@ -80,9 +80,9 @@ public void recordRequest(String storeName) { recordStoreStats(storeName, RouterHttpRequestStats::recordIncomingRequest); } - public void recordHealthyRequest(String storeName, double latency, HttpResponseStatus responseStatus) { - totalStats.recordHealthyRequest(latency, responseStatus); - recordStoreStats(storeName, stats -> stats.recordHealthyRequest(latency, responseStatus)); + public void recordHealthyRequest(String storeName, double latency, HttpResponseStatus responseStatus, int keyNum) { + totalStats.recordHealthyRequest(latency, responseStatus, keyNum); + recordStoreStats(storeName, stats -> stats.recordHealthyRequest(latency, responseStatus, keyNum)); } public void recordUnhealthyRequest(String storeName, HttpResponseStatus responseStatus) { @@ -97,10 +97,10 @@ public void recordUnavailableReplicaStreamingRequest(String storeName) { recordStoreStats(storeName, RouterHttpRequestStats::recordUnavailableReplicaStreamingRequest); } - public void recordUnhealthyRequest(String storeName, double latency, HttpResponseStatus responseStatus) { - totalStats.recordUnhealthyRequest(latency, responseStatus); + public void recordUnhealthyRequest(String storeName, double latency, HttpResponseStatus responseStatus, int keyNum) { + totalStats.recordUnhealthyRequest(latency, responseStatus, keyNum); if (storeName != null) { - recordStoreStats(storeName, stats -> stats.recordUnhealthyRequest(latency, responseStatus)); + recordStoreStats(storeName, stats -> stats.recordUnhealthyRequest(latency, responseStatus, keyNum)); } } @@ -115,9 +115,9 @@ public void recordReadQuotaUsage(String storeName, int quotaUsage) { recordStoreStats(storeName, stats -> stats.recordReadQuotaUsage(quotaUsage)); } - public void recordTardyRequest(String storeName, double latency, HttpResponseStatus responseStatus) { - totalStats.recordTardyRequest(latency, responseStatus); - recordStoreStats(storeName, stats -> stats.recordTardyRequest(latency, responseStatus)); + public void recordTardyRequest(String storeName, double latency, HttpResponseStatus responseStatus, int keyNum) { + totalStats.recordTardyRequest(latency, responseStatus, keyNum); + recordStoreStats(storeName, stats -> stats.recordTardyRequest(latency, responseStatus, keyNum)); } /** @@ -132,9 +132,13 @@ public void recordThrottledRequest(String storeName, HttpResponseStatus httpResp recordStoreStats(storeName, stats -> stats.recordThrottledRequest(httpResponseStatus)); } - public void recordThrottledRequest(String storeName, double latency, HttpResponseStatus httpResponseStatus) { - totalStats.recordThrottledRequest(latency, httpResponseStatus); - recordStoreStats(storeName, stats -> stats.recordThrottledRequest(latency, httpResponseStatus)); + public void recordThrottledRequest( + String storeName, + double latency, + HttpResponseStatus httpResponseStatus, + int keyNum) { + totalStats.recordThrottledRequest(latency, httpResponseStatus, keyNum); + recordStoreStats(storeName, stats -> stats.recordThrottledRequest(latency, httpResponseStatus, keyNum)); } public void recordBadRequest(String storeName, HttpResponseStatus responseStatus) { @@ -144,10 +148,10 @@ public void recordBadRequest(String storeName, HttpResponseStatus responseStatus } } - public void recordBadRequestKeyCount(String storeName, int keyCount) { - totalStats.recordBadRequestKeyCount(keyCount); + public void recordBadRequestKeyCount(String storeName, HttpResponseStatus responseStatus, int keyNum) { + totalStats.recordIncomingBadRequestKeyCountMetric(responseStatus, keyNum); if (storeName != null) { - recordStoreStats(storeName, stats -> stats.recordBadRequestKeyCount(keyCount)); + recordStoreStats(storeName, stats -> stats.recordIncomingBadRequestKeyCountMetric(responseStatus, keyNum)); } } @@ -261,8 +265,8 @@ public long getTotalRetriesError() { } public void recordKeyNum(String storeName, int keyNum) { - totalStats.recordKeyNum(keyNum); - recordStoreStats(storeName, stats -> stats.recordKeyNum(keyNum)); + totalStats.recordIncomingKeyCountMetric(keyNum); + recordStoreStats(storeName, stats -> stats.recordIncomingKeyCountMetric(keyNum)); } public void recordRequestUsage(String storeName, int usage) { @@ -291,18 +295,18 @@ public void recordUnavailableRequest(String storeName) { } public void recordDelayConstraintAbortedRetryRequest(String storeName) { - totalStats.recordDelayConstraintAbortedRetryRequest(); - recordStoreStats(storeName, RouterHttpRequestStats::recordDelayConstraintAbortedRetryRequest); + totalStats.recordDelayConstraintAbortedRetryCountMetric(); + recordStoreStats(storeName, RouterHttpRequestStats::recordDelayConstraintAbortedRetryCountMetric); } public void recordSlowRouteAbortedRetryRequest(String storeName) { - totalStats.recordSlowRouteAbortedRetryRequest(); - recordStoreStats(storeName, RouterHttpRequestStats::recordSlowRouteAbortedRetryRequest); + totalStats.recordSlowRouteAbortedRetryCountMetric(); + recordStoreStats(storeName, RouterHttpRequestStats::recordSlowRouteAbortedRetryCountMetric); } public void recordRetryRouteLimitAbortedRetryRequest(String storeName) { - totalStats.recordRetryRouteLimitAbortedRetryRequest(); - recordStoreStats(storeName, RouterHttpRequestStats::recordRetryRouteLimitAbortedRetryRequest); + totalStats.recordRetryRouteLimitAbortedRetryCountMetric(); + recordStoreStats(storeName, RouterHttpRequestStats::recordRetryRouteLimitAbortedRetryCountMetric); } public void recordKeySize(long keySize) { @@ -320,8 +324,8 @@ public void recordDisallowedRetryRequest(String storeName) { } public void recordNoAvailableReplicaAbortedRetryRequest(String storeName) { - totalStats.recordNoAvailableReplicaAbortedRetryRequest(); - recordStoreStats(storeName, RouterHttpRequestStats::recordNoAvailableReplicaAbortedRetryRequest); + totalStats.recordNoAvailableReplicaAbortedRetryCountMetric(); + recordStoreStats(storeName, RouterHttpRequestStats::recordNoAvailableReplicaAbortedRetryCountMetric); } public void recordErrorRetryAttemptTriggeredByPendingRequestCheck(String storeName) { diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java index ed9debd11ce..cdade9e5e6c 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java @@ -5,21 +5,23 @@ import static com.linkedin.venice.router.stats.RouterMetricEntity.ABORTED_RETRY_COUNT; import static com.linkedin.venice.router.stats.RouterMetricEntity.ALLOWED_RETRY_COUNT; import static com.linkedin.venice.router.stats.RouterMetricEntity.CALL_COUNT; -import static com.linkedin.venice.router.stats.RouterMetricEntity.CALL_KEY_COUNT; import static com.linkedin.venice.router.stats.RouterMetricEntity.CALL_TIME; import static com.linkedin.venice.router.stats.RouterMetricEntity.DISALLOWED_RETRY_COUNT; -import static com.linkedin.venice.router.stats.RouterMetricEntity.INCOMING_CALL_COUNT; +import static com.linkedin.venice.router.stats.RouterMetricEntity.KEY_COUNT; import static com.linkedin.venice.router.stats.RouterMetricEntity.RETRY_COUNT; import static com.linkedin.venice.router.stats.RouterMetricEntity.RETRY_DELAY; import static com.linkedin.venice.stats.AbstractVeniceAggStats.STORE_NAME_FOR_TOTAL_STAT; import static com.linkedin.venice.stats.dimensions.HttpResponseStatusCodeCategory.getVeniceHttpResponseStatusCodeCategory; +import static com.linkedin.venice.stats.dimensions.RequestRetryAbortReason.DELAY_CONSTRAINT; +import static com.linkedin.venice.stats.dimensions.RequestRetryAbortReason.MAX_RETRY_ROUTE_LIMIT; +import static com.linkedin.venice.stats.dimensions.RequestRetryAbortReason.NO_AVAILABLE_REPLICA; +import static com.linkedin.venice.stats.dimensions.RequestRetryAbortReason.SLOW_ROUTE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_METHOD; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_ABORT_REASON; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_TYPE; -import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_VALIDATION_OUTCOME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; import static java.util.Collections.singletonList; @@ -36,12 +38,10 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.RequestRetryAbortReason; import com.linkedin.venice.stats.dimensions.RequestRetryType; -import com.linkedin.venice.stats.dimensions.RequestValidationOutcome; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory; import com.linkedin.venice.stats.metrics.MetricEntityState; import com.linkedin.venice.stats.metrics.TehutiMetricNameEnum; -import com.linkedin.venice.utils.CollectionUtils; import io.netty.handler.codec.http.HttpResponseStatus; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.api.common.AttributesBuilder; @@ -59,7 +59,6 @@ import io.tehuti.metrics.stats.Rate; import io.tehuti.metrics.stats.Total; import java.util.Arrays; -import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -79,16 +78,24 @@ public class RouterHttpRequestStats extends AbstractVeniceHttpStats { } /** metrics to track incoming requests */ - private final MetricEntityState incomingRequestMetric; + private final Sensor requestSensor; /** metrics to track response handling */ - private final MetricEntityState requestMetric; + private final MetricEntityState healthyRequestMetric; + private final MetricEntityState unhealthyRequestMetric; + private final MetricEntityState tardyRequestMetric; + private final MetricEntityState throttledRequestMetric; + private final MetricEntityState badRequestMetric; + private final Sensor healthyRequestRateSensor; private final Sensor tardyRequestRatioSensor; /** latency metrics */ private final Sensor latencyTehutiSensor; // This can be removed while removing tehuti - private final MetricEntityState latencyMetric; + private final MetricEntityState healthyLatencyMetric; + private final MetricEntityState unhealthyLatencyMetric; + private final MetricEntityState tardyLatencyMetric; + private final MetricEntityState throttledLatencyMetric; /** retry metrics */ private final MetricEntityState retryCountMetric; @@ -97,10 +104,15 @@ public class RouterHttpRequestStats extends AbstractVeniceHttpStats { private final MetricEntityState retryDelayMetric; /** retry aborted metrics */ - private final MetricEntityState abortedRetryCountMetric; + private final MetricEntityState delayConstraintAbortedRetryCountMetric; + private final MetricEntityState slowRouteAbortedRetryCountMetric; + private final MetricEntityState retryRouteLimitAbortedRetryCountMetric; + private final MetricEntityState noAvailableReplicaAbortedRetryCountMetric; /** key count metrics */ private final MetricEntityState keyCountMetric; + private final Sensor keyNumSensor; + private final Sensor badRequestKeyCountSensor; /** OTel metrics yet to be added */ private final Sensor requestSizeSensor; @@ -146,7 +158,8 @@ public RouterHttpRequestStats( if (metricsRepository instanceof VeniceMetricsRepository) { VeniceMetricsRepository veniceMetricsRepository = (VeniceMetricsRepository) metricsRepository; VeniceMetricsConfig veniceMetricsConfig = veniceMetricsRepository.getVeniceMetricsConfig(); - emitOpenTelemetryMetrics = veniceMetricsConfig.emitOtelMetrics(); + // total stats won't be emitted by OTel + emitOpenTelemetryMetrics = veniceMetricsConfig.emitOtelMetrics() && !isTotalStats(); openTelemetryMetricFormat = veniceMetricsConfig.getMetricNamingFormat(); if (emitOpenTelemetryMetrics) { otelRepository = veniceMetricsRepository.getOpenTelemetryMetricsRepository(); @@ -172,104 +185,141 @@ public RouterHttpRequestStats( Rate requestRate = new OccurrenceRate(); Rate healthyRequestRate = new OccurrenceRate(); Rate tardyRequestRate = new OccurrenceRate(); - - incomingRequestMetric = new MetricEntityState( - INCOMING_CALL_COUNT.getMetricEntity(), - otelRepository, - this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.REQUEST, Arrays.asList(new Count(), requestRate)) - .build()); + requestSensor = registerSensor("request", new Count(), requestRate); healthyRequestRateSensor = registerSensor(new TehutiUtils.SimpleRatioStat(healthyRequestRate, requestRate, "healthy_request_ratio")); tardyRequestRatioSensor = registerSensor(new TehutiUtils.SimpleRatioStat(tardyRequestRate, requestRate, "tardy_request_ratio")); + keyNumSensor = registerSensor("key_num", new Avg(), new Max(0)); + badRequestKeyCountSensor = registerSensor("bad_request_key_count", new OccurrenceRate(), new Avg(), new Max()); + + healthyRequestMetric = new MetricEntityState( + CALL_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.HEALTHY_REQUEST, + Arrays.asList(new Count(), healthyRequestRate)); - requestMetric = new MetricEntityState( + unhealthyRequestMetric = new MetricEntityState( CALL_COUNT.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.HEALTHY_REQUEST, Arrays.asList(new Count(), healthyRequestRate)) - .put(RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST, singletonList(new Count())) - .put(RouterTehutiMetricNameEnum.TARDY_REQUEST, Arrays.asList(new Count(), tardyRequestRate)) - .put(RouterTehutiMetricNameEnum.THROTTLED_REQUEST, singletonList(new Count())) - .put(RouterTehutiMetricNameEnum.BAD_REQUEST, singletonList(new Count())) - .build()); + RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST, + singletonList(new Count())); + + tardyRequestMetric = new MetricEntityState( + CALL_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.TARDY_REQUEST, + Arrays.asList(new Count(), tardyRequestRate)); + + throttledRequestMetric = new MetricEntityState( + CALL_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.THROTTLED_REQUEST, + singletonList(new Count())); + + badRequestMetric = new MetricEntityState( + CALL_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.BAD_REQUEST, + singletonList(new Count())); latencyTehutiSensor = registerSensorWithDetailedPercentiles("latency", new Avg(), new Max(0)); - latencyMetric = new MetricEntityState( + healthyLatencyMetric = new MetricEntityState( + CALL_TIME.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY, + Arrays.asList( + new Avg(), + new Max(0), + TehutiUtils.getPercentileStatForNetworkLatency( + getName(), + getFullMetricName(RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY.getMetricName())))); + + unhealthyLatencyMetric = new MetricEntityState( CALL_TIME.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put( - RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY, - Arrays.asList( - new Avg(), - new Max(0), - TehutiUtils.getPercentileStatForNetworkLatency( - getName(), - getFullMetricName(RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY.getMetricName())))) - .put(RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max(0))) - .put(RouterTehutiMetricNameEnum.TARDY_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max(0))) - .put(RouterTehutiMetricNameEnum.THROTTLED_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max(0))) - .build()); + RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST_LATENCY, + Arrays.asList(new Avg(), new Max(0))); + + tardyLatencyMetric = new MetricEntityState( + CALL_TIME.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.TARDY_REQUEST_LATENCY, + Arrays.asList(new Avg(), new Max(0))); + + throttledLatencyMetric = new MetricEntityState( + CALL_TIME.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.THROTTLED_REQUEST_LATENCY, + Arrays.asList(new Avg(), new Max(0))); retryCountMetric = new MetricEntityState( RETRY_COUNT.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.ERROR_RETRY, singletonList(new Count())) - .build()); + RouterTehutiMetricNameEnum.ERROR_RETRY, + singletonList(new Count())); allowedRetryCountMetric = new MetricEntityState( ALLOWED_RETRY_COUNT.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.ALLOWED_RETRY_REQUEST_COUNT, singletonList(new OccurrenceRate())) - .build()); + RouterTehutiMetricNameEnum.ALLOWED_RETRY_REQUEST_COUNT, + singletonList(new OccurrenceRate())); disallowedRetryCountMetric = new MetricEntityState( DISALLOWED_RETRY_COUNT.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.DISALLOWED_RETRY_REQUEST_COUNT, singletonList(new OccurrenceRate())) - .build()); + RouterTehutiMetricNameEnum.DISALLOWED_RETRY_REQUEST_COUNT, + singletonList(new OccurrenceRate())); retryDelayMetric = new MetricEntityState( RETRY_DELAY.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.RETRY_DELAY, Arrays.asList(new Avg(), new Max())) - .build()); + RouterTehutiMetricNameEnum.RETRY_DELAY, + Arrays.asList(new Avg(), new Max())); + + delayConstraintAbortedRetryCountMetric = new MetricEntityState( + ABORTED_RETRY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, + singletonList(new Count())); - abortedRetryCountMetric = new MetricEntityState( + slowRouteAbortedRetryCountMetric = new MetricEntityState( ABORTED_RETRY_COUNT.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, singletonList(new Count())) - .put(RouterTehutiMetricNameEnum.SLOW_ROUTE_ABORTED_RETRY_REQUEST, singletonList(new Count())) - .put(RouterTehutiMetricNameEnum.RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, singletonList(new Count())) - .put(RouterTehutiMetricNameEnum.NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, singletonList(new Count())) - .build()); - - keyCountMetric = new MetricEntityState( - CALL_KEY_COUNT.getMetricEntity(), + RouterTehutiMetricNameEnum.SLOW_ROUTE_ABORTED_RETRY_REQUEST, + singletonList(new Count())); + + retryRouteLimitAbortedRetryCountMetric = new MetricEntityState( + ABORTED_RETRY_COUNT.getMetricEntity(), otelRepository, this::registerSensorFinal, - CollectionUtils.>mapBuilder() - .put(RouterTehutiMetricNameEnum.KEY_NUM, Arrays.asList(new OccurrenceRate(), new Avg(), new Max(0))) - .put( - RouterTehutiMetricNameEnum.BAD_REQUEST_KEY_COUNT, - Arrays.asList(new OccurrenceRate(), new Avg(), new Max(0))) - .build()); + RouterTehutiMetricNameEnum.RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, + singletonList(new Count())); + + noAvailableReplicaAbortedRetryCountMetric = new MetricEntityState( + ABORTED_RETRY_COUNT.getMetricEntity(), + otelRepository, + this::registerSensorFinal, + RouterTehutiMetricNameEnum.NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, + singletonList(new Count())); + + keyCountMetric = new MetricEntityState(KEY_COUNT.getMetricEntity(), otelRepository); errorRetryAttemptTriggeredByPendingRequestCheckSensor = registerSensor("error_retry_attempt_triggered_by_pending_request_check", new OccurrenceRate()); @@ -360,37 +410,43 @@ private String getDimensionName(VeniceMetricsDimensions dimension) { * types of requests also have their latencies logged at the same time. */ public void recordIncomingRequest() { - incomingRequestMetric.record(RouterTehutiMetricNameEnum.REQUEST, 1, commonMetricDimensions); + requestSensor.record(1); inFlightRequestSensor.record(currentInFlightRequest.incrementAndGet()); totalInflightRequestSensor.record(); } - public void recordHealthyRequest(Double latency, HttpResponseStatus responseStatus) { - VeniceResponseStatusCategory veniceResponseStatusCategory = VeniceResponseStatusCategory.HEALTHY; - recordRequestMetric(RouterTehutiMetricNameEnum.HEALTHY_REQUEST, responseStatus, veniceResponseStatusCategory); + Attributes getRequestMetricDimensions( + HttpResponseStatus responseStatus, + VeniceResponseStatusCategory veniceResponseStatusCategory) { + return Attributes.builder() + .putAll(commonMetricDimensions) + .put(getDimensionName(HTTP_RESPONSE_STATUS_CODE), responseStatus.codeAsText().toString()) + .put( + getDimensionName(HTTP_RESPONSE_STATUS_CODE_CATEGORY), + getVeniceHttpResponseStatusCodeCategory(responseStatus)) + .put(getDimensionName(VENICE_RESPONSE_STATUS_CODE_CATEGORY), veniceResponseStatusCategory.getCategory()) + .build(); + } + + public void recordHealthyRequest(Double latency, HttpResponseStatus responseStatus, int keyNum) { + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.SUCCESS); + healthyRequestMetric.record(1, dimensions); + keyCountMetric.record(keyNum, dimensions); if (latency != null) { - recordLatencyMetric( - RouterTehutiMetricNameEnum.HEALTHY_REQUEST_LATENCY, - latency, - responseStatus, - veniceResponseStatusCategory); + healthyLatencyMetric.record(latency, dimensions); } } public void recordUnhealthyRequest(HttpResponseStatus responseStatus) { - recordRequestMetric( - RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST, - responseStatus, - VeniceResponseStatusCategory.UNHEALTHY); + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.FAIL); + unhealthyRequestMetric.record(1, dimensions); } - public void recordUnhealthyRequest(double latency, HttpResponseStatus responseStatus) { - recordUnhealthyRequest(responseStatus); - recordLatencyMetric( - RouterTehutiMetricNameEnum.UNHEALTHY_REQUEST_LATENCY, - latency, - responseStatus, - VeniceResponseStatusCategory.UNHEALTHY); + public void recordUnhealthyRequest(double latency, HttpResponseStatus responseStatus, int keyNum) { + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.FAIL); + unhealthyRequestMetric.record(1, dimensions); + keyCountMetric.record(keyNum, dimensions); + unhealthyLatencyMetric.record(latency, dimensions); } public void recordUnavailableReplicaStreamingRequest() { @@ -405,23 +461,18 @@ public void recordReadQuotaUsage(int quotaUsage) { readQuotaUsageSensor.record(quotaUsage); } - public void recordTardyRequest(double latency, HttpResponseStatus responseStatus) { - VeniceResponseStatusCategory veniceResponseStatusCategory = VeniceResponseStatusCategory.TARDY; - recordRequestMetric(RouterTehutiMetricNameEnum.TARDY_REQUEST, responseStatus, veniceResponseStatusCategory); - recordLatencyMetric( - RouterTehutiMetricNameEnum.TARDY_REQUEST_LATENCY, - latency, - responseStatus, - veniceResponseStatusCategory); + public void recordTardyRequest(double latency, HttpResponseStatus responseStatus, int keyNum) { + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.SUCCESS); + tardyRequestMetric.record(1, dimensions); + keyCountMetric.record(keyNum, dimensions); + tardyLatencyMetric.record(latency, dimensions); } - public void recordThrottledRequest(double latency, HttpResponseStatus responseStatus) { - recordThrottledRequest(responseStatus); - recordLatencyMetric( - RouterTehutiMetricNameEnum.THROTTLED_REQUEST_LATENCY, - latency, - responseStatus, - VeniceResponseStatusCategory.THROTTLED); + public void recordThrottledRequest(double latency, HttpResponseStatus responseStatus, int keyNum) { + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.FAIL); + throttledRequestMetric.record(1, dimensions); + keyCountMetric.record(keyNum, dimensions); + throttledLatencyMetric.record(latency, dimensions); } /** @@ -432,10 +483,8 @@ public void recordThrottledRequest(double latency, HttpResponseStatus responseSt * TODO: Remove this overload after fixing the above. */ public void recordThrottledRequest(HttpResponseStatus responseStatus) { - recordRequestMetric( - RouterTehutiMetricNameEnum.THROTTLED_REQUEST, - responseStatus, - VeniceResponseStatusCategory.THROTTLED); + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.FAIL); + throttledRequestMetric.record(1, dimensions); } public void recordErrorRetryCount() { @@ -450,12 +499,10 @@ public void recordRetryTriggeredSensorOtel(RequestRetryType retryType) { .put(getDimensionName(VENICE_REQUEST_RETRY_TYPE), retryType.getRetryType()) .build(); } - retryCountMetric.record(RouterTehutiMetricNameEnum.ERROR_RETRY, 1, dimensions); + retryCountMetric.record(1, dimensions); } - public void recordAbortedRetrySensorOtel( - TehutiMetricNameEnum tehutiMetricNameEnum, - RequestRetryAbortReason abortReason) { + private Attributes getRetryRequestAbortDimensions(RequestRetryAbortReason abortReason) { Attributes dimensions = null; if (emitOpenTelemetryMetrics) { dimensions = Attributes.builder() @@ -463,21 +510,32 @@ public void recordAbortedRetrySensorOtel( .put(getDimensionName(VENICE_REQUEST_RETRY_ABORT_REASON), abortReason.getAbortReason()) .build(); } - abortedRetryCountMetric.record(tehutiMetricNameEnum, 1, dimensions); + return dimensions; } - public void recordBadRequest(HttpResponseStatus responseStatus) { - recordRequestMetric( - RouterTehutiMetricNameEnum.BAD_REQUEST, - responseStatus, - VeniceResponseStatusCategory.BAD_REQUEST); + public void recordDelayConstraintAbortedRetryCountMetric() { + Attributes dimensions = getRetryRequestAbortDimensions(DELAY_CONSTRAINT); + delayConstraintAbortedRetryCountMetric.record(1, dimensions); + } + + public void recordSlowRouteAbortedRetryCountMetric() { + Attributes dimensions = getRetryRequestAbortDimensions(SLOW_ROUTE); + slowRouteAbortedRetryCountMetric.record(1, dimensions); } - public void recordBadRequestKeyCount(int keyCount) { - recordKeyCountMetric( - RouterTehutiMetricNameEnum.BAD_REQUEST_KEY_COUNT, - keyCount, - RequestValidationOutcome.INVALID_KEY_COUNT_LIMIT_EXCEEDED); + public void recordRetryRouteLimitAbortedRetryCountMetric() { + Attributes dimensions = getRetryRequestAbortDimensions(MAX_RETRY_ROUTE_LIMIT); + retryRouteLimitAbortedRetryCountMetric.record(1, dimensions); + } + + public void recordNoAvailableReplicaAbortedRetryCountMetric() { + Attributes dimensions = getRetryRequestAbortDimensions(NO_AVAILABLE_REPLICA); + noAvailableReplicaAbortedRetryCountMetric.record(1, dimensions); + } + + public void recordBadRequest(HttpResponseStatus responseStatus) { + Attributes dimensions = getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.FAIL); + badRequestMetric.record(1, dimensions); } public void recordRequestThrottledByRouterCapacity() { @@ -494,43 +552,6 @@ public void recordLatency(double latency) { latencyTehutiSensor.record(latency); } - public void recordLatencyMetric( - TehutiMetricNameEnum tehutiMetricNameEnum, - double latency, - HttpResponseStatus responseStatus, - VeniceResponseStatusCategory veniceResponseStatusCategory) { - Attributes dimensions = null; - if (emitOpenTelemetryMetrics) { - dimensions = Attributes.builder() - .putAll(commonMetricDimensions) - // Don't add HTTP_RESPONSE_STATUS_CODE to reduce the cardinality for histogram - .put( - getDimensionName(HTTP_RESPONSE_STATUS_CODE_CATEGORY), - getVeniceHttpResponseStatusCodeCategory(responseStatus)) - .put(getDimensionName(VENICE_RESPONSE_STATUS_CODE_CATEGORY), veniceResponseStatusCategory.getCategory()) - .build(); - } - latencyMetric.record(tehutiMetricNameEnum, latency, dimensions); - } - - public void recordRequestMetric( - TehutiMetricNameEnum tehutiMetricNameEnum, - HttpResponseStatus responseStatus, - VeniceResponseStatusCategory veniceResponseStatusCategory) { - Attributes dimensions = null; - if (emitOpenTelemetryMetrics) { - dimensions = Attributes.builder() - .putAll(commonMetricDimensions) - .put( - getDimensionName(HTTP_RESPONSE_STATUS_CODE_CATEGORY), - getVeniceHttpResponseStatusCodeCategory(responseStatus)) - .put(getDimensionName(VENICE_RESPONSE_STATUS_CODE_CATEGORY), veniceResponseStatusCategory.getCategory()) - .put(getDimensionName(HTTP_RESPONSE_STATUS_CODE), responseStatus.codeAsText().toString()) - .build(); - } - requestMetric.record(tehutiMetricNameEnum, 1, dimensions); - } - public void recordResponseWaitingTime(double waitingTime) { routerResponseWaitingTimeSensor.record(waitingTime); } @@ -559,22 +580,13 @@ public void recordFindUnhealthyHostRequest() { findUnhealthyHostRequestSensor.record(); } - public void recordKeyNum(int keyNum) { - recordKeyCountMetric(RouterTehutiMetricNameEnum.KEY_NUM, keyNum, RequestValidationOutcome.VALID); + public void recordIncomingKeyCountMetric(int keyNum) { + keyNumSensor.record(keyNum); } - public void recordKeyCountMetric( - TehutiMetricNameEnum tehutiMetricNameEnum, - int keyNum, - RequestValidationOutcome outcome) { - Attributes dimensions = null; - if (emitOpenTelemetryMetrics) { - dimensions = Attributes.builder() - .putAll(commonMetricDimensions) - .put(getDimensionName(VENICE_REQUEST_VALIDATION_OUTCOME), outcome.getOutcome()) - .build(); - } - keyCountMetric.record(tehutiMetricNameEnum, keyNum, dimensions); + public void recordIncomingBadRequestKeyCountMetric(HttpResponseStatus responseStatus, int keyNum) { + badRequestKeyCountSensor.record(keyNum); + keyCountMetric.record(keyNum, getRequestMetricDimensions(responseStatus, VeniceResponseStatusCategory.FAIL)); } public void recordRequestUsage(int usage) { @@ -597,30 +609,6 @@ public void recordUnavailableRequest() { unAvailableRequestSensor.record(); } - public void recordDelayConstraintAbortedRetryRequest() { - recordAbortedRetrySensorOtel( - RouterTehutiMetricNameEnum.DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, - RequestRetryAbortReason.DELAY_CONSTRAINT); - } - - public void recordSlowRouteAbortedRetryRequest() { - recordAbortedRetrySensorOtel( - RouterTehutiMetricNameEnum.SLOW_ROUTE_ABORTED_RETRY_REQUEST, - RequestRetryAbortReason.SLOW_ROUTE); - } - - public void recordRetryRouteLimitAbortedRetryRequest() { - recordAbortedRetrySensorOtel( - RouterTehutiMetricNameEnum.RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, - RequestRetryAbortReason.MAX_RETRY_ROUTE_LIMIT); - } - - public void recordNoAvailableReplicaAbortedRetryRequest() { - recordAbortedRetrySensorOtel( - RouterTehutiMetricNameEnum.NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, - RequestRetryAbortReason.NO_AVAILABLE_REPLICA); - } - public void recordKeySizeInByte(long keySize) { if (keySizeSensor != null) { keySizeSensor.record(keySize); @@ -637,12 +625,11 @@ public void recordResponse() { } public void recordAllowedRetryRequest() { - allowedRetryCountMetric.record(RouterTehutiMetricNameEnum.ALLOWED_RETRY_REQUEST_COUNT, 1, commonMetricDimensions); + allowedRetryCountMetric.record(1, commonMetricDimensions); } public void recordDisallowedRetryRequest() { - disallowedRetryCountMetric - .record(RouterTehutiMetricNameEnum.DISALLOWED_RETRY_REQUEST_COUNT, 1, commonMetricDimensions); + disallowedRetryCountMetric.record(1, commonMetricDimensions); } public void recordErrorRetryAttemptTriggeredByPendingRequestCheck() { @@ -650,7 +637,7 @@ public void recordErrorRetryAttemptTriggeredByPendingRequestCheck() { } public void recordRetryDelay(double delay) { - retryDelayMetric.record(RouterTehutiMetricNameEnum.RETRY_DELAY, delay, commonMetricDimensions); + retryDelayMetric.record(delay, commonMetricDimensions); } public void recordMetaStoreShadowRead() { @@ -695,8 +682,6 @@ Attributes getCommonMetricDimensions() { * Metric names for tehuti metrics used in this class */ enum RouterTehutiMetricNameEnum implements TehutiMetricNameEnum { - /** for {@link RouterMetricEntity#INCOMING_CALL_COUNT} */ - REQUEST, /** for {@link RouterMetricEntity#CALL_COUNT} */ HEALTHY_REQUEST, UNHEALTHY_REQUEST, TARDY_REQUEST, THROTTLED_REQUEST, BAD_REQUEST, /** for {@link RouterMetricEntity#CALL_TIME} */ @@ -711,9 +696,7 @@ enum RouterTehutiMetricNameEnum implements TehutiMetricNameEnum { RETRY_DELAY, /** for {@link RouterMetricEntity#ABORTED_RETRY_COUNT} */ DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, SLOW_ROUTE_ABORTED_RETRY_REQUEST, RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, - NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, - /** for {@link RouterMetricEntity#CALL_KEY_COUNT} */ - KEY_NUM, BAD_REQUEST_KEY_COUNT; + NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST; private final String metricName; diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java index df0d5094071..3cda3c1ecea 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterMetricEntity.java @@ -6,7 +6,6 @@ import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_METHOD; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_ABORT_REASON; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_TYPE; -import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_VALIDATION_OUTCOME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; import static com.linkedin.venice.utils.Utils.setOf; @@ -22,12 +21,11 @@ * List all Metric entities for router */ public enum RouterMetricEntity { - INCOMING_CALL_COUNT( - MetricType.COUNTER, MetricUnit.NUMBER, "Count of all incoming requests", - setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) - ), + /** + * Count of all requests during response handling along with response codes + */ CALL_COUNT( - MetricType.COUNTER, MetricUnit.NUMBER, "Count of all requests with response details", + MetricType.COUNTER, MetricUnit.NUMBER, "Count of all requests during response handling along with response codes", setOf( VENICE_STORE_NAME, VENICE_CLUSTER_NAME, @@ -36,38 +34,66 @@ public enum RouterMetricEntity { HTTP_RESPONSE_STATUS_CODE_CATEGORY, VENICE_RESPONSE_STATUS_CODE_CATEGORY) ), + /** + * Latency based on all responses + */ CALL_TIME( MetricType.HISTOGRAM, MetricUnit.MILLISECOND, "Latency based on all responses", setOf( VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, + HTTP_RESPONSE_STATUS_CODE, HTTP_RESPONSE_STATUS_CODE_CATEGORY, VENICE_RESPONSE_STATUS_CODE_CATEGORY) ), - CALL_KEY_COUNT( - MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, MetricUnit.NUMBER, "Count of keys in multi key requests", - setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, VENICE_REQUEST_VALIDATION_OUTCOME) + /** + * Count of keys during response handling along with response codes + */ + KEY_COUNT( + MetricType.HISTOGRAM, MetricUnit.NUMBER, "Count of keys during response handling along with response codes", + setOf( + VENICE_STORE_NAME, + VENICE_CLUSTER_NAME, + VENICE_REQUEST_METHOD, + HTTP_RESPONSE_STATUS_CODE, + HTTP_RESPONSE_STATUS_CODE_CATEGORY, + VENICE_RESPONSE_STATUS_CODE_CATEGORY) ), + /** + * Count of retries triggered + */ RETRY_COUNT( MetricType.COUNTER, MetricUnit.NUMBER, "Count of retries triggered", setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, VENICE_REQUEST_RETRY_TYPE) ), + /** + * Count of allowed retry requests + */ ALLOWED_RETRY_COUNT( MetricType.COUNTER, MetricUnit.NUMBER, "Count of allowed retry requests", setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) ), + /** + * Count of disallowed retry requests + */ DISALLOWED_RETRY_COUNT( MetricType.COUNTER, MetricUnit.NUMBER, "Count of disallowed retry requests", setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) ), - RETRY_DELAY( - MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, MetricUnit.MILLISECOND, "Retry delay time", - setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) - ), + /** + * Count of aborted retry requests + */ ABORTED_RETRY_COUNT( MetricType.COUNTER, MetricUnit.NUMBER, "Count of aborted retry requests", setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD, VENICE_REQUEST_RETRY_ABORT_REASON) + ), + /** + * Retry delay time: Time in milliseconds between the original request and the retry request + */ + RETRY_DELAY( + MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, MetricUnit.MILLISECOND, "Retry delay time", + setOf(VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REQUEST_METHOD) ); private final MetricEntity metricEntity; diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java index 3ba47ad8f5b..88ed517bedf 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVenicePathParser.java @@ -46,6 +46,7 @@ import io.netty.handler.codec.http.EmptyHttpHeaders; import io.netty.handler.codec.http.HttpHeaders; import io.netty.handler.codec.http.HttpMethod; +import io.netty.handler.codec.http.HttpResponseStatus; import io.netty.handler.codec.http.HttpVersion; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -306,7 +307,8 @@ public void parseRequestWithBatchSizeViolation() throws RouterException { fail("A RouterException should be thrown here"); } catch (RouterException e) { // expected and validate bad request metric - verify(multiGetStats, times(1)).recordBadRequestKeyCount(storeName, maxKeyCount + 1); + verify(multiGetStats, times(1)) + .recordBadRequestKeyCount(storeName, HttpResponseStatus.REQUEST_ENTITY_TOO_LARGE, maxKeyCount + 1); } catch (Throwable t) { t.printStackTrace(); fail("Only RouterException is expected, but got: " + t.getClass()); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java index bb97fca491a..883dbeb6737 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceResponseAggregator.java @@ -41,6 +41,7 @@ import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpResponseStatus; import io.netty.handler.codec.http.HttpVersion; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -168,6 +169,11 @@ public void testBuildResponseForMultiGet() { VenicePath path = getPath(storeName, RequestType.MULTI_GET, mockRouterStat, request, compressorFactory); when(path.getChunkedResponse()).thenReturn(null); // non-streaming metrics.setPath(path); + List partitionKeys = new ArrayList<>(); + partitionKeys.add(new RouterKey("key1".getBytes(StandardCharsets.UTF_8))); + partitionKeys.add(new RouterKey("key2".getBytes(StandardCharsets.UTF_8))); + partitionKeys.add(new RouterKey("key3".getBytes(StandardCharsets.UTF_8))); + doReturn(partitionKeys).when(path).getPartitionKeys(); VeniceResponseAggregator responseAggregator = new VeniceResponseAggregator(mockRouterStat, Optional.empty()); FullHttpResponse finalResponse = responseAggregator.buildResponse(request, metrics, gatheredResponses); @@ -221,7 +227,7 @@ public void testBuildResponseForMultiGet() { FullHttpResponse response5 = buildFullHttpResponse(TOO_MANY_REQUESTS, new byte[0], headers); metrics.setMetric(MetricNames.ROUTER_SERVER_TIME, new TimeValue(1, TimeUnit.MILLISECONDS)); responseAggregator.buildResponse(request, metrics, Collections.singletonList(response5)); - verify(mockStatsForMultiGet).recordThrottledRequest(storeName, 1.0, TOO_MANY_REQUESTS); + verify(mockStatsForMultiGet).recordThrottledRequest(storeName, 1.0, TOO_MANY_REQUESTS, 3); } @Test diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStatsTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStatsTest.java index cebfcc14424..f4a84d5622b 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStatsTest.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/AggRouterHttpRequestStatsTest.java @@ -45,8 +45,8 @@ public void testAggRouterMetrics() { Assert.assertNotNull(metricsRepository.getMetric(".store1--request.Count")); Assert.assertEquals(reporter.query(".store1--request.Count").value(), 1d); - stats.recordThrottledRequest("store1", 1.0, TOO_MANY_REQUESTS); - stats.recordThrottledRequest("store2", 1.0, TOO_MANY_REQUESTS); + stats.recordThrottledRequest("store1", 1.0, TOO_MANY_REQUESTS, 1); + stats.recordThrottledRequest("store2", 1.0, TOO_MANY_REQUESTS, 1); stats.recordErrorRetryCount("store1"); Assert.assertEquals(reporter.query(".total--request.Count").value(), 2d); Assert.assertEquals(reporter.query(".store1--request.Count").value(), 1d); @@ -108,8 +108,8 @@ public void testDisableMultiGetStoreMetrics() { String storeName = Utils.getUniqueString("test-store"); multiGetStats.recordRequest(storeName); streamingMultiGetStats.recordRequest(storeName); - multiGetStats.recordHealthyRequest(storeName, 10, HttpResponseStatus.OK); - streamingMultiGetStats.recordHealthyRequest(storeName, 10, HttpResponseStatus.OK); + multiGetStats.recordHealthyRequest(storeName, 10, HttpResponseStatus.OK, 1); + streamingMultiGetStats.recordHealthyRequest(storeName, 10, HttpResponseStatus.OK, 1); // Total stats should exist for streaming and non-streaming multi-get Assert.assertEquals((int) reporter.query(".total--multiget_request.Count").value(), 1); Assert.assertEquals((int) reporter.query(".total--multiget_streaming_request.Count").value(), 1); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterHttpRequestStatsTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterHttpRequestStatsTest.java index 363fef8416e..7f711dcff23 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterHttpRequestStatsTest.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterHttpRequestStatsTest.java @@ -78,7 +78,7 @@ public void routerMetricsTest(boolean useVeniceMetricRepository, boolean isOtelE assertNull(routerHttpRequestStats.getCommonMetricDimensions()); } - routerHttpRequestStats.recordHealthyRequest(1.0, HttpResponseStatus.OK); + routerHttpRequestStats.recordHealthyRequest(1.0, HttpResponseStatus.OK, 1); assertEquals( metricsRepository.getMetric("." + storeName + "--" + HEALTHY_REQUEST.getMetricName() + ".Count").value(), 1.0); diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java index 8755ba2aeb7..c3ac31c6c9e 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterMetricEntityTest.java @@ -21,25 +21,13 @@ public class RouterMetricEntityTest { @Test public void testRouterMetricEntities() { Map expectedMetrics = new HashMap<>(); - - expectedMetrics.put( - RouterMetricEntity.INCOMING_CALL_COUNT, - new MetricEntity( - "incoming_call_count", - MetricType.COUNTER, - MetricUnit.NUMBER, - "Count of all incoming requests", - Utils.setOf( - VeniceMetricsDimensions.VENICE_STORE_NAME, - VeniceMetricsDimensions.VENICE_CLUSTER_NAME, - VeniceMetricsDimensions.VENICE_REQUEST_METHOD))); expectedMetrics.put( RouterMetricEntity.CALL_COUNT, new MetricEntity( "call_count", MetricType.COUNTER, MetricUnit.NUMBER, - "Count of all requests with response details", + "Count of all requests during response handling along with response codes", Utils.setOf( VeniceMetricsDimensions.VENICE_STORE_NAME, VeniceMetricsDimensions.VENICE_CLUSTER_NAME, @@ -58,20 +46,23 @@ public void testRouterMetricEntities() { VeniceMetricsDimensions.VENICE_STORE_NAME, VeniceMetricsDimensions.VENICE_CLUSTER_NAME, VeniceMetricsDimensions.VENICE_REQUEST_METHOD, + VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE, VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY, VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY))); expectedMetrics.put( - RouterMetricEntity.CALL_KEY_COUNT, + RouterMetricEntity.KEY_COUNT, new MetricEntity( - "call_key_count", - MetricType.MIN_MAX_COUNT_SUM_AGGREGATIONS, + "key_count", + MetricType.HISTOGRAM, MetricUnit.NUMBER, - "Count of keys in multi key requests", + "Count of keys during response handling along with response codes", Utils.setOf( VeniceMetricsDimensions.VENICE_STORE_NAME, VeniceMetricsDimensions.VENICE_CLUSTER_NAME, VeniceMetricsDimensions.VENICE_REQUEST_METHOD, - VeniceMetricsDimensions.VENICE_REQUEST_VALIDATION_OUTCOME))); + VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE, + VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY, + VeniceMetricsDimensions.VENICE_RESPONSE_STATUS_CODE_CATEGORY))); expectedMetrics.put( RouterMetricEntity.RETRY_COUNT, new MetricEntity(