From 46bd63e4079d03254767cec6a0710eaf486a4f6b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 11:29:11 -0700 Subject: [PATCH 01/37] Remove log files and add DCO (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../model/MetricAttributes.java | 1 - .../model/MetricsModel.java | 5 + .../performanceanalyzer/rca/Version.java | 7 +- .../api/metrics/SearchBackPressureStats.java | 15 ++ .../rca/framework/metrics/ReaderMetrics.java | 8 +- .../rca/store/OpenSearchAnalysisGraph.java | 3 +- .../reader/MetricsEmitter.java | 114 ++++++++++ .../reader/ReaderMetricsProcessor.java | 23 ++ .../SearchBackPressureMetricsProcessor.java | 197 ++++++++++++++++++ .../SearchBackPressureMetricsSnapShot.java | 179 ++++++++++++++++ ...earchBackPressureMetricsProcessorTest.java | 161 ++++++++++++++ ...SearchBackPressureMetricsSnapShotTest.java | 136 ++++++++++++ 12 files changed, 844 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java create mode 100644 src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java create mode 100644 src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java index 36230b8e5..414e266b7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java @@ -14,7 +14,6 @@ public class MetricAttributes { public HashSet dimensionNames; MetricAttributes(String unit, MetricDimension[] dimensions) { - this.unit = unit; this.dimensionNames = new HashSet(); for (MetricDimension dimension : dimensions) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java index 5b144ac12..f7c781ac1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java @@ -464,6 +464,11 @@ public class MetricsModel { MetricUnits.MILLISECOND.toString(), AllMetrics.ShardIndexingPressureDimension.values())); + // Search Back Pressure Metrics + allMetricsInitializer.put( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString(), + new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values())); ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index bfc85fcd3..ac53b4d72 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -22,8 +22,11 @@ public final class Version { * Note: The RCA version is agnostic of OpenSearch version. */ static final class Major { - // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change + // Bumping this post the Commons + // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service + // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) + // change static final int RCA_MAJ_VERSION = 1; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java new file mode 100644 index 000000000..ae5c59814 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java @@ -0,0 +1,15 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.framework.api.metrics; + + +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; + +public class SearchBackPressureStats extends Metric { + public SearchBackPressureStats(long evaluationIntervalSeconds) { + super("searchbp_shard_stats_cancellationCount", evaluationIntervalSeconds); + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java index 4c9fc5a04..ec6ce0fd9 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java @@ -86,7 +86,13 @@ public enum ReaderMetrics implements MeasurementSet { "FaultDetectionMetricsEmitterExecutionTime", "millis", StatsType.LATENCIES, - Statistics.SUM); + Statistics.SUM), + SEARCH_BACK_PRESSURE_METRICS_EMITTER_EXECUTION_TIME( + "SearchBackPressureMetricsEmitterExecutionTime", + "millis", + StatsType.LATENCIES, + Statistics.SUM), + ; /** What we want to appear as the metric name. */ private String name; diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index e144f2ee1..80763befb 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -183,7 +183,8 @@ public void construct() { // Use EVALUATION_INTERVAL_SECONDS instead of RCA_PERIOD which resolved to 12 seconds. // This is resulting in this RCA not getting executed in every 5 seconds. Rca> threadMetricsRca = - new ThreadMetricsRca(threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); + new ThreadMetricsRca( + threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); threadMetricsRca.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java index d209bf7f1..b060fb346 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java @@ -749,6 +749,120 @@ public static void emitGarbageCollectionInfo( ReaderMetrics.GC_INFO_EMITTER_EXECUTION_TIME, mFinalT - mCurrT); } + public static void emitSearchBackPressureMetrics( + MetricsDB metricsDB, + SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot) { + long mCurrT = System.currentTimeMillis(); + Result searchbp_records = searchBackPressureMetricsSnapShot.fetchAll(); + + // String SEARCHBP_MODE_DIM = "searchbp_mode"; + String SEARCHBP_TYPE_DIM = "SearchBackPressureStats"; + String SEARCHBP_TABLE_NAME = "searchbp_stats"; + + List dims = + new ArrayList() { + { + this.add(SEARCHBP_TYPE_DIM); + } + }; + + List stats_types = + new ArrayList() { + { + // Shard/Task Stats Cancellation Count + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()); + // Shard Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + // Task Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + } + }; + + metricsDB.createMetric(new Metric<>(SEARCHBP_TABLE_NAME, 0d), dims); + + BatchBindStep handle = metricsDB.startBatchPut(new Metric<>(SEARCHBP_TABLE_NAME, 0d), dims); + + for (Record record : searchbp_records) { + for (String stats_type : stats_types) { + Optional tmpStatsObj = Optional.ofNullable(record.get(stats_type)); + // LOG.info(stats_type + " is: " + tmpStatsObj.map(o -> + // Long.parseLong(o.toString())).toString()); + + handle.bind( + stats_type, + // the rest are agg fields: sum, avg, min, max which don't make sense for + // searchbackpressure + tmpStatsObj.map(o -> Long.parseLong(o.toString())).orElse(0L), + tmpStatsObj.map(o -> Long.parseLong(o.toString())).orElse(0L), + tmpStatsObj.map(o -> Long.parseLong(o.toString())).orElse(0L), + tmpStatsObj.map(o -> Long.parseLong(o.toString())).orElse(0L)); + } + } + + handle.execute(); + + long mFinalT = System.currentTimeMillis(); + LOG.debug( + "Total time taken for writing Search Back Pressure info into metricsDB: {}", + mFinalT - mCurrT); + ServiceMetrics.READER_METRICS_AGGREGATOR.updateStat( + ReaderMetrics.SEARCH_BACK_PRESSURE_METRICS_EMITTER_EXECUTION_TIME, + mFinalT - mCurrT); + } + public static void emitAdmissionControlMetrics( MetricsDB metricsDB, AdmissionControlSnapshot snapshot) { long mCurrT = System.currentTimeMillis(); diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java b/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java index 512c52f6d..3b446d95e 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java @@ -70,6 +70,7 @@ public class ReaderMetricsProcessor implements Runnable { clusterManagerThrottlingMetricsMap; private NavigableMap shardStateMetricsMap; private NavigableMap admissionControlMetricsMap; + private NavigableMap searchBackPressureMetricsMap; private static final int MAX_DATABASES = 2; private static final int OS_SNAPSHOTS = 4; @@ -81,6 +82,7 @@ public class ReaderMetricsProcessor implements Runnable { private static final int GC_INFO_SNAPSHOTS = 4; private static final int CLUSTER_MANAGER_THROTTLING_SNAPSHOTS = 2; private static final int AC_SNAPSHOTS = 2; + private static final int SEARCH_BP_SNAPSHOTS = 4; private final String rootLocation; private final AppContext appContext; @@ -125,6 +127,8 @@ public ReaderMetricsProcessor( gcInfoMap = new TreeMap<>(); clusterManagerThrottlingMetricsMap = new TreeMap<>(); admissionControlMetricsMap = new TreeMap<>(); + searchBackPressureMetricsMap = new TreeMap<>(); + this.rootLocation = rootLocation; this.configOverridesApplier = new ConfigOverridesApplier(); @@ -268,6 +272,7 @@ public void trimOldSnapshots() throws Exception { trimMap(gcInfoMap, GC_INFO_SNAPSHOTS); trimMap(clusterManagerThrottlingMetricsMap, CLUSTER_MANAGER_THROTTLING_SNAPSHOTS); trimMap(admissionControlMetricsMap, AC_SNAPSHOTS); + trimMap(searchBackPressureMetricsMap, SEARCH_BP_SNAPSHOTS); for (NavigableMap snap : nodeMetricsMap.values()) { // do the same thing as OS_SNAPSHOTS. Eventually MemoryDBSnapshot @@ -397,6 +402,7 @@ private void emitMetrics(long currWindowStartTime) throws Exception { emitAdmissionControlMetrics(prevWindowStartTime, metricsDB); emitClusterManagerMetrics(prevWindowStartTime, metricsDB); emitClusterManagerThrottlingMetrics(prevWindowStartTime, metricsDB); + emitSearchBackPressureMetrics(prevWindowStartTime, metricsDB); metricsDB.commit(); metricsDBMap.put(prevWindowStartTime, metricsDB); @@ -594,6 +600,19 @@ private void emitClusterManagerThrottlingMetrics( } } + private void emitSearchBackPressureMetrics(long prevWindowStartTime, MetricsDB metricsDB) + throws Exception { + if (searchBackPressureMetricsMap.containsKey(prevWindowStartTime)) { + SearchBackPressureMetricsSnapShot prevSearchBPSnapShot = + searchBackPressureMetricsMap.get(prevWindowStartTime); + MetricsEmitter.emitSearchBackPressureMetrics(metricsDB, prevSearchBPSnapShot); + } else { + LOG.debug( + "Search Back Pressure snapshot does not exist for the previous window. " + + "Not emitting metrics."); + } + } + /** * OS, Request, Http and cluster_manager first aligns the currentTimeStamp with a 5 second * interval. In the current format, a file (previously a directory) is written every 5 seconds. @@ -679,6 +698,9 @@ is ready so it starts to read that file (go back two windows and EventProcessor admissionControlProcessor = AdmissionControlProcessor.build( currWindowStartTime, conn, admissionControlMetricsMap); + EventProcessor searchBackPressureMetricsProcessor = + SearchBackPressureMetricsProcessor.buildSearchBackPressureMetricsProcessor( + currWindowStartTime, conn, searchBackPressureMetricsMap); // The event dispatcher dispatches events to each of the registered event processors. // In addition to event processing each processor has an initialize/finalize function that @@ -702,6 +724,7 @@ is ready so it starts to read that file (go back two windows and eventDispatcher.registerEventProcessor(faultDetectionProcessor); eventDispatcher.registerEventProcessor(garbageCollectorInfoProcessor); eventDispatcher.registerEventProcessor(admissionControlProcessor); + eventDispatcher.registerEventProcessor(searchBackPressureMetricsProcessor); eventDispatcher.initializeProcessing( currWindowStartTime, currWindowStartTime + MetricsConfiguration.SAMPLING_INTERVAL); diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java new file mode 100644 index 000000000..8c6e93d8c --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java @@ -0,0 +1,197 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.reader; + + +import java.sql.Connection; +import java.util.ArrayList; +import java.util.Map; +import java.util.NavigableMap; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jooq.BatchBindStep; +import org.opensearch.performanceanalyzer.commons.event_process.Event; +import org.opensearch.performanceanalyzer.commons.event_process.EventProcessor; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; +import org.opensearch.performanceanalyzer.commons.metrics.PerformanceAnalyzerMetrics; +import org.opensearch.performanceanalyzer.commons.util.JsonConverter; + +public class SearchBackPressureMetricsProcessor implements EventProcessor { + + private static final Logger LOG = + LogManager.getLogger(SearchBackPressureMetricsProcessor.class); + + // instance of SearchBackPressureMetricsSnapShot to interact with the backend db + private SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot; + + // entry point for batch queries + private BatchBindStep handle; + + // normally starTime and endTime are gapped by 5 seconds (default sampling interval) + private long startTime; + private long endTime; + + private SearchBackPressureMetricsProcessor( + SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot) { + this.searchBackPressureMetricsSnapShot = searchBackPressureMetricsSnapShot; + } + + /* + * if current SnapShotMap has the snapshot for currentWindowStartTime, use the snapshot to build the processor + * else create a new Instance of SearchBackPressureMetricsSnapShot to initialize the processor + */ + static SearchBackPressureMetricsProcessor buildSearchBackPressureMetricsProcessor( + long currentWindowStartTime, + Connection connection, + NavigableMap + searchBackPressureSnapshotNavigableMap) { + // if current metrics is in searchBackPressureSnapshotNavigableMap map + if (searchBackPressureSnapshotNavigableMap.get(currentWindowStartTime) == null) { + SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot = + new SearchBackPressureMetricsSnapShot(connection, currentWindowStartTime); + searchBackPressureSnapshotNavigableMap.put( + currentWindowStartTime, searchBackPressureMetricsSnapShot); + return new SearchBackPressureMetricsProcessor(searchBackPressureMetricsSnapShot); + } + + return new SearchBackPressureMetricsProcessor( + searchBackPressureSnapshotNavigableMap.get(currentWindowStartTime)); + } + + @Override + public void initializeProcessing(long startTime, long endTime) { + this.startTime = startTime; + this.endTime = endTime; + this.handle = searchBackPressureMetricsSnapShot.startBatchPut(); + } + + @Override + public void finalizeProcessing() { + if (handle.size() > 0) { + handle.execute(); + } + } + + @Override + public boolean shouldProcessEvent(Event event) { + return event.key.contains(PerformanceAnalyzerMetrics.sSearchBackPressureMetricsPath); + } + + @Override + public void commitBatchIfRequired() { + if (handle.size() >= BATCH_LIMIT) { + handle.execute(); + handle = searchBackPressureMetricsSnapShot.startBatchPut(); + } + } + + // Handler method for incoming events + private void handleSearchBackPressureEvent(String eventValue) { + String[] lines = eventValue.split(System.lineSeparator()); + // 0thline is current time string (e.g. {current_time:1686952296889}) + // 1st line is the payload the metrics + if (lines.length < 2) { + throw new RuntimeException("Missing SearchBackPressure Metrics payload and timestamp."); + // return; + } + + // Parse metrics payload + parseJsonLine(lines[1]); + } + + private void parseJsonLine(final String jsonString) { + Map map = JsonConverter.createMapFrom(jsonString); + + if (map.isEmpty()) { + throw new RuntimeException("Missing SearchBackPressure Metrics payload."); + // return; + } + // A list of dims to be collected + ArrayList required_searchbp_dims = + new ArrayList() { + { + // Shard/Task Stats Cancellation Count + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()); + + // Shard Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + + // Task Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + } + }; + + Object[] bindVals = new Object[required_searchbp_dims.size()]; + int idx = 0; + for (String dimension : required_searchbp_dims) { + bindVals[idx++] = map.get(dimension); + } + + handle.bind(bindVals); + } + + @Override + public void processEvent(Event event) { + // Handler method for incoming event + handleSearchBackPressureEvent(event.value); + + // commit Batch queries is overflow the limit + commitBatchIfRequired(); + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java new file mode 100644 index 000000000..b995cbe44 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java @@ -0,0 +1,179 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.reader; + + +import java.sql.Connection; +import java.util.ArrayList; +import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jooq.BatchBindStep; +import org.jooq.DSLContext; +import org.jooq.Field; +import org.jooq.Record; +import org.jooq.Result; +import org.jooq.SQLDialect; +import org.jooq.impl.DSL; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; + +public class SearchBackPressureMetricsSnapShot implements Removable { + + // Logger for current class + private static final Logger LOG = LogManager.getLogger(SearchBackPressureMetricsSnapShot.class); + + // entry point to interact with SQLite db + private final DSLContext create; + + private final String tableName; + private List> columns; + + // Global variables for naming + private static final String SEARCHBP_CONTROLLER_NAME_VALUE = "ControllerName"; + private static final String SEARCHBP_MODE_VALUE = "searchbp_mode"; + + // Create a table with specifed fields (columns) + public SearchBackPressureMetricsSnapShot(Connection conn, Long windowStartTime) { + this.create = DSL.using(conn, SQLDialect.SQLITE); + this.tableName = "search_back_pressure_" + windowStartTime; + + // Add the ControllerName, searchbp_mode columns in the table + this.columns = + new ArrayList>() { + { + // Shard/Task Stats Cancellation Count + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()), + Long.class)); + + // Shard Stats Resource Heap / CPU Usage + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()), + Long.class)); + + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()), + Long.class)); + + // Task Stats Resource Heap / CPU Usage + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()), + Long.class)); + + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()), + Long.class)); + this.add( + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()), + Long.class)); + } + }; + + // create table with columns specified + create.createTable(tableName).columns(columns).execute(); + } + + public DSLContext getDSLContext() { + return create; + } + + public BatchBindStep startBatchPut() { + List dummyValues = new ArrayList<>(); + for (int i = 0; i < columns.size(); i++) { + dummyValues.add(null); + } + return create.batch(create.insertInto(DSL.table(this.tableName)).values(dummyValues)); + } + + public Result fetchAll() { + return create.select().from(DSL.table(tableName)).fetch(); + } + + @Override + public void remove() throws Exception { + create.dropTable(DSL.table(tableName)).execute(); + } +} diff --git a/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java new file mode 100644 index 000000000..6abbf4a90 --- /dev/null +++ b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java @@ -0,0 +1,161 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.reader; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.util.NavigableMap; +import java.util.TreeMap; +import org.jooq.Record; +import org.jooq.Result; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.opensearch.performanceanalyzer.commons.event_process.Event; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; +import org.opensearch.performanceanalyzer.commons.metrics.PerformanceAnalyzerMetrics; + +public class SearchBackPressureMetricsProcessorTest { + private static final String DB_URL = "jdbc:sqlite:"; + // private static final String TEST_MEM_POOL = "testMemPool"; + // private static final String COLLECTOR_NAME = "testCollectorName"; + private static final String SEARCH_BACK_PRESSURE_STATS_KEY = "search_back_pressure_stats"; + private SearchBackPressureMetricsProcessor searchBackPressureMetricsProcessor; + private long currTimeStamp; + + private NavigableMap searchBackPressureStatsMap; + Connection conn; + + // mock SearchBackPressureStatsCollector to test Event processing + private static final String SERIALIZED_EVENT = + "{\"searchbp_shard_stats_cancellationCount\":2," + + "\"searchbp_shard_stats_limitReachedCount\":2," + + "\"searchbp_shard_stats_resource_heap_usage_cancellationCount\":3," + + "\"searchbp_shard_stats_resource_heap_usage_currentMax\":3," + + "\"searchbp_shard_stats_resource_heap_usage_rollingAvg\":3," + + "\"searchbp_shard_stats_resource_cpu_usage_cancellationCount\":5," + + "\"searchbp_shard_stats_resource_cpu_usage_currentMax\":5," + + "\"searchbp_shard_stats_resource_cpu_usage_currentAvg\":5," + + "\"searchbp_shard_stats_resource_elaspedtime_usage_cancellationCount\":2," + + "\"searchbp_shard_stats_resource_elaspedtime_usage_currentMax\":2," + + "\"searchbp_shard_stats_resource_elaspedtime_usage_currentAvg\":2," + + "\"searchbp_task_stats_cancellationCount\":0," + + "\"searchbp_task_stats_limitReachedCount\":0," + + "\"searchbp_task_stats_resource_heap_usage_cancellationCount\":0," + + "\"searchbp_task_stats_resource_heap_usage_currentMax\":0," + + "\"searchbp_task_stats_resource_heap_usage_rollingAvg\":0," + + "\"searchbp_task_stats_resource_cpu_usage_cancellationCount\":0," + + "\"searchbp_task_stats_resource_cpu_usage_currentMax\":0," + + "\"searchbp_task_stats_resource_cpu_usage_currentAvg\":0," + + "\"searchbp_task_stats_resource_elaspedtime_usage_cancellationCount\":0," + + "\"searchbp_task_stats_resource_elaspedtime_usage_currentMax\":0," + + "\"searchbp_task_stats_resource_elaspedtime_usage_currentAvg\":0," + + "\"searchbp_mode\":\"MONITOR_ONLY\"," + + "\"searchbp_nodeid\":\"FgNAAAQQQDSROABCDEFHTX\"}"; + + @Before + public void setup() throws Exception { + Class.forName("org.sqlite.JDBC"); + System.setProperty("java.io.tmpdir", "/tmp"); + conn = DriverManager.getConnection(DB_URL); + this.currTimeStamp = System.currentTimeMillis(); + this.searchBackPressureStatsMap = new TreeMap<>(); + this.searchBackPressureMetricsProcessor = + searchBackPressureMetricsProcessor.buildSearchBackPressureMetricsProcessor( + currTimeStamp, conn, searchBackPressureStatsMap); + } + + // Test valid case of the handleSearchBackPressureEvent() + @Test + public void testSearchBackPressureProcessEvent() throws Exception { + // Create a SearchBackPressureEvent + Event testEvent = buildTestSearchBackPressureStatsEvent(); + + // Test the SearchBackPressureMetricsSnapShot + searchBackPressureMetricsProcessor.initializeProcessing( + this.currTimeStamp, System.currentTimeMillis()); + assertTrue(searchBackPressureMetricsProcessor.shouldProcessEvent(testEvent)); + + searchBackPressureMetricsProcessor.processEvent(testEvent); + searchBackPressureMetricsProcessor.finalizeProcessing(); + + SearchBackPressureMetricsSnapShot currSnapshot = + searchBackPressureStatsMap.get(this.currTimeStamp); + Result result = currSnapshot.fetchAll(); + assertEquals(1, result.size()); + + // SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG value is 3L according to the + // SERIALIZED_EVENT, should EQUAL + Assert.assertEquals( + 3L, + result.get(0) + .get( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString())); + // SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT value is 0L according to the + // SERIALIZED_EVENT, should EQUAL + Assert.assertEquals( + 0L, + result.get(0) + .get( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString())); + + // SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT value is 0L according to the + // SERIALIZED_EVENT, should NOT EQUAL + Assert.assertNotEquals( + 2L, + result.get(0) + .get( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString())); + } + + @Test + public void testEmptySearchBackPressureProcessEvent() throws Exception { + // Create a SearchBackPressureEvent + Event testEvent = buildEmptyTestSearchBackPressureStatsEvent(); + + // Test the SearchBackPressureMetricsSnapShot + searchBackPressureMetricsProcessor.initializeProcessing( + this.currTimeStamp, System.currentTimeMillis()); + assertTrue(searchBackPressureMetricsProcessor.shouldProcessEvent(testEvent)); + + try { + searchBackPressureMetricsProcessor.processEvent(testEvent); + Assert.assertFalse( + "Negative scenario test: Should catch a RuntimeException and skip this test", + true); + } catch (RuntimeException ex) { + // should catch the exception and the previous assertion should not be executed + } + } + + private Event buildTestSearchBackPressureStatsEvent() { + StringBuilder str = new StringBuilder(); + str.append(PerformanceAnalyzerMetrics.getJsonCurrentMilliSeconds()) + .append(PerformanceAnalyzerMetrics.sMetricNewLineDelimitor); + + str.append(SERIALIZED_EVENT).append(PerformanceAnalyzerMetrics.sMetricNewLineDelimitor); + return new Event( + SEARCH_BACK_PRESSURE_STATS_KEY, str.toString(), System.currentTimeMillis()); + } + + private Event buildEmptyTestSearchBackPressureStatsEvent() { + StringBuilder str = new StringBuilder(); + str.append(PerformanceAnalyzerMetrics.getJsonCurrentMilliSeconds()) + .append(PerformanceAnalyzerMetrics.sMetricNewLineDelimitor); + + return new Event( + SEARCH_BACK_PRESSURE_STATS_KEY, str.toString(), System.currentTimeMillis()); + } +} diff --git a/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java new file mode 100644 index 000000000..eeaa1a30f --- /dev/null +++ b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java @@ -0,0 +1,136 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.reader; + +import static org.junit.Assert.assertEquals; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.util.ArrayList; +import org.jooq.BatchBindStep; +import org.jooq.Record; +import org.jooq.Result; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; + +public class SearchBackPressureMetricsSnapShotTest { + private static final String DB_URL = "jdbc:sqlite:"; + private Connection conn; + SearchBackPressureMetricsSnapShot snapshot; + + ArrayList required_searchbp_dims = + new ArrayList() { + { + // Shard/Task Stats Cancellation Count + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()); + + // Shard Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + + // Task Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + } + }; + + @Before + public void setup() throws Exception { + Class.forName("org.sqlite.JDBC"); + System.setProperty("java.io.tmpdir", "/tmp"); + conn = DriverManager.getConnection(DB_URL); + snapshot = new SearchBackPressureMetricsSnapShot(conn, System.currentTimeMillis()); + } + + @Test + public void testReadSearchBackPressureMetricsSnapshot() throws Exception { + final BatchBindStep handle = snapshot.startBatchPut(); + insertIntoTable(handle); + + final Result result = snapshot.fetchAll(); + + assertEquals(1, result.size()); + // for 14 (length of required_searchbp_dims) fields, each assign a value from 0 to 13 + // test each field and verify the result + for (long i = 0; i < required_searchbp_dims.size(); i++) { + Assert.assertEquals( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString() + + " should be " + + String.valueOf(i), + i, + result.get(0).get(required_searchbp_dims.get((int) i))); + } + } + + @After + public void tearDown() throws Exception { + conn.close(); + } + + private void insertIntoTable(BatchBindStep handle) { + Object[] bindVals = new Object[required_searchbp_dims.size()]; + for (int i = 0; i < required_searchbp_dims.size(); i++) { + bindVals[i] = Long.valueOf(i); + } + + handle.bind(bindVals).execute(); + } +} From 92c3fc808a7aff3e914e44797847b8bbbb2b5264 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 11:38:55 -0700 Subject: [PATCH 02/37] Remove extra files (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../api/metrics/SearchBackPressureStats.java | 15 --------------- .../SearchBackPressureMetricsProcessor.java | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) delete mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java deleted file mode 100644 index ae5c59814..000000000 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.performanceanalyzer.rca.framework.api.metrics; - - -import org.opensearch.performanceanalyzer.rca.framework.api.Metric; - -public class SearchBackPressureStats extends Metric { - public SearchBackPressureStats(long evaluationIntervalSeconds) { - super("searchbp_shard_stats_cancellationCount", evaluationIntervalSeconds); - } -} diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java index 8c6e93d8c..8eec8a831 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java @@ -38,7 +38,7 @@ private SearchBackPressureMetricsProcessor( SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot) { this.searchBackPressureMetricsSnapShot = searchBackPressureMetricsSnapShot; } - + /* * if current SnapShotMap has the snapshot for currentWindowStartTime, use the snapshot to build the processor * else create a new Instance of SearchBackPressureMetricsSnapShot to initialize the processor From a47388af973f39d93f049ef60e9d9b043bf9aa2b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 11:42:43 -0700 Subject: [PATCH 03/37] Remove styling difference (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/model/MetricAttributes.java | 1 + .../org/opensearch/performanceanalyzer/rca/Version.java | 7 ++----- .../rca/store/OpenSearchAnalysisGraph.java | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java index 414e266b7..36230b8e5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java @@ -14,6 +14,7 @@ public class MetricAttributes { public HashSet dimensionNames; MetricAttributes(String unit, MetricDimension[] dimensions) { + this.unit = unit; this.dimensionNames = new HashSet(); for (MetricDimension dimension : dimensions) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index ac53b4d72..bfc85fcd3 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -22,11 +22,8 @@ public final class Version { * Note: The RCA version is agnostic of OpenSearch version. */ static final class Major { - // Bumping this post the Commons - // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service - // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) - // change + // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change static final int RCA_MAJ_VERSION = 1; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 80763befb..e144f2ee1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -183,8 +183,7 @@ public void construct() { // Use EVALUATION_INTERVAL_SECONDS instead of RCA_PERIOD which resolved to 12 seconds. // This is resulting in this RCA not getting executed in every 5 seconds. Rca> threadMetricsRca = - new ThreadMetricsRca( - threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); + new ThreadMetricsRca(threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); threadMetricsRca.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); From 8b86501bec7ec07d0c87c82515883c61e7dacb72 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 12:00:58 -0700 Subject: [PATCH 04/37] Remove unnecessary file changes (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../opensearch/performanceanalyzer/model/MetricsModel.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java index f7c781ac1..5b144ac12 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java @@ -464,11 +464,6 @@ public class MetricsModel { MetricUnits.MILLISECOND.toString(), AllMetrics.ShardIndexingPressureDimension.values())); - // Search Back Pressure Metrics - allMetricsInitializer.put( - AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT - .toString(), - new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values())); ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer); } } From c6549a9be4dd77e7c9bc1bc5b96883b17fc7aa48 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 26 Jun 2023 16:06:04 -0700 Subject: [PATCH 05/37] Add RCA_Decider (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../model/MetricsModel.java | 5 + .../configs/SearchBackPressureRcaConfig.java | 32 ++++ .../framework/api/metrics/Searchbp_Stats.java | 18 ++ .../rca/framework/core/RcaConf.java | 5 + .../rca/store/OpenSearchAnalysisGraph.java | 12 +- .../SearchBackPressurClusterRCA.java | 22 +++ .../SearchBackPressureRCA.java | 162 ++++++++++++++++++ 7 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java index 5b144ac12..f7c781ac1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java @@ -464,6 +464,11 @@ public class MetricsModel { MetricUnits.MILLISECOND.toString(), AllMetrics.ShardIndexingPressureDimension.values())); + // Search Back Pressure Metrics + allMetricsInitializer.put( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString(), + new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values())); ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java new file mode 100644 index 000000000..e646c3f69 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -0,0 +1,32 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.configs; + + +import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; + +public class SearchBackPressureRcaConfig { + public static final String CONFIG_NAME = "search-back-pressure-rca-policy"; + + // INTERVAL PERIOD IN SECONDS + public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; + + // Increase Threshold + // node max heap usage in last 60 secs is less than 70% + public static final int DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD = 70; + + // cancellationCount due to heap is more than 50% of all task cancellations. + public static final int DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + + // Decrease Threshold + // node min heap usage in last 60 secs is more than 80% + public static final int DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD = 80; + + // cancellationCount due to heap is more than 30% of all task cancellations + public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + + public SearchBackPressureRcaConfig(final RcaConf conf) {} +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java new file mode 100644 index 000000000..afe553da0 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java @@ -0,0 +1,18 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.framework.api.metrics; + + +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; + +public class Searchbp_Stats extends Metric { + public static final String NAME = AllMetrics.HeapValue.HEAP_USED.name(); + + public Heap_Used(long evaluationIntervalSeconds) { + super(AllMetrics.HeapValue.HEAP_USED.toString(), evaluationIntervalSeconds); + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java index 4005e1c15..0ff06f0af 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java @@ -51,6 +51,7 @@ import org.opensearch.performanceanalyzer.rca.configs.HotShardRcaConfig; import org.opensearch.performanceanalyzer.rca.configs.OldGenContendedRcaConfig; import org.opensearch.performanceanalyzer.rca.configs.QueueRejectionRcaConfig; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.configs.ShardRequestCacheRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.bucket.BasicBucketCalculator; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.bucket.BucketCalculator; @@ -232,6 +233,10 @@ public OldGenContendedRcaConfig getOldGenContendedRcaConfig() { return new OldGenContendedRcaConfig(this); } + public SearchBackPressureRcaConfig getSearchBackPressureRcaConfig() { + return new SearchBackPressureRcaConfig(this); + } + public T readRcaConfig( String rcaName, String key, T defaultValue, Class clazz) { return readRcaConfig(rcaName, key, defaultValue, (s) -> true, clazz); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index e144f2ee1..c24ac1f47 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -85,6 +85,7 @@ import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenContendedRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenReclamationRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureRCA; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.NodeTemperatureRca; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.dimension.CpuUtilDimensionTemperatureRca; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.dimension.HeapAllocRateTemperatureRca; @@ -183,7 +184,8 @@ public void construct() { // Use EVALUATION_INTERVAL_SECONDS instead of RCA_PERIOD which resolved to 12 seconds. // This is resulting in this RCA not getting executed in every 5 seconds. Rca> threadMetricsRca = - new ThreadMetricsRca(threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); + new ThreadMetricsRca( + threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); threadMetricsRca.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); @@ -432,6 +434,14 @@ public void construct() { shardRequestCacheClusterRca, highHeapUsageClusterRca)); + // Search Back Pressure Service RCA + final SearchBackPressureRCA searchBackPressureRCA = + new SearchBackPressureRCA(heapMax, heapUsed, gcType); + searchBackPressureRCA.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, gcType)); + AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java new file mode 100644 index 000000000..b97d9c0cd --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java @@ -0,0 +1,22 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; + + +import org.opensearch.performanceanalyzer.rca.framework.api.Rca; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.store.rca.cluster.BaseClusterRca; + +public class SearchBackPressurClusterRCA extends BaseClusterRca { + + public static final String RCA_TABLE_NAME = SearchBackPressurClusterRCA.class.getSimpleName(); + + public >> SearchBackPressurClusterRCA( + final int rcaPeriod, final R SearchBackPressureRCA) { + super(rcaPeriod, SearchBackPressureRCA); + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java new file mode 100644 index 000000000..1937e0e0b --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -0,0 +1,162 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; + +import java.util.ArrayList; +import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jooq.Field; +import org.opensearch.performanceanalyzer.grpc.FlowUnitMessage; +import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; +import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; +import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; + +public class SearchBackPressureRCA extends OldGenRca> { + // LOGGER for SearchBackPressureRCA + private static final Logger LOG = LogManager.getLogger(SearchBackPressureRCA.class); + private static final double BYTES_TO_GIGABYTES = Math.pow(1024, 3); + private static final long EVAL_INTERVAL_IN_S = 5; + + // Key Metrics to be used to determine health status + // Task Level cancellationCount + // Shard Level cancellationCount + // Task level max heap usage + // Shard level max heap usage + // total node heap usage + private final Metric heapUsed; + // private final Metric SearchBPCancellationJVMPercentage; + + private long SearchBPCancellationJVMThreshold; + + // cases to incrase threshold + private long heapUsedIncreaseMaxThreshold; + private long heapCancellationIncreaseMaxThreshold; + + // case to decrease threshold + private long heapUsedDecreaseMinThreshold; + private long heapCancellationDecreaseMaxThreashold; + + // Period: 60s + + // track how many samples has been checked (only reach 60s (12 * 5s) to execute + // operate()) + private long counter; + + // key functions to be overriden + // operate(): determine whether to generate of flow unit of HEALTHY or UNHEALTHY + // readRcaConf(): read the key configuration metrics like heapMaxThreshold, + // heapMinThreshold, + // cancellationHeapPercentageThreshold + // counter to keep track of times of checking, as the default sliding window is + // 60 times, and + // interval for RCA scanning is 5s + // counter needs to be at least 12 to trigger operate(): 12 is the + // rcaSamplesBeforeEval + + // generateFlowUnitListFromWite() gets wireFlowUnits() (Do we need this?) + + // Not to be overriden but need to have + // read_cancellationcount_from_sql_shard + // read_cancellationcount_from_sql_task + // read_heapused_from_sql + // for heapused, simply call getOldGenUsedOrDefault() from OldGenRca.java + public SearchBackPressureRCA(final Metric heapMax, final Metric heapUsed, Metric gcType) { + super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); + this.heapUsed = heapUsed; + this.heapUsedIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; + this.heapCancellationIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; + this.heapUsedDecreaseMinThreshold = + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD; + this.heapCancellationDecreaseMaxThreashold = + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; + + LOG.info("SearchBackPressureRCA initialized"); + } + + /* + * operate() is used for local build + * generateFlowUnitListFromWire simply use remote flowunits to + */ + @Override + public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { + final List flowUnitMessages = + args.getWireHopper().readFromWire(args.getNode()); + final List> flowUnitList = new ArrayList<>(); + LOG.debug("rca: Executing fromWire: {}", this.getClass().getSimpleName()); + for (FlowUnitMessage flowUnitMessage : flowUnitMessages) { + flowUnitList.add(ResourceFlowUnit.buildFlowUnitFromWrapper(flowUnitMessage)); + } + setFlowUnits(flowUnitList); + } + + @Override + public ResourceFlowUnit operate() { + LOG.info("SearchBackPressureRCA operate() intiatilized"); + // Use OldGenRca.java to get heap usage and max heap size + double prevHeapUsage = getOldGenUsedOrDefault(0d); + double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); + + double heapUsedPercentage = prevHeapUsage / maxHeapSize; + + // function to read cancellation count from sql + + // print out oldGenUsed and maxOldGen + LOG.info( + "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, heapUsedPercentage: {}", + prevHeapUsage, + maxHeapSize, + heapUsedPercentage); + LOG.info("SearchBackPressureRCA operate() finished"); + return null; + } + + private long getSearchBackPressureShardCancellationCount() { + getMetric(null, null, null) + return 0; + } + + private long getSearchBackPressureTaskCancellationCount() { + return 0; + } + + private double getMetric(M metric, Field field, String fieldName) { + double response = 0; + for (MetricFlowUnit flowUnit : metric.getFlowUnits()) { + if (!flowUnit.isEmpty()) { + double metricResponse = + readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); + if (!Double.isNaN(metricResponse) && metricResponse > 0) { + response = metricResponse; + } + } + } + return response; + } + + /** + * read threshold values from rca.conf + * + * @param conf RcaConf object + */ + @Override + public void readRcaConf(RcaConf conf) { + // only initialized one time + LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); + final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); + // read anything from config file in runtime + // if not just skip it + } +} From a296384f78a79513b85d944c8b8890d67cb6459e Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 26 Jun 2023 22:14:06 -0700 Subject: [PATCH 06/37] Extract Heap Usage from SQlitedb (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/rca/Version.java | 9 +++++--- .../framework/api/metrics/Searchbp_Stats.java | 8 +++---- .../rca/store/OpenSearchAnalysisGraph.java | 7 +++++++ .../SearchBackPressureRCA.java | 21 +++++++++++++++++-- .../reader/MetricsEmitter.java | 6 ++++-- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index bfc85fcd3..402013cf7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -19,11 +19,14 @@ public final class Version { * transferred packets should be dropped. Every increment here should be accompanied with a line * describing the version bump. * - * Note: The RCA version is agnostic of OpenSearch version. + *

Note: The RCA version is agnostic of OpenSearch version. */ static final class Major { - // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change + // Bumping this post the Commons + // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service + // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) + // change static final int RCA_MAJ_VERSION = 1; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java index afe553da0..e655e4edc 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java @@ -10,9 +10,9 @@ import org.opensearch.performanceanalyzer.rca.framework.api.Metric; public class Searchbp_Stats extends Metric { - public static final String NAME = AllMetrics.HeapValue.HEAP_USED.name(); - - public Heap_Used(long evaluationIntervalSeconds) { - super(AllMetrics.HeapValue.HEAP_USED.toString(), evaluationIntervalSeconds); + public Searchbp_Stats(long evaluationIntervalSeconds) { + super( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TABLE_NAME.toString(), + evaluationIntervalSeconds); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index c24ac1f47..1cb4966af 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -118,6 +118,9 @@ public void construct() { MetricsDB.AVG, AllMetrics.CommonDimension.OPERATION.toString()); + // SearchBackpressure Metric + // Metric searchbp_Stats = new Searchbp_Stats(EVALUATION_INTERVAL_SECONDS); + heapUsed.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); @@ -142,6 +145,9 @@ public void construct() { threadWaitedTime.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + // searchbp_Stats.addTag( + // RcaConsts.RcaTagConstants.TAG_LOCUS, + // RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); addLeaf(heapUsed); addLeaf(gcEvent); @@ -151,6 +157,7 @@ public void construct() { addLeaf(cpuUtilizationGroupByOperation); addLeaf(threadBlockedTime); addLeaf(threadWaitedTime); + // addLeaf(searchbp_Stats); // add node stats metrics List nodeStatsMetrics = constructNodeStatsMetrics(); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 1937e0e0b..9b3dec835 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -5,6 +5,8 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; +import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; + import java.util.ArrayList; import java.util.List; import org.apache.logging.log4j.LogManager; @@ -20,7 +22,6 @@ import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; -import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; public class SearchBackPressureRCA extends OldGenRca> { // LOGGER for SearchBackPressureRCA @@ -35,6 +36,7 @@ public class SearchBackPressureRCA extends OldGenRca operate() { } private long getSearchBackPressureShardCancellationCount() { - getMetric(null, null, null) + // Use Searchbp_Stats metrics to get the metrics value + // Field shard_cancellation_count_field = + // DSL.field( + // DSL.name( + // AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM + // .toString()), + // String.class); + // double searchbpShardCancellationCount = + // getMetric(this.searchbp_Stats, shard_cancellation_count_field, "avg"); + + // LOG searchbpShardCancellationCount + // LOG.info( + // "SearchBackPressureRCA: searchbpShardCancellationCount: {}", + // searchbpShardCancellationCount); + return 0; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java index b060fb346..fa7008748 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java @@ -756,8 +756,10 @@ public static void emitSearchBackPressureMetrics( Result searchbp_records = searchBackPressureMetricsSnapShot.fetchAll(); // String SEARCHBP_MODE_DIM = "searchbp_mode"; - String SEARCHBP_TYPE_DIM = "SearchBackPressureStats"; - String SEARCHBP_TABLE_NAME = "searchbp_stats"; + String SEARCHBP_TYPE_DIM = + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM.toString(); + String SEARCHBP_TABLE_NAME = + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TABLE_NAME.toString(); List dims = new ArrayList() { From c44b9287517d7001df37b7b52babe437ecd7b0d2 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 26 Jun 2023 23:14:37 -0700 Subject: [PATCH 07/37] Extract required searchbp metrics for deciders (signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/store/OpenSearchAnalysisGraph.java | 16 ++-- .../SearchBackPressureRCA.java | 77 +++++++++++++++---- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 1cb4966af..5d86ca57c 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -41,6 +41,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Heap_Max; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Heap_Used; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.IndexWriter_Memory; +import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Searchbp_Stats; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_QueueCapacity; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_RejectedReqs; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Thread_Blocked_Time; @@ -119,7 +120,7 @@ public void construct() { AllMetrics.CommonDimension.OPERATION.toString()); // SearchBackpressure Metric - // Metric searchbp_Stats = new Searchbp_Stats(EVALUATION_INTERVAL_SECONDS); + Metric searchbp_Stats = new Searchbp_Stats(EVALUATION_INTERVAL_SECONDS); heapUsed.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, @@ -145,9 +146,9 @@ public void construct() { threadWaitedTime.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - // searchbp_Stats.addTag( - // RcaConsts.RcaTagConstants.TAG_LOCUS, - // RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + searchbp_Stats.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); addLeaf(heapUsed); addLeaf(gcEvent); @@ -157,7 +158,7 @@ public void construct() { addLeaf(cpuUtilizationGroupByOperation); addLeaf(threadBlockedTime); addLeaf(threadWaitedTime); - // addLeaf(searchbp_Stats); + addLeaf(searchbp_Stats); // add node stats metrics List nodeStatsMetrics = constructNodeStatsMetrics(); @@ -443,11 +444,12 @@ public void construct() { // Search Back Pressure Service RCA final SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(heapMax, heapUsed, gcType); + new SearchBackPressureRCA(heapMax, heapUsed, gcType, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, gcType)); + searchBackPressureRCA.addAllUpstreams( + Arrays.asList(heapMax, heapUsed, gcType, searchbp_Stats)); AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 9b3dec835..f51570309 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -12,6 +12,8 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.Field; +import org.jooq.impl.DSL; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; import org.opensearch.performanceanalyzer.grpc.FlowUnitMessage; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; @@ -36,8 +38,7 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( + final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; - // this.searchbp_Stats = new Searchbp_Stats(5); + this.searchbp_Stats = searchbp_Stats; this.heapUsedIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; this.heapCancellationIncreaseMaxThreshold = @@ -115,6 +117,7 @@ public ResourceFlowUnit operate() { double heapUsedPercentage = prevHeapUsage / maxHeapSize; // function to read cancellation count from sql + getSearchBackPressureShardCancellationCount(); // print out oldGenUsed and maxOldGen LOG.info( @@ -127,21 +130,61 @@ public ResourceFlowUnit operate() { } private long getSearchBackPressureShardCancellationCount() { + LOG.info("getSearchBackPressureShardCancellationCount() STARTED"); + // Use Searchbp_Stats metrics to get the metrics value - // Field shard_cancellation_count_field = - // DSL.field( - // DSL.name( - // AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM - // .toString()), - // String.class); - // double searchbpShardCancellationCount = - // getMetric(this.searchbp_Stats, shard_cancellation_count_field, "avg"); - - // LOG searchbpShardCancellationCount - // LOG.info( - // "SearchBackPressureRCA: searchbpShardCancellationCount: {}", - // searchbpShardCancellationCount); + // shard level cancellation count + Field searchbp_stats_type_field = + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM + .toString()), + String.class); + + double searchbpShardCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()); + double searchbpTaskCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()); + double searchbpJVMShardCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + double searchbpJVMTaskCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + LOG.info( + "SearchBackPressureRCA: searchbpShardCancellationCount: {}", + searchbpShardCancellationCount); + // print out searchbpTaskCancellationCount, searchbpJVMShardCancellationCount, + // searchbpJVMTaskCancellationCount + LOG.info( + "SearchBackPressureRCA: searchbpTaskCancellationCount: {}", + searchbpTaskCancellationCount); + LOG.info( + "SearchBackPressureRCA: searchbpJVMShardCancellationCount: {}", + searchbpJVMShardCancellationCount); + LOG.info( + "SearchBackPressureRCA: searchbpJVMTaskCancellationCount: {}", + searchbpJVMTaskCancellationCount); + LOG.info("getSearchBackPressureShardCancellationCount() finished"); return 0; } From c1e957db5c4778450f32d5f1fd22258cfd28d713 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 27 Jun 2023 16:14:08 -0700 Subject: [PATCH 08/37] Add SearchBackPressureRCA Metric (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../model/SearchBackPressureRCAMetric.java | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java new file mode 100644 index 000000000..6ef8b6eae --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java @@ -0,0 +1,83 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model; + +/** Represents used heap and max heap in gigabytes */ +public class SearchBackPressureRCAMetric { + private final double usedHeap; + private final double maxHeap; + private final double searchbpShardCancellationCount; + private final double searchbpTaskCancellationCount; + private final double searchbpJVMShardCancellationCount; + private final double searchbpJVMTaskCancellationCount; + + // Constructor + public SearchBackPressureRCAMetric(double usedHeap, double maxHeap, double searchbpShardCancellationCount, + double searchbpTaskCancellationCount, double searchbpJVMShardCancellationCount, + double searchbpJVMTaskCancellationCount) { + this.usedHeap = usedHeap; + this.maxHeap = maxHeap; + this.searchbpShardCancellationCount = searchbpShardCancellationCount; + this.searchbpTaskCancellationCount = searchbpTaskCancellationCount; + this.searchbpJVMShardCancellationCount = searchbpJVMShardCancellationCount; + this.searchbpJVMTaskCancellationCount = searchbpJVMTaskCancellationCount; + } + + // Getters + public double getUsedHeap() { + return usedHeap; + } + + public double getMaxHeap() { + return maxHeap; + } + + public double getSearchbpShardCancellationCount() { + return searchbpShardCancellationCount; + } + + public double getSearchbpTaskCancellationCount() { + return searchbpTaskCancellationCount; + } + + public double getSearchbpJVMShardCancellationCount() { + return searchbpJVMShardCancellationCount; + } + + public double getSearchbpJVMTaskCancellationCount() { + return searchbpJVMTaskCancellationCount; + } + + public double getHeapUsagePercent() { + if (this.getMaxHeap() == 0) { + return 0; + } + return 100 * this.getUsedHeap() / this.getMaxHeap(); + } + + public double getShardJVMCancellationPercent() { + if (this.getSearchbpShardCancellationCount() == 0) { + return 0; + } + return 100 * this.getSearchbpJVMShardCancellationCount() / this.getSearchbpShardCancellationCount(); + } + + public double getTaskJVMCancellationPercent() { + if (this.getSearchbpTaskCancellationCount() == 0) { + return 0; + } + return 100 * this.getSearchbpJVMTaskCancellationCount() / this.getSearchbpTaskCancellationCount(); + } + + public boolean hasValues() { + return this.getUsedHeap() != 0 && this.getMaxHeap() != 0; + } + + @Override + public String toString() { + return "HeapMetric{" + "usedHeap=" + usedHeap + ", maxHeap=" + maxHeap + '}'; + } +} From 55e5cdde7738005d25f60e2e30cf8b4ba959dee5 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 27 Jun 2023 17:11:38 -0700 Subject: [PATCH 09/37] Use SearchBackPressureRCAMetrics to aggregate metrics (signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 2 + .../rca/store/OpenSearchAnalysisGraph.java | 2 +- .../SearchBackPressureRCA.java | 172 +++++++++++++----- .../model/SearchBackPressureRCAMetric.java | 31 +++- 4 files changed, 156 insertions(+), 51 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index e646c3f69..f460f16d1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -29,4 +29,6 @@ public class SearchBackPressureRcaConfig { public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; public SearchBackPressureRcaConfig(final RcaConf conf) {} + + // conf file to get Runtime Threshold for SearchBackPressureRCAConfig (TODO) } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 5d86ca57c..bae2c74cf 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -444,7 +444,7 @@ public void construct() { // Search Back Pressure Service RCA final SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(heapMax, heapUsed, gcType, searchbp_Stats); + new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, gcType, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index f51570309..f510112d6 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -7,8 +7,10 @@ import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; +import java.time.Clock; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.Field; @@ -18,12 +20,16 @@ import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; +import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; public class SearchBackPressureRCA extends OldGenRca> { // LOGGER for SearchBackPressureRCA @@ -32,30 +38,46 @@ public class SearchBackPressureRCA extends OldGenRca taskJVMCancellationSlidingWindow; + private final SlidingWindow shardJVMCancellationSlidingWindow; + private final SlidingWindow heapUsageSlidingWindow; + + // Sliding Window Interval + private static final int SLIDING_WINDOW_SIZE_IN_MINS = 1; + private static final int SLIDING_WINDOW_SIZE_IN_SECS = SLIDING_WINDOW_SIZE_IN_MINS * 60; - // track how many samples has been checked (only reach 60s (12 * 5s) to execute - // operate()) + // counter to check the samples has been taken, only emit flow units when counter equals to + // rcaPeriod private long counter; + // Required amount of RCA period this RCA needs to run before sending out a flowunit + private final int rcaPeriod; + + // Current time + protected Clock clock; + // key functions to be overriden // operate(): determine whether to generate of flow unit of HEALTHY or UNHEALTHY // readRcaConf(): read the key configuration metrics like heapMaxThreshold, @@ -75,9 +97,11 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( - final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { + final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; + this.rcaPeriod = rcaPeriod; + this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; this.heapUsedIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; @@ -88,6 +112,14 @@ public SearchBackPressureRCA( this.heapCancellationDecreaseMaxThreashold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; + // initialize sliding window + this.heapUsageSlidingWindow = + new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + this.shardJVMCancellationSlidingWindow = + new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + this.taskJVMCancellationSlidingWindow = + new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + LOG.info("SearchBackPressureRCA initialized"); } @@ -110,30 +142,93 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @Override public ResourceFlowUnit operate() { LOG.info("SearchBackPressureRCA operate() intiatilized"); - // Use OldGenRca.java to get heap usage and max heap size - double prevHeapUsage = getOldGenUsedOrDefault(0d); - double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); + counter += 1; - double heapUsedPercentage = prevHeapUsage / maxHeapSize; + long currTimeStamp = this.clock.millis(); - // function to read cancellation count from sql - getSearchBackPressureShardCancellationCount(); + // read key metrics into searchBackPressureRCAMetric for easier management + SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); // print out oldGenUsed and maxOldGen LOG.info( - "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, heapUsedPercentage: {}", - prevHeapUsage, - maxHeapSize, - heapUsedPercentage); + "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, heapUsedPercentage: {}, searchbpShardCancellationCount: {}, searchbpTaskCancellationCount: {}, searchbpJVMShardCancellationCount: {}, searchbpJVMTaskCancellationCount: {}", + searchBackPressureRCAMetric.getUsedHeap(), + searchBackPressureRCAMetric.getMaxHeap(), + searchBackPressureRCAMetric.getHeapUsagePercent(), + searchBackPressureRCAMetric.getSearchbpShardCancellationCount(), + searchBackPressureRCAMetric.getSearchbpTaskCancellationCount(), + searchBackPressureRCAMetric.getSearchbpJVMShardCancellationCount(), + searchBackPressureRCAMetric.getSearchbpJVMTaskCancellationCount()); + + // update sliding window if the value is NOT NaN + double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); + if (!Double.isNaN(prevheapUsagePercentage)) { + heapUsageSlidingWindow.next( + new SlidingWindowData(currTimeStamp, prevheapUsagePercentage)); + } + + double shardJVMCancellationPercentage = + searchBackPressureRCAMetric.getShardJVMCancellationPercent(); + if (!Double.isNaN(shardJVMCancellationPercentage)) { + shardJVMCancellationSlidingWindow.next( + new SlidingWindowData(currTimeStamp, shardJVMCancellationPercentage)); + } + + double taskJVMCancellationPercentage = + searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); + if (!Double.isNaN(taskJVMCancellationPercentage)) { + taskJVMCancellationSlidingWindow.next( + new SlidingWindowData(currTimeStamp, taskJVMCancellationPercentage)); + } + + LOG.info("SearchBackPressureRCA counter is {}", counter); + // if counter matches the rca period, emit the flow unit + if (counter == this.rcaPeriod) { + ResourceContext context = null; + LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); + counter = 0; + + // TODO change to + double maxHeapUsagePercentage = heapUsageSlidingWindow.readAvg(); + double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); + double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); + LOG.info( + "SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}", + maxHeapUsagePercentage, + avgShardJVMCancellationPercentage, + avgTaskJVMCancellationPercentage); + + // get the Configured Threshold and compare with Sliding Window Stats + if (maxHeapUsagePercentage > heapUsedDecreaseMinThreshold) { + // Generate a flow unit with an Unhealthy ResourceContext + LOG.info( + "maxHeapUsagePercentage: {} is greater than threshold: {}", + maxHeapUsagePercentage, + heapUsedDecreaseMinThreshold); + + } else { + // Generate a flow unit with a Healthy ResourceContext + LOG.info( + "maxHeapUsagePercentage: {} is less than threshold: {}", + maxHeapUsagePercentage, + heapUsedDecreaseMinThreshold); + } + + } else { + LOG.info("Empty FlowUnit returned for High Heap Usage RCA"); + return new ResourceFlowUnit<>(this.clock.millis()); + } + LOG.info("SearchBackPressureRCA operate() finished"); return null; } - private long getSearchBackPressureShardCancellationCount() { - LOG.info("getSearchBackPressureShardCancellationCount() STARTED"); + private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { + // Get Heap Usage related metrics + double prevHeapUsage = getOldGenUsedOrDefault(0d); + double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); - // Use Searchbp_Stats metrics to get the metrics value - // shard level cancellation count + // Get SearchBack Pressure related metrics from stats type field Field searchbp_stats_type_field = DSL.field( DSL.name( @@ -169,27 +264,14 @@ private long getSearchBackPressureShardCancellationCount() { AllMetrics.SearchBackPressureStatsValue .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT .toString()); - LOG.info( - "SearchBackPressureRCA: searchbpShardCancellationCount: {}", - searchbpShardCancellationCount); - // print out searchbpTaskCancellationCount, searchbpJVMShardCancellationCount, - // searchbpJVMTaskCancellationCount - LOG.info( - "SearchBackPressureRCA: searchbpTaskCancellationCount: {}", - searchbpTaskCancellationCount); - LOG.info( - "SearchBackPressureRCA: searchbpJVMShardCancellationCount: {}", - searchbpJVMShardCancellationCount); - LOG.info( - "SearchBackPressureRCA: searchbpJVMTaskCancellationCount: {}", - searchbpJVMTaskCancellationCount); - LOG.info("getSearchBackPressureShardCancellationCount() finished"); - return 0; - } - - private long getSearchBackPressureTaskCancellationCount() { - return 0; + return new SearchBackPressureRCAMetric( + prevHeapUsage, + maxHeapSize, + searchbpShardCancellationCount, + searchbpTaskCancellationCount, + searchbpJVMShardCancellationCount, + searchbpJVMTaskCancellationCount); } private double getMetric(M metric, Field field, String fieldName) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java index 6ef8b6eae..718c76b8f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java @@ -15,8 +15,12 @@ public class SearchBackPressureRCAMetric { private final double searchbpJVMTaskCancellationCount; // Constructor - public SearchBackPressureRCAMetric(double usedHeap, double maxHeap, double searchbpShardCancellationCount, - double searchbpTaskCancellationCount, double searchbpJVMShardCancellationCount, + public SearchBackPressureRCAMetric( + double usedHeap, + double maxHeap, + double searchbpShardCancellationCount, + double searchbpTaskCancellationCount, + double searchbpJVMShardCancellationCount, double searchbpJVMTaskCancellationCount) { this.usedHeap = usedHeap; this.maxHeap = maxHeap; @@ -62,14 +66,18 @@ public double getShardJVMCancellationPercent() { if (this.getSearchbpShardCancellationCount() == 0) { return 0; } - return 100 * this.getSearchbpJVMShardCancellationCount() / this.getSearchbpShardCancellationCount(); + return 100 + * this.getSearchbpJVMShardCancellationCount() + / this.getSearchbpShardCancellationCount(); } public double getTaskJVMCancellationPercent() { if (this.getSearchbpTaskCancellationCount() == 0) { return 0; } - return 100 * this.getSearchbpJVMTaskCancellationCount() / this.getSearchbpTaskCancellationCount(); + return 100 + * this.getSearchbpJVMTaskCancellationCount() + / this.getSearchbpTaskCancellationCount(); } public boolean hasValues() { @@ -78,6 +86,19 @@ public boolean hasValues() { @Override public String toString() { - return "HeapMetric{" + "usedHeap=" + usedHeap + ", maxHeap=" + maxHeap + '}'; + return "HeapMetric{" + + "usedHeap=" + + usedHeap + + ", maxHeap=" + + maxHeap + + ", searchbpShardCancellationCount=" + + searchbpShardCancellationCount + + ", searchbpTaskCancellationCount=" + + searchbpTaskCancellationCount + + ", searchbpJVMShardCancellationCount=" + + searchbpJVMShardCancellationCount + + ", searchbpJVMTaskCancellationCount=" + + searchbpJVMTaskCancellationCount + + '}'; } } From 28980608ca5b91229309eecc27479410b070a2a9 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 27 Jun 2023 17:36:31 -0700 Subject: [PATCH 10/37] Add the conf file extracted part for SearchBackPressureRcaConfig.java (signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 72 ++++++++++++++++--- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index f460f16d1..86ee11848 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -4,31 +4,87 @@ */ package org.opensearch.performanceanalyzer.rca.configs; - - import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; public class SearchBackPressureRcaConfig { public static final String CONFIG_NAME = "search-back-pressure-rca-policy"; - // INTERVAL PERIOD IN SECONDS + // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; // Increase Threshold // node max heap usage in last 60 secs is less than 70% - public static final int DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD = 70; + public static final int DEFAULT_MAX_HEAP_INCREASE_THRESHOLD = 70; + private Integer maxHeapIncreasePercentageThreshold; // cancellationCount due to heap is more than 50% of all task cancellations. public static final int DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + private Integer maxHeapCancellationPercentageThreshold; // Decrease Threshold // node min heap usage in last 60 secs is more than 80% - public static final int DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD = 80; + public static final int DEFAULT_MAX_HEAP_DECREASE_THRESHOLD = 80; + private Integer maxHeapDecreasePercentageThreshold; - // cancellationCount due to heap is more than 30% of all task cancellations + // cancellationCount due to heap is less than 30% of all task cancellations public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + private Integer minHeapCancellationPercentageThreshold; + + public SearchBackPressureRcaConfig(final RcaConf conf) { + // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, else, default value gets returned + maxHeapIncreasePercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_INCREASE_FIELD, + DEFAULT_MAX_HEAP_INCREASE_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + maxHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + maxHeapDecreasePercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_DECREASE_FIELD, + DEFAULT_MAX_HEAP_DECREASE_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + minHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + } + + // Getters for private field + public int getMaxHeapIncreasePercentageThreshold() { + return maxHeapIncreasePercentageThreshold; + } + + public int getMaxHeapCancellationPercentageThreshold() { + return maxHeapCancellationPercentageThreshold; + } + + public int getMaxHeapDecreasePercentageThreshold() { + return maxHeapDecreasePercentageThreshold; + } - public SearchBackPressureRcaConfig(final RcaConf conf) {} + public int getMinHeapCancellationPercentageThreshold() { + return minHeapCancellationPercentageThreshold; + } - // conf file to get Runtime Threshold for SearchBackPressureRCAConfig (TODO) + // name for the configuration field + public static class RCA_CONF_KEY_CONSTANTS { + public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; + public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = "max-heap-cancellation-percentage"; + public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; + public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = "min-heap-cancellation-percentage"; + } } + \ No newline at end of file From 48c92fbd1f824cf46b24e0666d3130b47f47e334 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 10:28:21 -0700 Subject: [PATCH 11/37] Add MinMaxSlidingWindow in OldGenRca (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 12 ++-- .../rca/store/rca/OldGenRca.java | 32 +++++++++ .../SearchBackPressureRCA.java | 66 ++++++++++++------- 3 files changed, 84 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 86ee11848..a71e74da8 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -4,6 +4,8 @@ */ package org.opensearch.performanceanalyzer.rca.configs; + + import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; public class SearchBackPressureRcaConfig { @@ -31,7 +33,8 @@ public class SearchBackPressureRcaConfig { private Integer minHeapCancellationPercentageThreshold; public SearchBackPressureRcaConfig(final RcaConf conf) { - // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, else, default value gets returned + // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, + // else, default value gets returned maxHeapIncreasePercentageThreshold = conf.readRcaConfig( CONFIG_NAME, @@ -82,9 +85,10 @@ public int getMinHeapCancellationPercentageThreshold() { // name for the configuration field public static class RCA_CONF_KEY_CONSTANTS { public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; - public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = "max-heap-cancellation-percentage"; + public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "max-heap-cancellation-percentage"; public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; - public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = "min-heap-cancellation-percentage"; + public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "min-heap-cancellation-percentage"; } } - \ No newline at end of file diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index fc3558339..a53c2b7bf 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -250,4 +250,36 @@ public double readMin() { return Double.NaN; } } + + /** + * Sliding window to check the max/min olg gen usage within a given time frame Previous + * MinGoldGenSlidingWindow should be deprecated since it modify the sliding window size in + * next() + */ + public static class MinMaxOldGenSlidingWindow extends SlidingWindow { + + public MinMaxOldGenSlidingWindow(int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, TimeUnit timeUnit) { + super(SLIDING_WINDOW_SIZE_IN_TIMESTAMP, timeUnit); + } + + public double readMax() { + if (!windowDeque.isEmpty()) { + return windowDeque.stream() + .mapToDouble(SlidingWindowData::getValue) + .max() + .orElse(Double.NaN); + } + return Double.NaN; + } + + public double readMin() { + if (!windowDeque.isEmpty()) { + return windowDeque.stream() + .mapToDouble(SlidingWindowData::getValue) + .min() + .orElse(Double.NaN); + } + return Double.NaN; + } + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index f510112d6..9624b2088 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -20,6 +20,7 @@ import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.Resources; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; @@ -27,6 +28,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; @@ -48,12 +50,12 @@ public class SearchBackPressureRCA extends OldGenRca taskJVMCancellationSlidingWindow; private final SlidingWindow shardJVMCancellationSlidingWindow; - private final SlidingWindow heapUsageSlidingWindow; + private final MinMaxOldGenSlidingWindow heapUsageSlidingWindow; // Sliding Window Interval private static final int SLIDING_WINDOW_SIZE_IN_MINS = 1; @@ -103,18 +105,18 @@ public SearchBackPressureRCA( this.rcaPeriod = rcaPeriod; this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; - this.heapUsedIncreaseMaxThreshold = - SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; + this.heapUsedIncreaseThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_INCREASE_THRESHOLD; this.heapCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; - this.heapUsedDecreaseMinThreshold = - SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD; - this.heapCancellationDecreaseMaxThreashold = + this.heapUsedDecreaseThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DECREASE_THRESHOLD; + this.heapCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; // initialize sliding window this.heapUsageSlidingWindow = - new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + new MinMaxOldGenSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); this.shardJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); this.taskJVMCancellationSlidingWindow = @@ -167,6 +169,9 @@ public ResourceFlowUnit operate() { new SlidingWindowData(currTimeStamp, prevheapUsagePercentage)); } + // for testing + // heapUsageSlidingWindow.next(new SlidingWindowData(currTimeStamp, 80.3)); + double shardJVMCancellationPercentage = searchBackPressureRCAMetric.getShardJVMCancellationPercent(); if (!Double.isNaN(shardJVMCancellationPercentage)) { @@ -186,32 +191,49 @@ public ResourceFlowUnit operate() { if (counter == this.rcaPeriod) { ResourceContext context = null; LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); + long currentTimeMillis = System.currentTimeMillis(); counter = 0; - // TODO change to - double maxHeapUsagePercentage = heapUsageSlidingWindow.readAvg(); + double maxHeapUsagePercentage = heapUsageSlidingWindow.readMax(); + double minHeapUsagePercentage = heapUsageSlidingWindow.readMin(); double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); + LOG.info( - "SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}", + "SearchBackPressureRCA: maxHeapUsagePercentage: {}, minHeapUsagePercentage: {}, SearchBackPressureRCA: avgShardJVMCancellationPercentage: {}, SearchBackPressureRCA: avgTaskJVMCancellationPercentage: {}", maxHeapUsagePercentage, + minHeapUsagePercentage, avgShardJVMCancellationPercentage, avgTaskJVMCancellationPercentage); + InstanceDetails instanceDetails = getInstanceDetails(); + HotNodeSummary nodeSummary = + new HotNodeSummary( + instanceDetails.getInstanceId(), instanceDetails.getInstanceIp()); // get the Configured Threshold and compare with Sliding Window Stats - if (maxHeapUsagePercentage > heapUsedDecreaseMinThreshold) { + /* + * 2 cases we send Unhealthy ResourceContext when we need to autotune the threshold + * - (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations + * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations + */ + if ((maxHeapUsagePercentage < heapUsedIncreaseThreshold) + && (avgShardJVMCancellationPercentage > heapCancellationIncreaseMaxThreshold)) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( - "maxHeapUsagePercentage: {} is greater than threshold: {}", + "Condition 1 Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, - heapUsedDecreaseMinThreshold); - + heapUsedIncreaseThreshold, + avgShardJVMCancellationPercentage, + heapCancellationIncreaseMaxThreshold); + + context = new ResourceContext(Resources.State.UNHEALTHY); + return new ResourceFlowUnit<>( + currentTimeMillis, + context, + nodeSummary, + !instanceDetails.getIsClusterManager()); } else { - // Generate a flow unit with a Healthy ResourceContext - LOG.info( - "maxHeapUsagePercentage: {} is less than threshold: {}", - maxHeapUsagePercentage, - heapUsedDecreaseMinThreshold); + LOG.info("cindition 1 is not met."); } } else { From c84cf3435b870ee4844deb5d6c6032931015e66b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 11:53:07 -0700 Subject: [PATCH 12/37] Rename SearchBackPressureClusterRCA and add it to AnalysisGraph (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 12 +-- .../rca/store/OpenSearchAnalysisGraph.java | 19 ++++- ...java => SearchBackPressureClusterRCA.java} | 10 ++- .../SearchBackPressureRCA.java | 80 ++++++++++--------- 4 files changed, 71 insertions(+), 50 deletions(-) rename src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/{SearchBackPressurClusterRCA.java => SearchBackPressureClusterRCA.java} (66%) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index a71e74da8..7d6bce315 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -25,8 +25,8 @@ public class SearchBackPressureRcaConfig { // Decrease Threshold // node min heap usage in last 60 secs is more than 80% - public static final int DEFAULT_MAX_HEAP_DECREASE_THRESHOLD = 80; - private Integer maxHeapDecreasePercentageThreshold; + public static final int DEFAULT_MIN_HEAP_DECREASE_THRESHOLD = 80; + private Integer minHeapDecreasePercentageThreshold; // cancellationCount due to heap is less than 30% of all task cancellations public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; @@ -49,11 +49,11 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); - maxHeapDecreasePercentageThreshold = + minHeapDecreasePercentageThreshold = conf.readRcaConfig( CONFIG_NAME, RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_DECREASE_FIELD, - DEFAULT_MAX_HEAP_DECREASE_THRESHOLD, + DEFAULT_MIN_HEAP_DECREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minHeapCancellationPercentageThreshold = @@ -74,8 +74,8 @@ public int getMaxHeapCancellationPercentageThreshold() { return maxHeapCancellationPercentageThreshold; } - public int getMaxHeapDecreasePercentageThreshold() { - return maxHeapDecreasePercentageThreshold; + public int getMinHeapDecreasePercentageThreshold() { + return minHeapDecreasePercentageThreshold; } public int getMinHeapCancellationPercentageThreshold() { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index bae2c74cf..967eded24 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -86,6 +86,7 @@ import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenContendedRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenReclamationRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureRCA; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.NodeTemperatureRca; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.dimension.CpuUtilDimensionTemperatureRca; @@ -442,8 +443,8 @@ public void construct() { shardRequestCacheClusterRca, highHeapUsageClusterRca)); - // Search Back Pressure Service RCA - final SearchBackPressureRCA searchBackPressureRCA = + // Search Back Pressure Service RCA enabled + SearchBackPressureRCA searchBackPressureRCA = new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, gcType, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, @@ -451,6 +452,20 @@ public void construct() { searchBackPressureRCA.addAllUpstreams( Arrays.asList(heapMax, heapUsed, gcType, searchbp_Stats)); + // Search Back Pressure Service Cluster RCA enabled + SearchBackPressureClusterRCA searchBackPressureClusterRCA = + new SearchBackPressureClusterRCA(RCA_PERIOD, searchBackPressureRCA); + searchBackPressureClusterRCA.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); + searchBackPressureClusterRCA.addAllUpstreams( + Collections.singletonList(searchBackPressureRCA)); + searchBackPressureClusterRCA.addTag( + RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, + RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); + + // To Do SearchBackPressure Decider + AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java similarity index 66% rename from src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java rename to src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java index b97d9c0cd..2f2ea88a5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java @@ -6,17 +6,21 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.rca.framework.api.Rca; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.store.rca.cluster.BaseClusterRca; -public class SearchBackPressurClusterRCA extends BaseClusterRca { +public class SearchBackPressureClusterRCA extends BaseClusterRca { - public static final String RCA_TABLE_NAME = SearchBackPressurClusterRCA.class.getSimpleName(); + public static final String RCA_TABLE_NAME = SearchBackPressureClusterRCA.class.getSimpleName(); + private static final Logger LOG = LogManager.getLogger(SearchBackPressureClusterRCA.class); - public >> SearchBackPressurClusterRCA( + public >> SearchBackPressureClusterRCA( final int rcaPeriod, final R SearchBackPressureRCA) { super(rcaPeriod, SearchBackPressureRCA); + LOG.info("SearchBackPressureClusterRCA enabeld."); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 9624b2088..fa0dda859 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -47,8 +47,6 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); @@ -110,7 +90,7 @@ public SearchBackPressureRCA( this.heapCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; this.heapUsedDecreaseThreshold = - SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DECREASE_THRESHOLD; + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; this.heapCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; @@ -144,9 +124,11 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @Override public ResourceFlowUnit operate() { LOG.info("SearchBackPressureRCA operate() intiatilized"); - counter += 1; - long currTimeStamp = this.clock.millis(); + counter += 1; + ResourceContext context = null; + long currentTimeMillis = System.currentTimeMillis(); + ; // read key metrics into searchBackPressureRCAMetric for easier management SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); @@ -166,32 +148,33 @@ public ResourceFlowUnit operate() { double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); if (!Double.isNaN(prevheapUsagePercentage)) { heapUsageSlidingWindow.next( - new SlidingWindowData(currTimeStamp, prevheapUsagePercentage)); + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); } // for testing - // heapUsageSlidingWindow.next(new SlidingWindowData(currTimeStamp, 80.3)); + // heapUsageSlidingWindow.next(new SlidingWindowData(currentTimeMillis, 65.3)); double shardJVMCancellationPercentage = searchBackPressureRCAMetric.getShardJVMCancellationPercent(); if (!Double.isNaN(shardJVMCancellationPercentage)) { shardJVMCancellationSlidingWindow.next( - new SlidingWindowData(currTimeStamp, shardJVMCancellationPercentage)); + new SlidingWindowData(currentTimeMillis, shardJVMCancellationPercentage)); } double taskJVMCancellationPercentage = searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); if (!Double.isNaN(taskJVMCancellationPercentage)) { taskJVMCancellationSlidingWindow.next( - new SlidingWindowData(currTimeStamp, taskJVMCancellationPercentage)); + new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); } LOG.info("SearchBackPressureRCA counter is {}", counter); // if counter matches the rca period, emit the flow unit if (counter == this.rcaPeriod) { - ResourceContext context = null; LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); - long currentTimeMillis = System.currentTimeMillis(); + currentTimeMillis = System.currentTimeMillis(); + + // reset counter counter = 0; double maxHeapUsagePercentage = heapUsageSlidingWindow.readMax(); @@ -216,11 +199,20 @@ public ResourceFlowUnit operate() { * - (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ - if ((maxHeapUsagePercentage < heapUsedIncreaseThreshold) - && (avgShardJVMCancellationPercentage > heapCancellationIncreaseMaxThreshold)) { + // avgShardJVMCancellationPercentage = 80.0; // testing + boolean increaseThresholdMet = + (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + && (avgShardJVMCancellationPercentage + > heapCancellationIncreaseMaxThreshold); + boolean decreaseThresholdMet = + (minHeapUsagePercentage > heapUsedDecreaseThreshold) + && (avgShardJVMCancellationPercentage + < heapCancellationDecreaseMinThreashold); + + if (increaseThresholdMet || decreaseThresholdMet) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( - "Condition 1 Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", + "Increase/Decrease Condition Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, heapUsedIncreaseThreshold, avgShardJVMCancellationPercentage, @@ -233,16 +225,20 @@ public ResourceFlowUnit operate() { nodeSummary, !instanceDetails.getIsClusterManager()); } else { - LOG.info("cindition 1 is not met."); + // if autotune is not triggered, return healthy state + context = new ResourceContext(Resources.State.HEALTHY); + return new ResourceFlowUnit<>( + currentTimeMillis, + context, + nodeSummary, + !instanceDetails.getIsClusterManager()); } - } else { - LOG.info("Empty FlowUnit returned for High Heap Usage RCA"); - return new ResourceFlowUnit<>(this.clock.millis()); + // return healthy state when the counter does not meet rcaPeriod + LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); + currentTimeMillis = System.currentTimeMillis(); + return new ResourceFlowUnit<>(currentTimeMillis); } - - LOG.info("SearchBackPressureRCA operate() finished"); - return null; } private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { @@ -322,5 +318,11 @@ public void readRcaConf(RcaConf conf) { final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); // read anything from config file in runtime // if not just skip it + this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); + this.heapCancellationIncreaseMaxThreshold = + config.getMaxHeapCancellationPercentageThreshold(); + this.heapUsedDecreaseThreshold = config.getMinHeapDecreasePercentageThreshold(); + this.heapCancellationDecreaseMinThreashold = + config.getMinHeapCancellationPercentageThreshold(); } } From 08f69270eba609c0e5e52cafd6af1585b9659cf6 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 16:59:08 -0700 Subject: [PATCH 13/37] Add basic UTs for SearchBackPressureRCA cluster/node level (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java new file mode 100644 index 000000000..9ebde61ed --- /dev/null +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -0,0 +1,217 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.when; +import static org.mockito.MockitoAnnotations.initMocks; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.IntStream; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mock; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; +import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.metrics.MetricTestHelper; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; + +public class SearchBackPressureRcaTest { + // Mock Metrics + @Mock private Metric mockHeapUsed; + + @Mock private Metric mockHeapMax; + + @Mock private Metric mockGcType; + + @Mock private Metric mockSearchbpStats; + + // every 5s operate() gets initiated + private static final int RCA_PERIOD = 5; + + private SearchBackPressureRCA testRca; + private MetricTestHelper metricTestHelper; + private static final double DEFAULT_MAX_HEAP_SIZE = 4294967296.0; + + // mock heap metric columns + private final List heapTableColumns = + Arrays.asList( + AllMetrics.HeapDimension.MEM_TYPE.toString(), + MetricsDB.SUM, + MetricsDB.AVG, + MetricsDB.MIN, + MetricsDB.MAX); + + // mock search back pressure metric columns + private final List searchbpTableColumns = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM.toString(), + MetricsDB.SUM, + MetricsDB.AVG, + MetricsDB.MIN, + MetricsDB.MAX); + + // dummy field to create a mock gcType Metric + private static final String CMS_COLLECTOR = "ConcurrentMarkSweep"; + + /* + * initialization before running any test + * + */ + @Before + public void setup() throws Exception { + initMocks(this); + this.metricTestHelper = new MetricTestHelper(RCA_PERIOD); + setupMockHeapMetric(mockHeapUsed, 80.0); + setupMockHeapMetric(mockHeapMax, 100.0); + // gcType is required for constructor of SearchBackPressureRCA but the exact type of gcType + // does not matter + setupMockGcType(CMS_COLLECTOR); + + // set up SearchBp_Stats table + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + + this.testRca = + new SearchBackPressureRCA( + RCA_PERIOD, mockHeapMax, mockHeapUsed, mockGcType, mockSearchbpStats); + } + + @Test + public void testSearchBackpressureGetResourceContextGeneral() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + System.out.println("testAdmissionControlRcaSmallMaxHeap started"); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + + assertFalse(flowUnit.isEmpty()); + ResourceContext context = flowUnit.getResourceContext(); + assertTrue(context.isHealthy()); + } + + private void setupMockHeapMetric(final Metric metric, final double val) { + String valString = Double.toString(val); + List data = + Arrays.asList( + AllMetrics.GCType.OLD_GEN.toString(), + valString, + valString, + valString, + valString); + when(metric.getFlowUnits()) + .thenReturn( + Collections.singletonList( + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + heapTableColumns, data)))); + } + + private void setupMockGcType(final String collector) { + List gcInfoTableColumns = + Arrays.asList( + AllMetrics.GCInfoDimension.MEMORY_POOL.toString(), + AllMetrics.GCInfoDimension.COLLECTOR_NAME.toString()); + List data = Arrays.asList(AllMetrics.GCType.OLD_GEN.toString(), collector); + when(mockGcType.getFlowUnits()) + .thenReturn( + Collections.singletonList( + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + gcInfoTableColumns, data)))); + } + + private void setupMockSearchbpStats( + final Metric metric, + final double searchbpShardCancellationCount, + final double searchbpTaskCancellationCount, + final double searchbpJVMShardCancellationCount, + final double searchbpJVMTaskCancellationCount) { + String searchbpShardCancellationCountStr = Double.toString(searchbpShardCancellationCount); + String searchbpTaskCancellationCountStr = Double.toString(searchbpTaskCancellationCount); + String searchbpJVMShardCancellationCountStr = + Double.toString(searchbpJVMShardCancellationCount); + String searchbpJVMTaskCancellationCountStr = + Double.toString(searchbpJVMTaskCancellationCount); + + // add searchbpShardCancellationCountStr row + List searchbpShardCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString(), + searchbpShardCancellationCountStr, + searchbpShardCancellationCountStr, + searchbpShardCancellationCountStr, + searchbpShardCancellationCountStr); + + // add searchbpTaskCancellationCountStr row + List searchbpTaskCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString(), + searchbpTaskCancellationCountStr, + searchbpTaskCancellationCountStr, + searchbpTaskCancellationCountStr, + searchbpTaskCancellationCountStr); + + // add searchbpJVMShardCancellationCountStr row + List searchbpJVMShardCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString(), + searchbpJVMShardCancellationCountStr, + searchbpJVMShardCancellationCountStr, + searchbpJVMShardCancellationCountStr, + searchbpJVMShardCancellationCountStr); + + // add searchbpJVMTaskCancellationCountStr row + List searchbpJVMTaskCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString(), + searchbpJVMTaskCancellationCountStr, + searchbpJVMTaskCancellationCountStr, + searchbpJVMTaskCancellationCountStr, + searchbpJVMTaskCancellationCountStr); + + List flowUnits = + Arrays.asList( + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, searchbpShardCancellationCountRow)), + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, searchbpTaskCancellationCountRow)), + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, + searchbpJVMShardCancellationCountRow)), + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, + searchbpJVMTaskCancellationCountRow))); + + when(metric.getFlowUnits()).thenReturn(flowUnits); + } +} From 31e8b49ac9f0c873b45bd8bbd193cedeed60121d Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 17:17:02 -0700 Subject: [PATCH 14/37] Add unhealthy/healthy stats UTs for SearchBackPressureRCA cluster/node level (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 79 +++++++++++++++++-- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 9ebde61ed..cf8f1c759 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -20,7 +20,6 @@ import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; -import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.MetricTestHelper; @@ -86,8 +85,27 @@ public void setup() throws Exception { RCA_PERIOD, mockHeapMax, mockHeapUsed, mockGcType, mockSearchbpStats); } + /* + * Test SearchBackPressure RCA returns empty resourceFlowUnit if counter is less than the rcaPeriod + */ @Test - public void testSearchBackpressureGetResourceContextGeneral() { + public void testSearchBpGetResourceContextLessRcaPeriod() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + + ResourceFlowUnit flowUnit = testRca.operate(); + + // counter = 1 + // counter needs to equal to RcaPeriod (5 in this case) to get nonempty resourceflowunit + assertTrue(flowUnit.isEmpty()); + } + + /* + * Test SearchBackPressure RCA returns nonempty resourceFlowUnit if counter equals to rcaPeriod + */ + @Test + public void testSearchBpGetResourceContextEqualRcaPeriod() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); @@ -96,9 +114,60 @@ public void testSearchBackpressureGetResourceContextGeneral() { ResourceFlowUnit flowUnit = testRca.operate(); + // counter = RCA_PERIOD + // counter needs to equal to RcaPeriod (5 in this case) to get nonempty resourceflowunit + assertFalse(flowUnit.isEmpty()); + } + + /* + * Test SearchBackPressure RCA returns healthy nonempty flow units if the settings does not trigger autotune + */ + // @Test + // public void testSearchBpGetHealthyFlowUnit() { + // setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + // setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + // setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + // System.out.println("testAdmissionControlRcaSmallMaxHeap started"); + // IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + // ResourceFlowUnit flowUnit = testRca.operate(); + // assertFalse(flowUnit.isEmpty()); + // } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations. + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.3); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by decreasing threshold + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 2.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); assertFalse(flowUnit.isEmpty()); - ResourceContext context = flowUnit.getResourceContext(); - assertTrue(context.isHealthy()); + assertFalse(flowUnit.getResourceContext().isHealthy()); } private void setupMockHeapMetric(final Metric metric, final double val) { @@ -212,6 +281,6 @@ private void setupMockSearchbpStats( searchbpTableColumns, searchbpJVMTaskCancellationCountRow))); - when(metric.getFlowUnits()).thenReturn(flowUnits); + when(metric.getFlowUnits()).thenReturn(flowUnits); } } From 4bfa1b2c9d4c117821617e8a6acc9987d8b3fbb8 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 29 Jun 2023 09:27:22 -0700 Subject: [PATCH 15/37] Add healthy resource unit UT (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index cf8f1c759..2f38bb2e4 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -109,7 +109,6 @@ public void testSearchBpGetResourceContextEqualRcaPeriod() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); - System.out.println("testAdmissionControlRcaSmallMaxHeap started"); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); ResourceFlowUnit flowUnit = testRca.operate(); @@ -121,18 +120,19 @@ public void testSearchBpGetResourceContextEqualRcaPeriod() { /* * Test SearchBackPressure RCA returns healthy nonempty flow units if the settings does not trigger autotune + * Meeting None of Increasing or Decreasing Threshold */ - // @Test - // public void testSearchBpGetHealthyFlowUnit() { - // setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); - // setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); - // setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); - // System.out.println("testAdmissionControlRcaSmallMaxHeap started"); - // IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); - - // ResourceFlowUnit flowUnit = testRca.operate(); - // assertFalse(flowUnit.isEmpty()); - // } + @Test + public void testSearchBpGetHealthyFlowUnit() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertTrue(flowUnit.getResourceContext().isHealthy()); + } /* * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold From 13e2d48e503eb87bf9f7f7656221a35a47172f7c Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 29 Jun 2023 13:09:49 -0700 Subject: [PATCH 16/37] Add UT s both shard/task level (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 82 +++++++++----- .../SearchBackPressureRCA.java | 100 ++++++++++++++---- .../SearchBackPressureRcaTest.java | 50 +++++++-- 3 files changed, 180 insertions(+), 52 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 7d6bce315..9a6512893 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -14,23 +14,31 @@ public class SearchBackPressureRcaConfig { // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; - // Increase Threshold + /* Increase Threshold */ // node max heap usage in last 60 secs is less than 70% public static final int DEFAULT_MAX_HEAP_INCREASE_THRESHOLD = 70; private Integer maxHeapIncreasePercentageThreshold; - // cancellationCount due to heap is more than 50% of all task cancellations. - public static final int DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD = 50; - private Integer maxHeapCancellationPercentageThreshold; + // cancellationCount due to heap is more than 50% of all task cancellations in shard level + public static final int DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + private Integer maxShardHeapCancellationPercentageThreshold; - // Decrease Threshold + // cancellationCount due to heap is more than 50% of all task cancellations in task level + public static final int DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + private Integer maxTaskHeapCancellationPercentageThreshold; + + /* Decrease Threshold */ // node min heap usage in last 60 secs is more than 80% public static final int DEFAULT_MIN_HEAP_DECREASE_THRESHOLD = 80; private Integer minHeapDecreasePercentageThreshold; - // cancellationCount due to heap is less than 30% of all task cancellations - public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; - private Integer minHeapCancellationPercentageThreshold; + // cancellationCount due to heap is less than 30% of all task cancellations in shard level + public static final int DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + private Integer minShardHeapCancellationPercentageThreshold; + + // cancellationCount due to heap is less than 30% of all task cancellations in task level + public static final int DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + private Integer minTaskHeapCancellationPercentageThreshold; public SearchBackPressureRcaConfig(final RcaConf conf) { // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, @@ -42,11 +50,18 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { DEFAULT_MAX_HEAP_INCREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); - maxHeapCancellationPercentageThreshold = + maxShardHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD, - DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD, + RCA_CONF_KEY_CONSTANTS.MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + maxTaskHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minHeapDecreasePercentageThreshold = @@ -56,39 +71,58 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { DEFAULT_MIN_HEAP_DECREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); - minHeapCancellationPercentageThreshold = + minShardHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + minTaskHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD, - DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD, + RCA_CONF_KEY_CONSTANTS.MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); } // Getters for private field - public int getMaxHeapIncreasePercentageThreshold() { + public Integer getMaxHeapIncreasePercentageThreshold() { return maxHeapIncreasePercentageThreshold; } - public int getMaxHeapCancellationPercentageThreshold() { - return maxHeapCancellationPercentageThreshold; + public Integer getMaxShardHeapCancellationPercentageThreshold() { + return maxShardHeapCancellationPercentageThreshold; } - public int getMinHeapDecreasePercentageThreshold() { + public Integer getMaxTaskHeapCancellationPercentageThreshold() { + return maxTaskHeapCancellationPercentageThreshold; + } + + public Integer getMinHeapDecreasePercentageThreshold() { return minHeapDecreasePercentageThreshold; } - public int getMinHeapCancellationPercentageThreshold() { - return minHeapCancellationPercentageThreshold; + public Integer getMinShardHeapCancellationPercentageThreshold() { + return minShardHeapCancellationPercentageThreshold; + } + + public Integer getMinTaskHeapCancellationPercentageThreshold() { + return minTaskHeapCancellationPercentageThreshold; } // name for the configuration field public static class RCA_CONF_KEY_CONSTANTS { public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; - public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "max-heap-cancellation-percentage"; + public static final String MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "max-shard-heap-cancellation-percentage"; + public static final String MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "max-task-heap-cancellation-percentage"; public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; - public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "min-heap-cancellation-percentage"; + public static final String MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "min-shard-heap-cancellation-percentage"; + public static final String MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "min-task-heap-cancellation-percentage"; } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index fa0dda859..aabbd1fc2 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -44,16 +44,24 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( this.searchbp_Stats = searchbp_Stats; this.heapUsedIncreaseThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_INCREASE_THRESHOLD; - this.heapCancellationIncreaseMaxThreshold = - SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; + this.heapShardCancellationIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD; + this.heapTaskCancellationIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD; this.heapUsedDecreaseThreshold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; - this.heapCancellationDecreaseMinThreashold = - SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; + this.heapShardCancellationDecreaseMinThreashold = + SearchBackPressureRcaConfig.DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD; + this.heapTaskCancellationDecreaseMinThreashold = + SearchBackPressureRcaConfig.DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD; // initialize sliding window this.heapUsageSlidingWindow = @@ -200,25 +212,66 @@ public ResourceFlowUnit operate() { * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ // avgShardJVMCancellationPercentage = 80.0; // testing - boolean increaseThresholdMet = + + // TODO: add Task CancellationCountPercentage as another criteria + // TODO + /* + * HotResourceSummary resourceSummary = + new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, previousThreshold, 0); + nodeSummary.appendNestedSummary(resourceSummary); + + If you + */ + boolean increaseThresholdMetByShard = (maxHeapUsagePercentage < heapUsedIncreaseThreshold) && (avgShardJVMCancellationPercentage - > heapCancellationIncreaseMaxThreshold); - boolean decreaseThresholdMet = + > heapShardCancellationIncreaseMaxThreshold); + boolean decreaseThresholdMetByShard = (minHeapUsagePercentage > heapUsedDecreaseThreshold) && (avgShardJVMCancellationPercentage - < heapCancellationDecreaseMinThreashold); + < heapShardCancellationDecreaseMinThreashold); + + boolean increaseThresholdMetByTask = + (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + && (avgTaskJVMCancellationPercentage + > heapTaskCancellationIncreaseMaxThreshold); + boolean decreaseThresholdMetByTask = + (minHeapUsagePercentage > heapUsedDecreaseThreshold) + && (avgTaskJVMCancellationPercentage + < heapTaskCancellationDecreaseMinThreashold); + + // HotResourceSummary resourceSummary = + // new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, + // previousThreshold, 0); + // nodeSummary.appendNestedSummary(resourceSummary); - if (increaseThresholdMet || decreaseThresholdMet) { + if (increaseThresholdMetByShard || decreaseThresholdMetByShard) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( - "Increase/Decrease Condition Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", + "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, heapUsedIncreaseThreshold, avgShardJVMCancellationPercentage, - heapCancellationIncreaseMaxThreshold); + heapShardCancellationIncreaseMaxThreshold); context = new ResourceContext(Resources.State.UNHEALTHY); + // add an additional resource with metadata: shard-level + return new ResourceFlowUnit<>( + currentTimeMillis, + context, + nodeSummary, + !instanceDetails.getIsClusterManager()); + } else if (increaseThresholdMetByTask || decreaseThresholdMetByTask) { + // Generate a flow unit with an Unhealthy ResourceContext + LOG.info( + "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", + maxHeapUsagePercentage, + heapUsedIncreaseThreshold, + avgTaskJVMCancellationPercentage, + heapTaskCancellationIncreaseMaxThreshold); + + context = new ResourceContext(Resources.State.UNHEALTHY); + // add an additional resource with metadata: task-level return new ResourceFlowUnit<>( currentTimeMillis, context, @@ -233,6 +286,7 @@ public ResourceFlowUnit operate() { nodeSummary, !instanceDetails.getIsClusterManager()); } + } else { // return healthy state when the counter does not meet rcaPeriod LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); @@ -319,10 +373,14 @@ public void readRcaConf(RcaConf conf) { // read anything from config file in runtime // if not just skip it this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); - this.heapCancellationIncreaseMaxThreshold = - config.getMaxHeapCancellationPercentageThreshold(); + this.heapShardCancellationIncreaseMaxThreshold = + config.getMaxShardHeapCancellationPercentageThreshold(); + this.heapTaskCancellationIncreaseMaxThreshold = + config.getMaxTaskHeapCancellationPercentageThreshold(); this.heapUsedDecreaseThreshold = config.getMinHeapDecreasePercentageThreshold(); - this.heapCancellationDecreaseMinThreashold = - config.getMinHeapCancellationPercentageThreshold(); + this.heapShardCancellationDecreaseMinThreashold = + config.getMinShardHeapCancellationPercentageThreshold(); + this.heapTaskCancellationDecreaseMinThreashold = + config.getMinTaskHeapCancellationPercentageThreshold(); } } diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 2f38bb2e4..de6014fb4 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -120,7 +120,7 @@ public void testSearchBpGetResourceContextEqualRcaPeriod() { /* * Test SearchBackPressure RCA returns healthy nonempty flow units if the settings does not trigger autotune - * Meeting None of Increasing or Decreasing Threshold + * Meeting None of Increasing or Decreasing Threshold for both shard/task level */ @Test public void testSearchBpGetHealthyFlowUnit() { @@ -138,13 +138,49 @@ public void testSearchBpGetHealthyFlowUnit() { * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold * Increasing threshold: * node max heap usage in last 60 secs is less than 70% - * cancellationCount due to heap is more than 50% of all task cancellations. + * cancellationCount due to heap is more than 50% of all task cancellations (Shard-Level) */ @Test - public void testSearchBpGetUnHealthyFlowUnitByIncreaseThreshold() { + public void testSearchBpGetUnHealthyFlowUnitByShardIncreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.3); - setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 4.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations (Task-Level). + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByTaskIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.3); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 4.0, 8.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by decreasing threshold + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations (Shard-Level) + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByShardDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); ResourceFlowUnit flowUnit = testRca.operate(); @@ -156,13 +192,13 @@ public void testSearchBpGetUnHealthyFlowUnitByIncreaseThreshold() { * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by decreasing threshold * decreasing threshold: * node min heap usage in last 60 secs is more than 80% - * cancellationCount due to heap is less than 30% of all task cancellations + * cancellationCount due to heap is less than 30% of all task cancellations (Task-Level) */ @Test - public void testSearchBpGetUnHealthyFlowUnitByDecreaseThreshold() { + public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); - setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 2.0); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); ResourceFlowUnit flowUnit = testRca.operate(); From 5e3aed707109da92990b97b52bf1291dc35d0d65 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 30 Jun 2023 00:12:49 -0700 Subject: [PATCH 17/37] Add a new SearchBp Resource Unit (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../framework/api/summaries/ResourceUtil.java | 20 +++++++++++++++++++ .../SearchBackPressureRCA.java | 11 +++++++++- src/main/proto/inter_node_rpc_service.proto | 6 ++++++ .../SearchBackPressureRcaTest.java | 17 ++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java index 876cd4ca1..659f85548 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java @@ -135,6 +135,26 @@ public class ResourceUtil { .setResourceEnum(ResourceEnum.SHARD_REQUEST_CACHE) .setMetricEnum(MetricEnum.CACHE_MAX_SIZE) .build(); + /* + * searchbackpressure related resource + * SEARCHBACKPRESSURE_SHARD resource indicate a searchbackpressure unhealthy resource unit is caused by shard level cancellation + * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values + */ + public static final Resource SEARCHBACKPRESSURE_SHARD = + Resource.newBuilder() + .setResourceEnum(ResourceEnum.SEARCHBP) + .setMetricEnum(MetricEnum.SEARCHBP_SHARD) + .build(); + + /* + * SEARCHBACKPRESSURE_TASK resource indicate a searchbackpressure unhealthy resource unit is caused by task level cancellation + * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values + */ + public static final Resource SEARCHBACKPRESSURE_TASK = + Resource.newBuilder() + .setResourceEnum(ResourceEnum.SEARCHBP) + .setMetricEnum(MetricEnum.SEARCHBP_TASK) + .build(); /** * Read the resourceType name from the ResourceType object diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index aabbd1fc2..882c39d7a 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -6,6 +6,8 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_SHARD; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_TASK; import java.time.Clock; import java.util.ArrayList; @@ -27,6 +29,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; @@ -140,7 +143,6 @@ public ResourceFlowUnit operate() { counter += 1; ResourceContext context = null; long currentTimeMillis = System.currentTimeMillis(); - ; // read key metrics into searchBackPressureRCAMetric for easier management SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); @@ -256,6 +258,10 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level + HotResourceSummary resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0); + nodeSummary.appendNestedSummary(resourceSummary); + return new ResourceFlowUnit<>( currentTimeMillis, context, @@ -272,6 +278,9 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: task-level + HotResourceSummary resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0); + nodeSummary.appendNestedSummary(resourceSummary); return new ResourceFlowUnit<>( currentTimeMillis, context, diff --git a/src/main/proto/inter_node_rpc_service.proto b/src/main/proto/inter_node_rpc_service.proto index fe5c864c9..89ea45d93 100644 --- a/src/main/proto/inter_node_rpc_service.proto +++ b/src/main/proto/inter_node_rpc_service.proto @@ -77,6 +77,9 @@ enum ResourceEnum { // Heap HEAP = 20 [(additional_fields).name = "heap"]; + // Search Back Pressure + SEARCHBP = 21 [(additional_fields).name = "search back pressure"]; + } enum MetricEnum { @@ -106,6 +109,9 @@ enum MetricEnum { OLD_GEN_USAGE_AFTER_FULL_GC = 31 [(additional_fields).name = "full gc", (additional_fields).description = "old gen usage after full gc in mb"]; // GC FULL_GC = 32 [(additional_fields).name = "full gc", (additional_fields).description = "full gc pause time in ms"]; + // Searchbp + SEARCHBP_SHARD = 33 [(additional_fields).name = "searchbackpressure shard", (additional_fields).description = "default value to indicate an unhealthy resource unit is from shard-level cancellation"]; + SEARCHBP_TASK = 34 [(additional_fields).name = "searchbackpressure task", (additional_fields).description = "default value to indicate an unhealthy resource unit is from task-level cancellation"]; } /* diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index de6014fb4..344dcb9db 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -206,6 +206,23 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { assertFalse(flowUnit.getResourceContext().isHealthy()); } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + private void setupMockHeapMetric(final Metric metric, final double val) { String valString = Double.toString(val); List data = From 8d78c3b9f2fb0f40bcdf33f5acacfbb650097d74 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 30 Jun 2023 09:51:33 -0700 Subject: [PATCH 18/37] Add UTs to test shard/task level resource include-ness (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 112 +++++++++++++++++- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 344dcb9db..ffc80d876 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -9,6 +9,8 @@ import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.when; import static org.mockito.MockitoAnnotations.initMocks; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_SHARD; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_TASK; import java.util.Arrays; import java.util.Collections; @@ -24,6 +26,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.MetricTestHelper; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; public class SearchBackPressureRcaTest { // Mock Metrics @@ -206,13 +209,75 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { assertFalse(flowUnit.getResourceContext().isHealthy()); } + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level in decrease threshold + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations (Shard-Level) + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); - /* + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_shard_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD); + + assertTrue(found_shard_resource); + } + + /* * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource - * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level in increase threshold + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations (Shard-Level) */ @Test - public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { + public void testSearchBpGetUnHealthyFlowUnitInShardLevelByIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.5); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_shard_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD); + + assertTrue(found_shard_resource); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in task-level + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations (Task-Level) + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByDecreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); @@ -221,6 +286,47 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { ResourceFlowUnit flowUnit = testRca.operate(); assertFalse(flowUnit.isEmpty()); assertFalse(flowUnit.getResourceContext().isHealthy()); + + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_task_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK); + + assertTrue(found_task_resource); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations (Task-Level) + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.5); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_task_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK); + + assertTrue(found_task_resource); } private void setupMockHeapMetric(final Metric metric, final double val) { From 55b8ec0828c29f09e03d41ad174a1764c2d3c5ba Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 3 Jul 2023 15:03:51 -0700 Subject: [PATCH 19/37] Remove styling changes for Version.java (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../org/opensearch/performanceanalyzer/rca/Version.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index 402013cf7..bfc85fcd3 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -19,14 +19,11 @@ public final class Version { * transferred packets should be dropped. Every increment here should be accompanied with a line * describing the version bump. * - *

Note: The RCA version is agnostic of OpenSearch version. + * Note: The RCA version is agnostic of OpenSearch version. */ static final class Major { - // Bumping this post the Commons - // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service - // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) - // change + // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change static final int RCA_MAJ_VERSION = 1; } From 12fe8a8ac7bbcc8a66e9ee3af7b9bbc4f12d2cb8 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 5 Jul 2023 19:54:27 -0700 Subject: [PATCH 20/37] Add metadata to resourceSummary (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRCA.java | 22 +++++++++++--- .../SearchBackPressureRcaTest.java | 30 ++++++++++++------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 882c39d7a..c8d095b9b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -258,8 +258,15 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level - HotResourceSummary resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0); + HotResourceSummary resourceSummary; + if (increaseThresholdMetByShard) { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "increase"); + } else { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "decrease"); + } + nodeSummary.appendNestedSummary(resourceSummary); return new ResourceFlowUnit<>( @@ -278,8 +285,15 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: task-level - HotResourceSummary resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0); + HotResourceSummary resourceSummary; + if (increaseThresholdMetByTask) { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "increase"); + } else { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "decrease"); + } + nodeSummary.appendNestedSummary(resourceSummary); return new ResourceFlowUnit<>( currentTimeMillis, diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index ffc80d876..355f757cb 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -44,6 +44,8 @@ public class SearchBackPressureRcaTest { private SearchBackPressureRCA testRca; private MetricTestHelper metricTestHelper; private static final double DEFAULT_MAX_HEAP_SIZE = 4294967296.0; + private static final String INCREASE_METADATA_STR = "increase"; + private static final String DECREASE_METADATA_STR = "decrease"; // mock heap metric columns private final List heapTableColumns = @@ -233,8 +235,10 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_SHARD); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD) + && (hotResourceSummary.getMetaData() + == DECREASE_METADATA_STR)); assertTrue(found_shard_resource); } @@ -259,14 +263,16 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByIncreaseThreshold() { HotNodeSummary hotNodeSummary = flowUnit.getSummary(); List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); - boolean found_shard_resource = + boolean found_shard_resource_and_increase_metadata = hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_SHARD); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD) + && (hotResourceSummary.getMetaData() + == INCREASE_METADATA_STR)); - assertTrue(found_shard_resource); + assertTrue(found_shard_resource_and_increase_metadata); } /* @@ -293,8 +299,10 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByDecreaseThreshold() { hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_TASK); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK) + && (hotResourceSummary.getMetaData() + == DECREASE_METADATA_STR)); assertTrue(found_task_resource); } @@ -323,8 +331,10 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByIncreaseThreshold() { hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_TASK); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK) + && (hotResourceSummary.getMetaData() + == INCREASE_METADATA_STR)); assertTrue(found_task_resource); } From 1b7837d9cd4480cdcdacce6a890ebd0185a15496 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 6 Jul 2023 11:24:00 -0700 Subject: [PATCH 21/37] Update to more general framework (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 4 + .../SearchBackPressureRCA.java | 90 ++++++++++++------- .../SearchBackPressureRcaTest.java | 15 ++-- 3 files changed, 70 insertions(+), 39 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 9a6512893..a3dca6031 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -11,6 +11,10 @@ public class SearchBackPressureRcaConfig { public static final String CONFIG_NAME = "search-back-pressure-rca-policy"; + /* Metadata fields for thresholds */ + public static final String INCREASE_THRESHOLD_BY_JVM_STR = "increase_jvm"; + public static final String DECREASE_THRESHOLD_BY_JVM_STR = "decrease_jvm"; + // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index c8d095b9b..cae86ffe5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -96,22 +96,32 @@ public SearchBackPressureRCA( this.rcaPeriod = rcaPeriod; this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; + + // threshold for heap usage this.heapUsedIncreaseThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_INCREASE_THRESHOLD; + this.heapUsedDecreaseThreshold = + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; + + /* + * threshold for search back pressure service stats + * currently, only consider the percentage of JVM Usage cancellation count compared to the total cancellation count + * + */ this.heapShardCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD; this.heapTaskCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD; - this.heapUsedDecreaseThreshold = - SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; + this.heapShardCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD; this.heapTaskCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD; - // initialize sliding window + // sliding window for heap usage this.heapUsageSlidingWindow = new MinMaxOldGenSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + // sliding window for JVM this.shardJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); this.taskJVMCancellationSlidingWindow = @@ -122,7 +132,7 @@ public SearchBackPressureRCA( /* * operate() is used for local build - * generateFlowUnitListFromWire simply use remote flowunits to + * generateFlowUnitListFromWire simply use remote flowunits to generate flow units locally */ @Override public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @@ -136,6 +146,11 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { setFlowUnits(flowUnitList); } + /* + * operate() evaluates the current stats against threshold + * generate Unhealthy Flow Unit if Searchbp Service needs autotune + * else, generate Healthy Flow Unit + */ @Override public ResourceFlowUnit operate() { LOG.info("SearchBackPressureRCA operate() intiatilized"); @@ -207,37 +222,27 @@ public ResourceFlowUnit operate() { new HotNodeSummary( instanceDetails.getInstanceId(), instanceDetails.getInstanceIp()); - // get the Configured Threshold and compare with Sliding Window Stats /* * 2 cases we send Unhealthy ResourceContext when we need to autotune the threshold - * - (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations - * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations + * (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations + * (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ - // avgShardJVMCancellationPercentage = 80.0; // testing - - // TODO: add Task CancellationCountPercentage as another criteria - // TODO - /* - * HotResourceSummary resourceSummary = - new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, previousThreshold, 0); - nodeSummary.appendNestedSummary(resourceSummary); - - If you - */ - boolean increaseThresholdMetByShard = + // shard level thresholds + boolean increaseJVMThresholdMetByShard = (maxHeapUsagePercentage < heapUsedIncreaseThreshold) && (avgShardJVMCancellationPercentage > heapShardCancellationIncreaseMaxThreshold); - boolean decreaseThresholdMetByShard = + boolean decreaseJVMThresholdMetByShard = (minHeapUsagePercentage > heapUsedDecreaseThreshold) && (avgShardJVMCancellationPercentage < heapShardCancellationDecreaseMinThreashold); - boolean increaseThresholdMetByTask = + // task level thresholds + boolean increaseJVMThresholdMetByTask = (maxHeapUsagePercentage < heapUsedIncreaseThreshold) && (avgTaskJVMCancellationPercentage > heapTaskCancellationIncreaseMaxThreshold); - boolean decreaseThresholdMetByTask = + boolean decreaseJVMThresholdMetByTask = (minHeapUsagePercentage > heapUsedDecreaseThreshold) && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); @@ -247,7 +252,7 @@ public ResourceFlowUnit operate() { // previousThreshold, 0); // nodeSummary.appendNestedSummary(resourceSummary); - if (increaseThresholdMetByShard || decreaseThresholdMetByShard) { + if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", @@ -259,12 +264,22 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level HotResourceSummary resourceSummary; - if (increaseThresholdMetByShard) { + if (increaseJVMThresholdMetByShard) { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "increase"); + new HotResourceSummary( + SEARCHBACKPRESSURE_SHARD, + 0, + 0, + 0, + SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR); } else { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "decrease"); + new HotResourceSummary( + SEARCHBACKPRESSURE_SHARD, + 0, + 0, + 0, + SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR); } nodeSummary.appendNestedSummary(resourceSummary); @@ -274,7 +289,7 @@ public ResourceFlowUnit operate() { context, nodeSummary, !instanceDetails.getIsClusterManager()); - } else if (increaseThresholdMetByTask || decreaseThresholdMetByTask) { + } else if (increaseJVMThresholdMetByTask || decreaseJVMThresholdMetByTask) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", @@ -286,12 +301,22 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: task-level HotResourceSummary resourceSummary; - if (increaseThresholdMetByTask) { + if (increaseJVMThresholdMetByTask) { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "increase"); + new HotResourceSummary( + SEARCHBACKPRESSURE_TASK, + 0, + 0, + 0, + SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR); } else { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "decrease"); + new HotResourceSummary( + SEARCHBACKPRESSURE_TASK, + 0, + 0, + 0, + SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR); } nodeSummary.appendNestedSummary(resourceSummary); @@ -390,11 +415,10 @@ private double getMetric(M metric, Field field, Strin */ @Override public void readRcaConf(RcaConf conf) { - // only initialized one time LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); - // read anything from config file in runtime - // if not just skip it + + // threshold value read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); this.heapShardCancellationIncreaseMaxThreshold = config.getMaxShardHeapCancellationPercentageThreshold(); diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 355f757cb..c4282e9ec 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -21,6 +21,7 @@ import org.mockito.Mock; import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; @@ -44,8 +45,6 @@ public class SearchBackPressureRcaTest { private SearchBackPressureRCA testRca; private MetricTestHelper metricTestHelper; private static final double DEFAULT_MAX_HEAP_SIZE = 4294967296.0; - private static final String INCREASE_METADATA_STR = "increase"; - private static final String DECREASE_METADATA_STR = "decrease"; // mock heap metric columns private final List heapTableColumns = @@ -238,7 +237,8 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_SHARD) && (hotResourceSummary.getMetaData() - == DECREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .DECREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_shard_resource); } @@ -270,7 +270,8 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByIncreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_SHARD) && (hotResourceSummary.getMetaData() - == INCREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .INCREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_shard_resource_and_increase_metadata); } @@ -302,7 +303,8 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByDecreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_TASK) && (hotResourceSummary.getMetaData() - == DECREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .DECREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_task_resource); } @@ -334,7 +336,8 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByIncreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_TASK) && (hotResourceSummary.getMetaData() - == INCREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .INCREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_task_resource); } From 8b059a8148e5d374bb2a140c93a67dda83113ddd Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 6 Jul 2023 13:42:56 -0700 Subject: [PATCH 22/37] (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 5 +++++ .../SearchBackPressureRCA.java | 20 ++++--------------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index a3dca6031..8838ce12f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -15,9 +15,14 @@ public class SearchBackPressureRcaConfig { public static final String INCREASE_THRESHOLD_BY_JVM_STR = "increase_jvm"; public static final String DECREASE_THRESHOLD_BY_JVM_STR = "decrease_jvm"; + public static final int SLIDING_WINDOW_SIZE_IN_MINS = 1; + // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; + /* interval period to call operate() */ + public static final long EVAL_INTERVAL_IN_S = 5; + /* Increase Threshold */ // node max heap usage in last 60 secs is less than 70% public static final int DEFAULT_MAX_HEAP_INCREASE_THRESHOLD = 70; diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index cae86ffe5..73a53b74d 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -37,12 +37,11 @@ import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; public class SearchBackPressureRCA extends OldGenRca> { - // LOGGER for SearchBackPressureRCA private static final Logger LOG = LogManager.getLogger(SearchBackPressureRCA.class); private static final double BYTES_TO_GIGABYTES = Math.pow(1024, 3); - private static final long EVAL_INTERVAL_IN_S = 5; + private static final long EVAL_INTERVAL_IN_S = SearchBackPressureRcaConfig.EVAL_INTERVAL_IN_S; - // Key Metrics to be used to determine health status + // Key metrics used to determine RCA Flow Unit health status private final Metric heapUsed; private final Metric searchbp_Stats; @@ -76,7 +75,7 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( /* * threshold for search back pressure service stats * currently, only consider the percentage of JVM Usage cancellation count compared to the total cancellation count - * */ this.heapShardCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD; @@ -153,8 +151,6 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { */ @Override public ResourceFlowUnit operate() { - LOG.info("SearchBackPressureRCA operate() intiatilized"); - counter += 1; ResourceContext context = null; long currentTimeMillis = System.currentTimeMillis(); @@ -180,9 +176,6 @@ public ResourceFlowUnit operate() { new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); } - // for testing - // heapUsageSlidingWindow.next(new SlidingWindowData(currentTimeMillis, 65.3)); - double shardJVMCancellationPercentage = searchBackPressureRCAMetric.getShardJVMCancellationPercent(); if (!Double.isNaN(shardJVMCancellationPercentage)) { @@ -247,11 +240,6 @@ public ResourceFlowUnit operate() { && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); - // HotResourceSummary resourceSummary = - // new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, - // previousThreshold, 0); - // nodeSummary.appendNestedSummary(resourceSummary); - if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( @@ -418,7 +406,7 @@ public void readRcaConf(RcaConf conf) { LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); - // threshold value read from config file + // threshold read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); this.heapShardCancellationIncreaseMaxThreshold = config.getMaxShardHeapCancellationPercentageThreshold(); From c49e771553a5a869fdfd848263dad76aa8ce1b74 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 14:08:58 -0700 Subject: [PATCH 23/37] Refactor the MinMaxSlidingWindow and bug fix (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../framework/api/summaries/ResourceUtil.java | 2 - .../rca/store/rca/OldGenRca.java | 59 ++++++++---- .../SearchBackPressureRCA.java | 96 +++++++++++++------ .../model/SearchBackPressureRCAMetric.java | 3 - 4 files changed, 108 insertions(+), 52 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java index 659f85548..03db4299b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java @@ -138,7 +138,6 @@ public class ResourceUtil { /* * searchbackpressure related resource * SEARCHBACKPRESSURE_SHARD resource indicate a searchbackpressure unhealthy resource unit is caused by shard level cancellation - * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values */ public static final Resource SEARCHBACKPRESSURE_SHARD = Resource.newBuilder() @@ -148,7 +147,6 @@ public class ResourceUtil { /* * SEARCHBACKPRESSURE_TASK resource indicate a searchbackpressure unhealthy resource unit is caused by task level cancellation - * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values */ public static final Resource SEARCHBACKPRESSURE_TASK = Resource.newBuilder() diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index a53c2b7bf..147dd5db1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -252,32 +252,57 @@ public double readMin() { } /** - * Sliding window to check the max/min olg gen usage within a given time frame Previous - * MinGoldGenSlidingWindow should be deprecated since it modify the sliding window size in - * next() + * Sliding window to check the max/min olg gen usage within a given time frame + * + * @param isMinSlidingWindow true if the sliding window is for min usage, false for max usage + * Provides a more general framework than MinOldGenSlidingWindow as this sliding window can + * be implemented as minSlidingWindow or maxSlidingWindow depending on the need. */ - public static class MinMaxOldGenSlidingWindow extends SlidingWindow { + public static class MinMaxSlidingWindow extends SlidingWindow { + boolean isMinSlidingWindow; - public MinMaxOldGenSlidingWindow(int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, TimeUnit timeUnit) { + public MinMaxSlidingWindow( + int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, + TimeUnit timeUnit, + boolean isMinSlidingWindow) { super(SLIDING_WINDOW_SIZE_IN_TIMESTAMP, timeUnit); + this.isMinSlidingWindow = isMinSlidingWindow; } - public double readMax() { - if (!windowDeque.isEmpty()) { - return windowDeque.stream() - .mapToDouble(SlidingWindowData::getValue) - .max() - .orElse(Double.NaN); + @Override + public void next(SlidingWindowData e) { + boolean pollFirstCondition; + if (isMinSlidingWindow) { + // monotonically decreasing sliding window + while (!windowDeque.isEmpty() + && windowDeque.peekFirst().getValue() >= e.getValue()) { + windowDeque.pollFirst(); + } + } else { + // monotonically increasing sliding window + while (!windowDeque.isEmpty() + && windowDeque.peekFirst().getValue() < e.getValue()) { + windowDeque.pollFirst(); + } + } + + windowDeque.addFirst(e); + while (!windowDeque.isEmpty() + && TimeUnit.MILLISECONDS.toSeconds( + e.getTimeStamp() - windowDeque.peekLast().getTimeStamp()) + > SLIDING_WINDOW_SIZE) { + windowDeque.pollLast(); } - return Double.NaN; } - public double readMin() { + /* + * read last element in the window + * if the sliding window is MinSlidingWindow then returns the min element + * else return the max element in the deque + */ + public double readLastElementInWindow() { if (!windowDeque.isEmpty()) { - return windowDeque.stream() - .mapToDouble(SlidingWindowData::getValue) - .min() - .orElse(Double.NaN); + return windowDeque.peekLast().getValue(); } return Double.NaN; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 73a53b74d..35b1261b1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -72,15 +72,18 @@ public class SearchBackPressureRCA extends OldGenRca taskJVMCancellationSlidingWindow; private final SlidingWindow shardJVMCancellationSlidingWindow; - private final MinMaxOldGenSlidingWindow heapUsageSlidingWindow; + private final MinMaxSlidingWindow minHeapUsageSlidingWindow; + private final MinMaxSlidingWindow maxHeapUsageSlidingWindow; // Sliding Window Interval - private static final int SLIDING_WINDOW_SIZE_IN_MINS = SearchBackPressureRcaConfig.SLIDING_WINDOW_SIZE_IN_MINS; + private static final int SLIDING_WINDOW_SIZE_IN_MINS = + SearchBackPressureRcaConfig.SLIDING_WINDOW_SIZE_IN_MINS; private static final int SLIDING_WINDOW_SIZE_IN_SECS = SLIDING_WINDOW_SIZE_IN_MINS * 60; - // counter to check the samples has been taken, only emit flow units when counter equals to + // currentIterationNumber to check the samples has been taken, only emit flow units when + // currentIterationNumber equals to // rcaPeriod - private long counter; + private long currentIterationNumber; // Required amount of RCA period this RCA needs to run before sending out a flowunit private final int rcaPeriod; @@ -90,6 +93,7 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { + // metric gcType is needed to construct OldGenRca Class (Parent Class) super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; this.rcaPeriod = rcaPeriod; @@ -117,8 +121,11 @@ public SearchBackPressureRCA( SearchBackPressureRcaConfig.DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD; // sliding window for heap usage - this.heapUsageSlidingWindow = - new MinMaxOldGenSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + this.minHeapUsageSlidingWindow = + new MinMaxSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, true); + this.maxHeapUsageSlidingWindow = + new MinMaxSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, false); + // sliding window for JVM this.shardJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); @@ -130,7 +137,9 @@ public SearchBackPressureRCA( /* * operate() is used for local build - * generateFlowUnitListFromWire simply use remote flowunits to generate flow units locally + * This will compute the flow units from other hosts in the cluster + * for a given Metric and try to send the subscription requests + * to stale or new hosts in cluster if need be */ @Override public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @@ -146,12 +155,12 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { /* * operate() evaluates the current stats against threshold - * generate Unhealthy Flow Unit if Searchbp Service needs autotune - * else, generate Healthy Flow Unit + * Unhealthy Flow Units is a marker that this resource at current instance is not healthy + * Autotune decision would be made by downstream classes */ @Override public ResourceFlowUnit operate() { - counter += 1; + currentIterationNumber += 1; ResourceContext context = null; long currentTimeMillis = System.currentTimeMillis(); @@ -172,7 +181,9 @@ public ResourceFlowUnit operate() { // update sliding window if the value is NOT NaN double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); if (!Double.isNaN(prevheapUsagePercentage)) { - heapUsageSlidingWindow.next( + minHeapUsageSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); + maxHeapUsageSlidingWindow.next( new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); } @@ -190,17 +201,19 @@ public ResourceFlowUnit operate() { new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); } - LOG.info("SearchBackPressureRCA counter is {}", counter); - // if counter matches the rca period, emit the flow unit - if (counter == this.rcaPeriod) { - LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); + LOG.info("SearchBackPressureRCA currentIterationNumber is {}", currentIterationNumber); + // if currentIterationNumber matches the rca period, emit the flow unit + if (currentIterationNumber == this.rcaPeriod) { + LOG.info( + "SearchBackPressureRCA currentIterationNumber in rcaPeriod is {}", + currentIterationNumber); currentTimeMillis = System.currentTimeMillis(); - // reset counter - counter = 0; + // reset currentIterationNumber + currentIterationNumber = 0; - double maxHeapUsagePercentage = heapUsageSlidingWindow.readMax(); - double minHeapUsagePercentage = heapUsageSlidingWindow.readMin(); + double maxHeapUsagePercentage = maxHeapUsageSlidingWindow.readLastElementInWindow(); + double minHeapUsagePercentage = minHeapUsageSlidingWindow.readLastElementInWindow(); double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); @@ -240,15 +253,15 @@ public ResourceFlowUnit operate() { && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); - if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { - // Generate a flow unit with an Unhealthy ResourceContext - LOG.info( - "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", - maxHeapUsagePercentage, - heapUsedIncreaseThreshold, - avgShardJVMCancellationPercentage, - heapShardCancellationIncreaseMaxThreshold); + // Generate a flow unit with an Unhealthy ResourceContext + LOG.info( + "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", + maxHeapUsagePercentage, + heapUsedIncreaseThreshold, + avgShardJVMCancellationPercentage, + heapShardCancellationIncreaseMaxThreshold); + if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level HotResourceSummary resourceSummary; @@ -324,7 +337,7 @@ public ResourceFlowUnit operate() { } } else { - // return healthy state when the counter does not meet rcaPeriod + // return healthy state when the currentIterationNumber does not meet rcaPeriod LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); currentTimeMillis = System.currentTimeMillis(); return new ResourceFlowUnit<>(currentTimeMillis); @@ -383,16 +396,36 @@ private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { } private double getMetric(M metric, Field field, String fieldName) { - double response = 0; + if (metric == null) { + throw new IllegalStateException( + "RCA: " + + this.name() + + "was not configured in the graph to " + + "take " + + metric.name() + + " as a metric. Please check the analysis graph!"); + } + + double response = 0.0; + // LOG.info( + // " metric.getFlowUnits() length is: {}, and metric name is {}", + // metric.getFlowUnits().size(), + // metric.name()); for (MetricFlowUnit flowUnit : metric.getFlowUnits()) { if (!flowUnit.isEmpty()) { + LOG.info( + "flowUnit.getData() rows size is {}", + flowUnit.getData().getValues("SearchBackPressureStats").size()); double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - if (!Double.isNaN(metricResponse) && metricResponse > 0) { + // print out the metricResponse + LOG.info("Searchbp metricResponse is: {}", metricResponse); + if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } } } + LOG.info("Searchbp response is: {}", response); return response; } @@ -408,6 +441,9 @@ public void readRcaConf(RcaConf conf) { // threshold read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); + LOG.info( + "SearchBackPressureRCA heapUsedIncreaseThreshold is set to {}", + this.heapUsedIncreaseThreshold); this.heapShardCancellationIncreaseMaxThreshold = config.getMaxShardHeapCancellationPercentageThreshold(); this.heapTaskCancellationIncreaseMaxThreshold = diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java index 718c76b8f..ef74e1763 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java @@ -5,7 +5,6 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model; -/** Represents used heap and max heap in gigabytes */ public class SearchBackPressureRCAMetric { private final double usedHeap; private final double maxHeap; @@ -14,7 +13,6 @@ public class SearchBackPressureRCAMetric { private final double searchbpJVMShardCancellationCount; private final double searchbpJVMTaskCancellationCount; - // Constructor public SearchBackPressureRCAMetric( double usedHeap, double maxHeap, @@ -30,7 +28,6 @@ public SearchBackPressureRCAMetric( this.searchbpJVMTaskCancellationCount = searchbpJVMTaskCancellationCount; } - // Getters public double getUsedHeap() { return usedHeap; } From 648e94d57240be143e063b21308032ee579f9b8a Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 14:38:54 -0700 Subject: [PATCH 24/37] Refactor Heap Stats Metrics Getter(Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRCA.java | 68 ++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 35b1261b1..40e35fa5f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -28,6 +28,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; @@ -40,11 +41,16 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( // metric gcType is needed to construct OldGenRca Class (Parent Class) super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; + this.heapMax = heapMax; this.rcaPeriod = rcaPeriod; this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; @@ -344,10 +351,67 @@ public ResourceFlowUnit operate() { } } + /** + * Get the Heap Related Stats (Heap Used and Heap Size in gigabytes) + * + * @param isHeapUsed is true meaning get the value of used heap in gigabytes otherwise, meaning + * get the value of max heap in gigabytes + */ + public double getHeapStats(boolean isHeapUsed) { + double heapStats = DEFAULT_HEAP_VAL; + List heapStatsMetrics; + if (isHeapUsed == true) { + if (heap_Used == null) { + throw new IllegalStateException( + "RCA: " + + this.name() + + "was not configured in the graph to " + + "take heap_Used as a metric. Please check the analysis graph!"); + } + + heapStatsMetrics = heap_Used.getFlowUnits(); + } else { + if (heap_Max == null) { + throw new IllegalStateException( + "RCA: " + + this.name() + + "was not configured in the graph to " + + "take heap_Max as a metric. Please check the analysis graph!"); + } + + heapStatsMetrics = heap_Max.getFlowUnits(); + } + + for (MetricFlowUnit heapStatsMetric : heapStatsMetrics) { + if (heapStatsMetric.isEmpty()) { + continue; + } + + double ret = + SQLParsingUtil.readDataFromSqlResult( + heapStatsMetric.getData(), + AllMetrics.HeapDimension.MEM_TYPE.getField(), + AllMetrics.GCType.HEAP.toString(), + MetricsDB.MAX); + if (Double.isNaN(ret)) { + LOG.error( + "Failed to parse metric in FlowUnit from {}", + heap_Used.getClass().getName()); + } else { + heapStats = ret / CONVERT_BYTES_TO_MEGABYTES; + } + } + + return heapStats; + } + private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { // Get Heap Usage related metrics - double prevHeapUsage = getOldGenUsedOrDefault(0d); - double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); + double prevHeapUsage = getHeapStats(true); + double maxHeapSize = getHeapStats(false); + + // Log prevHeapUsage and maxHeapSize + LOG.info("prevHeapUsage: {}, maxHeapSize: {}", prevHeapUsage, maxHeapSize); // Get SearchBack Pressure related metrics from stats type field Field searchbp_stats_type_field = From ca6505937737339a8d84e3da95d720dab64c4aea Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 15:04:41 -0700 Subject: [PATCH 25/37] Refactor HeapUsed and HeapMax Getters (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/store/OpenSearchAnalysisGraph.java | 5 ++- .../SearchBackPressureRCA.java | 31 +++++++------------ .../SearchBackPressureRcaTest.java | 25 ++------------- 3 files changed, 17 insertions(+), 44 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 967eded24..970db2ace 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -445,12 +445,11 @@ public void construct() { // Search Back Pressure Service RCA enabled SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, gcType, searchbp_Stats); + new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - searchBackPressureRCA.addAllUpstreams( - Arrays.asList(heapMax, heapUsed, gcType, searchbp_Stats)); + searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, searchbp_Stats)); // Search Back Pressure Service Cluster RCA enabled SearchBackPressureClusterRCA searchBackPressureClusterRCA = diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 40e35fa5f..a81dcd936 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -22,6 +22,7 @@ import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.Rca; import org.opensearch.performanceanalyzer.rca.framework.api.Resources; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; @@ -34,10 +35,10 @@ import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; -import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; +import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca.MinMaxSlidingWindow; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; -public class SearchBackPressureRCA extends OldGenRca> { +public class SearchBackPressureRCA extends Rca> { private static final Logger LOG = LogManager.getLogger(SearchBackPressureRCA.class); private static final double BYTES_TO_GIGABYTES = Math.pow(1024, 3); private static final long EVAL_INTERVAL_IN_S = SearchBackPressureRcaConfig.EVAL_INTERVAL_IN_S; @@ -98,9 +99,8 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( - final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { - // metric gcType is needed to construct OldGenRca Class (Parent Class) - super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); + final int rcaPeriod, final M heapMax, final M heapUsed, M searchbp_Stats) { + super(EVAL_INTERVAL_IN_S); this.heapUsed = heapUsed; this.heapMax = heapMax; this.rcaPeriod = rcaPeriod; @@ -361,7 +361,7 @@ public double getHeapStats(boolean isHeapUsed) { double heapStats = DEFAULT_HEAP_VAL; List heapStatsMetrics; if (isHeapUsed == true) { - if (heap_Used == null) { + if (heapUsed == null) { throw new IllegalStateException( "RCA: " + this.name() @@ -369,9 +369,9 @@ public double getHeapStats(boolean isHeapUsed) { + "take heap_Used as a metric. Please check the analysis graph!"); } - heapStatsMetrics = heap_Used.getFlowUnits(); + heapStatsMetrics = heapUsed.getFlowUnits(); } else { - if (heap_Max == null) { + if (heapMax == null) { throw new IllegalStateException( "RCA: " + this.name() @@ -379,7 +379,7 @@ public double getHeapStats(boolean isHeapUsed) { + "take heap_Max as a metric. Please check the analysis graph!"); } - heapStatsMetrics = heap_Max.getFlowUnits(); + heapStatsMetrics = heapMax.getFlowUnits(); } for (MetricFlowUnit heapStatsMetric : heapStatsMetrics) { @@ -396,7 +396,7 @@ public double getHeapStats(boolean isHeapUsed) { if (Double.isNaN(ret)) { LOG.error( "Failed to parse metric in FlowUnit from {}", - heap_Used.getClass().getName()); + heapUsed.getClass().getName()); } else { heapStats = ret / CONVERT_BYTES_TO_MEGABYTES; } @@ -471,19 +471,12 @@ private double getMetric(M metric, Field field, Strin } double response = 0.0; - // LOG.info( - // " metric.getFlowUnits() length is: {}, and metric name is {}", - // metric.getFlowUnits().size(), - // metric.name()); for (MetricFlowUnit flowUnit : metric.getFlowUnits()) { if (!flowUnit.isEmpty()) { - LOG.info( - "flowUnit.getData() rows size is {}", - flowUnit.getData().getValues("SearchBackPressureStats").size()); double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - // print out the metricResponse - LOG.info("Searchbp metricResponse is: {}", metricResponse); + // // print out the metricResponse + // LOG.info("Searchbp metricResponse is: {}", metricResponse); if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index c4282e9ec..f371064e9 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -77,16 +77,12 @@ public void setup() throws Exception { this.metricTestHelper = new MetricTestHelper(RCA_PERIOD); setupMockHeapMetric(mockHeapUsed, 80.0); setupMockHeapMetric(mockHeapMax, 100.0); - // gcType is required for constructor of SearchBackPressureRCA but the exact type of gcType - // does not matter - setupMockGcType(CMS_COLLECTOR); // set up SearchBp_Stats table setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); this.testRca = - new SearchBackPressureRCA( - RCA_PERIOD, mockHeapMax, mockHeapUsed, mockGcType, mockSearchbpStats); + new SearchBackPressureRCA(RCA_PERIOD, mockHeapMax, mockHeapUsed, mockSearchbpStats); } /* @@ -220,7 +216,7 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { @Test public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); - setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.95); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); @@ -346,7 +342,7 @@ private void setupMockHeapMetric(final Metric metric, final double val) { String valString = Double.toString(val); List data = Arrays.asList( - AllMetrics.GCType.OLD_GEN.toString(), + AllMetrics.GCType.HEAP.toString(), valString, valString, valString, @@ -360,21 +356,6 @@ private void setupMockHeapMetric(final Metric metric, final double val) { heapTableColumns, data)))); } - private void setupMockGcType(final String collector) { - List gcInfoTableColumns = - Arrays.asList( - AllMetrics.GCInfoDimension.MEMORY_POOL.toString(), - AllMetrics.GCInfoDimension.COLLECTOR_NAME.toString()); - List data = Arrays.asList(AllMetrics.GCType.OLD_GEN.toString(), collector); - when(mockGcType.getFlowUnits()) - .thenReturn( - Collections.singletonList( - new MetricFlowUnit( - 0, - metricTestHelper.createTestResult( - gcInfoTableColumns, data)))); - } - private void setupMockSearchbpStats( final Metric metric, final double searchbpShardCancellationCount, From 4c69fb3a9c8b767f0028d823348998e9fb62a62a Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 15:24:57 -0700 Subject: [PATCH 26/37] Refactor operate() (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBackPressureRCA.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index a81dcd936..4050413c7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -186,6 +186,7 @@ public ResourceFlowUnit operate() { searchBackPressureRCAMetric.getSearchbpJVMTaskCancellationCount()); // update sliding window if the value is NOT NaN + // TO DO double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); if (!Double.isNaN(prevheapUsagePercentage)) { minHeapUsageSlidingWindow.next( @@ -240,23 +241,26 @@ public ResourceFlowUnit operate() { * (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations * (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ + boolean maxHeapBelowIncreaseThreshold = maxHeapUsagePercentage < heapUsedIncreaseThreshold; + boolean minHeapAboveDecreaseThreshold = minHeapUsagePercentage > heapUsedDecreaseThreshold; + // shard level thresholds boolean increaseJVMThresholdMetByShard = - (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + maxHeapBelowIncreaseThreshold && (avgShardJVMCancellationPercentage > heapShardCancellationIncreaseMaxThreshold); boolean decreaseJVMThresholdMetByShard = - (minHeapUsagePercentage > heapUsedDecreaseThreshold) + minHeapAboveDecreaseThreshold && (avgShardJVMCancellationPercentage < heapShardCancellationDecreaseMinThreashold); // task level thresholds boolean increaseJVMThresholdMetByTask = - (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + maxHeapBelowIncreaseThreshold && (avgTaskJVMCancellationPercentage > heapTaskCancellationIncreaseMaxThreshold); boolean decreaseJVMThresholdMetByTask = - (minHeapUsagePercentage > heapUsedDecreaseThreshold) + minHeapAboveDecreaseThreshold && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); From cf92a617bbce21596f5a07398599df27e03aa180 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 16:00:04 -0700 Subject: [PATCH 27/37] Refactor operate() and remove dead comments (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRCA.java | 99 ++++++++++--------- 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 4050413c7..e8a50d064 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -153,7 +153,7 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { final List flowUnitMessages = args.getWireHopper().readFromWire(args.getNode()); final List> flowUnitList = new ArrayList<>(); - LOG.debug("rca: Executing fromWire: {}", this.getClass().getSimpleName()); + LOG.info("rca: Executing fromWire: {}", this.getClass().getSimpleName()); for (FlowUnitMessage flowUnitMessage : flowUnitMessages) { flowUnitList.add(ResourceFlowUnit.buildFlowUnitFromWrapper(flowUnitMessage)); } @@ -185,29 +185,7 @@ public ResourceFlowUnit operate() { searchBackPressureRCAMetric.getSearchbpJVMShardCancellationCount(), searchBackPressureRCAMetric.getSearchbpJVMTaskCancellationCount()); - // update sliding window if the value is NOT NaN - // TO DO - double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); - if (!Double.isNaN(prevheapUsagePercentage)) { - minHeapUsageSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); - maxHeapUsageSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); - } - - double shardJVMCancellationPercentage = - searchBackPressureRCAMetric.getShardJVMCancellationPercent(); - if (!Double.isNaN(shardJVMCancellationPercentage)) { - shardJVMCancellationSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, shardJVMCancellationPercentage)); - } - - double taskJVMCancellationPercentage = - searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); - if (!Double.isNaN(taskJVMCancellationPercentage)) { - taskJVMCancellationSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); - } + updateAllSlidingWindows(searchBackPressureRCAMetric, currentTimeMillis); LOG.info("SearchBackPressureRCA currentIterationNumber is {}", currentIterationNumber); // if currentIterationNumber matches the rca period, emit the flow unit @@ -241,28 +219,30 @@ public ResourceFlowUnit operate() { * (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations * (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ - boolean maxHeapBelowIncreaseThreshold = maxHeapUsagePercentage < heapUsedIncreaseThreshold; - boolean minHeapAboveDecreaseThreshold = minHeapUsagePercentage > heapUsedDecreaseThreshold; - + boolean maxHeapBelowIncreaseThreshold = + maxHeapUsagePercentage < heapUsedIncreaseThreshold; + boolean minHeapAboveDecreaseThreshold = + minHeapUsagePercentage > heapUsedDecreaseThreshold; + boolean shardHeapCancellationPercentageAboveThreshold = + avgShardJVMCancellationPercentage > heapShardCancellationIncreaseMaxThreshold; + boolean shardHeapCancellationPercentageBelowThreshold = + avgShardJVMCancellationPercentage < heapShardCancellationDecreaseMinThreashold; + boolean taskHeapCancellationPercentageAboveThreshold = + avgTaskJVMCancellationPercentage > heapTaskCancellationIncreaseMaxThreshold; + boolean taskHeapCancellationPercentageBelowThreshold = + avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold; + // shard level thresholds boolean increaseJVMThresholdMetByShard = - maxHeapBelowIncreaseThreshold - && (avgShardJVMCancellationPercentage - > heapShardCancellationIncreaseMaxThreshold); + maxHeapBelowIncreaseThreshold && shardHeapCancellationPercentageAboveThreshold; boolean decreaseJVMThresholdMetByShard = - minHeapAboveDecreaseThreshold - && (avgShardJVMCancellationPercentage - < heapShardCancellationDecreaseMinThreashold); + minHeapAboveDecreaseThreshold && shardHeapCancellationPercentageBelowThreshold; // task level thresholds boolean increaseJVMThresholdMetByTask = - maxHeapBelowIncreaseThreshold - && (avgTaskJVMCancellationPercentage - > heapTaskCancellationIncreaseMaxThreshold); + maxHeapBelowIncreaseThreshold && taskHeapCancellationPercentageAboveThreshold; boolean decreaseJVMThresholdMetByTask = - minHeapAboveDecreaseThreshold - && (avgTaskJVMCancellationPercentage - < heapTaskCancellationDecreaseMinThreashold); + minHeapAboveDecreaseThreshold && taskHeapCancellationPercentageBelowThreshold; // Generate a flow unit with an Unhealthy ResourceContext LOG.info( @@ -274,8 +254,8 @@ public ResourceFlowUnit operate() { if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { context = new ResourceContext(Resources.State.UNHEALTHY); - // add an additional resource with metadata: shard-level HotResourceSummary resourceSummary; + // metadata fields indicate the reason for Unhealthy Resource Unit if (increaseJVMThresholdMetByShard) { resourceSummary = new HotResourceSummary( @@ -295,14 +275,12 @@ public ResourceFlowUnit operate() { } nodeSummary.appendNestedSummary(resourceSummary); - return new ResourceFlowUnit<>( currentTimeMillis, context, nodeSummary, !instanceDetails.getIsClusterManager()); } else if (increaseJVMThresholdMetByTask || decreaseJVMThresholdMetByTask) { - // Generate a flow unit with an Unhealthy ResourceContext LOG.info( "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, @@ -311,7 +289,6 @@ public ResourceFlowUnit operate() { heapTaskCancellationIncreaseMaxThreshold); context = new ResourceContext(Resources.State.UNHEALTHY); - // add an additional resource with metadata: task-level HotResourceSummary resourceSummary; if (increaseJVMThresholdMetByTask) { resourceSummary = @@ -348,8 +325,8 @@ public ResourceFlowUnit operate() { } } else { - // return healthy state when the currentIterationNumber does not meet rcaPeriod - LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); + // Return Empty ResourceFlowUnit if none of the thresholds is met + LOG.info("Empty FlowUnit returned for SearchbackPressureRCA"); currentTimeMillis = System.currentTimeMillis(); return new ResourceFlowUnit<>(currentTimeMillis); } @@ -479,8 +456,7 @@ private double getMetric(M metric, Field field, Strin if (!flowUnit.isEmpty()) { double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - // // print out the metricResponse - // LOG.info("Searchbp metricResponse is: {}", metricResponse); + LOG.info("Searchbp metricResponse is: {}", metricResponse); if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } @@ -497,7 +473,6 @@ private double getMetric(M metric, Field field, Strin */ @Override public void readRcaConf(RcaConf conf) { - LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); // threshold read from config file @@ -515,4 +490,32 @@ public void readRcaConf(RcaConf conf) { this.heapTaskCancellationDecreaseMinThreashold = config.getMinTaskHeapCancellationPercentageThreshold(); } + + /* + * Update Stats for all Sliding Windows + */ + private void updateAllSlidingWindows( + SearchBackPressureRCAMetric searchBackPressureRCAMetric, long currentTimeMillis) { + double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); + if (!Double.isNaN(prevheapUsagePercentage)) { + minHeapUsageSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); + maxHeapUsageSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); + } + + double shardJVMCancellationPercentage = + searchBackPressureRCAMetric.getShardJVMCancellationPercent(); + if (!Double.isNaN(shardJVMCancellationPercentage)) { + shardJVMCancellationSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, shardJVMCancellationPercentage)); + } + + double taskJVMCancellationPercentage = + searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); + if (!Double.isNaN(taskJVMCancellationPercentage)) { + taskJVMCancellationSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); + } + } } From aca9a5223e881d43106877c516d862e91a87df7c Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 09:43:36 -0700 Subject: [PATCH 28/37] Merged Main (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- build.gradle | 2 +- .../rca/searchbackpressure/SearchBackPressureClusterRCA.java | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/build.gradle b/build.gradle index 25b70ea22..bfd4fc044 100644 --- a/build.gradle +++ b/build.gradle @@ -371,7 +371,7 @@ dependencies { implementation("org.mockito:mockito-core") { version { strictly "2.23.0" - } + } } testImplementation group: 'org.powermock', name: 'powermock-core', version: '2.0.0' testImplementation group: 'org.powermock', name: 'powermock-api-support', version: '2.0.0' diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java index 2f2ea88a5..a7b95bada 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java @@ -21,6 +21,5 @@ public class SearchBackPressureClusterRCA extends BaseClusterRca { public >> SearchBackPressureClusterRCA( final int rcaPeriod, final R SearchBackPressureRCA) { super(rcaPeriod, SearchBackPressureRCA); - LOG.info("SearchBackPressureClusterRCA enabeld."); } } From f4ea13ec07a4eca6ea8b22413a18025ec49b0f47 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 10:45:44 -0700 Subject: [PATCH 29/37] Merged Main (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/searchbackpressure/SearchBackPressureClusterRCA.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java index a7b95bada..2dd36a308 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java @@ -21,5 +21,6 @@ public class SearchBackPressureClusterRCA extends BaseClusterRca { public >> SearchBackPressureClusterRCA( final int rcaPeriod, final R SearchBackPressureRCA) { super(rcaPeriod, SearchBackPressureRCA); + LOG.info("SearchBackPressureClusterRCA created"); } } From 6d63620399408eb513eea10c2802d32420dbeacb Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 09:59:04 -0700 Subject: [PATCH 30/37] remove trailing space in build.gradle (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index bfd4fc044..25b70ea22 100644 --- a/build.gradle +++ b/build.gradle @@ -371,7 +371,7 @@ dependencies { implementation("org.mockito:mockito-core") { version { strictly "2.23.0" - } + } } testImplementation group: 'org.powermock', name: 'powermock-core', version: '2.0.0' testImplementation group: 'org.powermock', name: 'powermock-api-support', version: '2.0.0' From 86654b6850d320bf8db7da8290bd0c9f4cc120ee Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 10:03:20 -0700 Subject: [PATCH 31/37] nit javadoc update (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 970db2ace..ecf80257e 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -463,7 +463,7 @@ public void construct() { RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); - // To Do SearchBackPressure Decider + // ToDo SearchBackPressure Decider AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); From fd0ad05a2a6b6190dba59703ef531864367cd8c7 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 10:06:00 -0700 Subject: [PATCH 32/37] nit javadoc updates (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index ecf80257e..e2a51c9a4 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -463,7 +463,7 @@ public void construct() { RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); - // ToDo SearchBackPressure Decider + // TODO: SearchBackPressure Decider AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); From 2ef08004a8e3d34bdb80323ba886b202c900082a Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 10:06:55 -0700 Subject: [PATCH 33/37] nit javadoc updates (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index e2a51c9a4..1b11c014e 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -463,7 +463,7 @@ public void construct() { RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); - // TODO: SearchBackPressure Decider + // TODO: Add SearchBackPressure Decider AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); From 78a3acbb7d7d3a2b1c3f9c8be9e04078417e5037 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 10:48:59 -0700 Subject: [PATCH 34/37] Remove dead comments (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../store/rca/searchbackpressure/SearchBackPressureRCA.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index e8a50d064..a95362050 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -143,8 +143,7 @@ public SearchBackPressureRCA( } /* - * operate() is used for local build - * This will compute the flow units from other hosts in the cluster + * generateFlowUnitListFromWire() compute the flow units from other hosts in the cluster * for a given Metric and try to send the subscription requests * to stale or new hosts in cluster if need be */ From 7d10b5c4177cdb775541617b7f871e6065d732cb Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 10:50:20 -0700 Subject: [PATCH 35/37] update javadoc (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/rca/store/rca/OldGenRca.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index 147dd5db1..a1146da10 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -222,7 +222,7 @@ protected boolean isOldGenCollectorCMS() { return true; } - /** Sliding window to check the minimal olg gen usage within a given time frame */ + /** Sliding window to check the minimal old gen usage within a given time frame */ public static class MinOldGenSlidingWindow extends SlidingWindow { public MinOldGenSlidingWindow(int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, TimeUnit timeUnit) { @@ -252,7 +252,7 @@ public double readMin() { } /** - * Sliding window to check the max/min olg gen usage within a given time frame + * Sliding window to check the max/min old gen usage within a given time frame * * @param isMinSlidingWindow true if the sliding window is for min usage, false for max usage * Provides a more general framework than MinOldGenSlidingWindow as this sliding window can From d899fb0a4bab5f726cfe400613d851f281942173 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 11:14:05 -0700 Subject: [PATCH 36/37] LOG Level Change (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/store/rca/OldGenRca.java | 1 - .../SearchBackPressureClusterRCA.java | 1 - .../SearchBackPressureRCA.java | 24 +++++++++---------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index a1146da10..e2056eec8 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -271,7 +271,6 @@ public MinMaxSlidingWindow( @Override public void next(SlidingWindowData e) { - boolean pollFirstCondition; if (isMinSlidingWindow) { // monotonically decreasing sliding window while (!windowDeque.isEmpty() diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java index 2dd36a308..a7b95bada 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java @@ -21,6 +21,5 @@ public class SearchBackPressureClusterRCA extends BaseClusterRca { public >> SearchBackPressureClusterRCA( final int rcaPeriod, final R SearchBackPressureRCA) { super(rcaPeriod, SearchBackPressureRCA); - LOG.info("SearchBackPressureClusterRCA created"); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index a95362050..d274e6f52 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -139,7 +139,7 @@ public SearchBackPressureRCA( this.taskJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); - LOG.info("SearchBackPressureRCA initialized"); + LOG.debug("SearchBackPressureRCA initialized"); } /* @@ -174,7 +174,7 @@ public ResourceFlowUnit operate() { SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); // print out oldGenUsed and maxOldGen - LOG.info( + LOG.debug( "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, heapUsedPercentage: {}, searchbpShardCancellationCount: {}, searchbpTaskCancellationCount: {}, searchbpJVMShardCancellationCount: {}, searchbpJVMTaskCancellationCount: {}", searchBackPressureRCAMetric.getUsedHeap(), searchBackPressureRCAMetric.getMaxHeap(), @@ -186,10 +186,10 @@ public ResourceFlowUnit operate() { updateAllSlidingWindows(searchBackPressureRCAMetric, currentTimeMillis); - LOG.info("SearchBackPressureRCA currentIterationNumber is {}", currentIterationNumber); + LOG.debug("SearchBackPressureRCA currentIterationNumber is {}", currentIterationNumber); // if currentIterationNumber matches the rca period, emit the flow unit if (currentIterationNumber == this.rcaPeriod) { - LOG.info( + LOG.debug( "SearchBackPressureRCA currentIterationNumber in rcaPeriod is {}", currentIterationNumber); currentTimeMillis = System.currentTimeMillis(); @@ -202,7 +202,7 @@ public ResourceFlowUnit operate() { double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); - LOG.info( + LOG.debug( "SearchBackPressureRCA: maxHeapUsagePercentage: {}, minHeapUsagePercentage: {}, SearchBackPressureRCA: avgShardJVMCancellationPercentage: {}, SearchBackPressureRCA: avgTaskJVMCancellationPercentage: {}", maxHeapUsagePercentage, minHeapUsagePercentage, @@ -244,7 +244,7 @@ public ResourceFlowUnit operate() { minHeapAboveDecreaseThreshold && taskHeapCancellationPercentageBelowThreshold; // Generate a flow unit with an Unhealthy ResourceContext - LOG.info( + LOG.debug( "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, heapUsedIncreaseThreshold, @@ -280,7 +280,7 @@ public ResourceFlowUnit operate() { nodeSummary, !instanceDetails.getIsClusterManager()); } else if (increaseJVMThresholdMetByTask || decreaseJVMThresholdMetByTask) { - LOG.info( + LOG.debug( "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, heapUsedIncreaseThreshold, @@ -325,7 +325,7 @@ public ResourceFlowUnit operate() { } else { // Return Empty ResourceFlowUnit if none of the thresholds is met - LOG.info("Empty FlowUnit returned for SearchbackPressureRCA"); + LOG.debug("Empty FlowUnit returned for SearchbackPressureRCA"); currentTimeMillis = System.currentTimeMillis(); return new ResourceFlowUnit<>(currentTimeMillis); } @@ -391,7 +391,7 @@ private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { double maxHeapSize = getHeapStats(false); // Log prevHeapUsage and maxHeapSize - LOG.info("prevHeapUsage: {}, maxHeapSize: {}", prevHeapUsage, maxHeapSize); + LOG.debug("prevHeapUsage: {}, maxHeapSize: {}", prevHeapUsage, maxHeapSize); // Get SearchBack Pressure related metrics from stats type field Field searchbp_stats_type_field = @@ -455,13 +455,13 @@ private double getMetric(M metric, Field field, Strin if (!flowUnit.isEmpty()) { double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - LOG.info("Searchbp metricResponse is: {}", metricResponse); + LOG.debug("Searchbp metricResponse is: {}", metricResponse); if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } } } - LOG.info("Searchbp response is: {}", response); + LOG.debug("Searchbp response is: {}", response); return response; } @@ -476,7 +476,7 @@ public void readRcaConf(RcaConf conf) { // threshold read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); - LOG.info( + LOG.debug( "SearchBackPressureRCA heapUsedIncreaseThreshold is set to {}", this.heapUsedIncreaseThreshold); this.heapShardCancellationIncreaseMaxThreshold = From 3a1893b144e14bef1e91775d6714fe0588729511 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 11:35:08 -0700 Subject: [PATCH 37/37] Change from static class to enum (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 8838ce12f..a4c81a53b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -55,42 +55,46 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { maxHeapIncreasePercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_INCREASE_FIELD, + SearchBackPressureRcaConfigKeys.MAX_HEAP_USAGE_INCREASE_FIELD.toString(), DEFAULT_MAX_HEAP_INCREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); maxShardHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD, + SearchBackPressureRcaConfigKeys.MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD + .toString(), DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); maxTaskHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD, + SearchBackPressureRcaConfigKeys.MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD + .toString(), DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minHeapDecreasePercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_DECREASE_FIELD, + SearchBackPressureRcaConfigKeys.MAX_HEAP_USAGE_DECREASE_FIELD.toString(), DEFAULT_MIN_HEAP_DECREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minShardHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD, + SearchBackPressureRcaConfigKeys.MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD + .toString(), DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minTaskHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD, + SearchBackPressureRcaConfigKeys.MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD + .toString(), DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); @@ -122,16 +126,23 @@ public Integer getMinTaskHeapCancellationPercentageThreshold() { } // name for the configuration field - public static class RCA_CONF_KEY_CONSTANTS { - public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; - public static final String MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "max-shard-heap-cancellation-percentage"; - public static final String MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "max-task-heap-cancellation-percentage"; - public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; - public static final String MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "min-shard-heap-cancellation-percentage"; - public static final String MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "min-task-heap-cancellation-percentage"; + public enum SearchBackPressureRcaConfigKeys { + MAX_HEAP_USAGE_INCREASE_FIELD("max-heap-usage-increase"), + MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD("max-shard-heap-cancellation-percentage"), + MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD("max-task-heap-cancellation-percentage"), + MAX_HEAP_USAGE_DECREASE_FIELD("max-heap-usage-decrease"), + MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD("min-shard-heap-cancellation-percentage"), + MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD("min-task-heap-cancellation-percentage"); + + private final String value; + + SearchBackPressureRcaConfigKeys(final String value) { + this.value = value; + } + + @Override + public String toString() { + return this.value; + } } }