Skip to content

Commit

Permalink
resetTransitTimeoutAndCancellationCountDuringHighCPULoad (#38157)
Browse files Browse the repository at this point in the history
* reset transitTimeout and cancellationCount under high CPU load
---------

Co-authored-by: annie-mac <[email protected]>
  • Loading branch information
xinlian12 and annie-mac authored Jan 3, 2024
1 parent 0866382 commit 2c940cf
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ public void transitTimeoutOnWrite_HighCPULoadTests(boolean withFailureReason) th
Mockito.when(timestampsMock.tansitTimeoutWriteCount()).thenReturn(writeTimeoutCount);
Mockito.when(timestampsMock.lastChannelWriteTime()).thenReturn(lastChannelWriteTime);
Mockito.when(timestampsMock.lastChannelWriteAttemptTime()).thenReturn(lastChannelWriteAttemptTime);
Mockito.doNothing().when(timestampsMock).resetTransitTimeout();

try(MockedStatic<CpuMemoryMonitor> cpuMemoryMonitorMock = Mockito.mockStatic(CpuMemoryMonitor.class)) {
CpuLoadHistory cpuLoadHistoryMock = Mockito.mock(CpuLoadHistory.class);
Expand All @@ -300,10 +301,14 @@ public void transitTimeoutOnWrite_HighCPULoadTests(boolean withFailureReason) th
Future<String> healthyResult = healthChecker.isHealthyWithFailureReason(channelMock).sync();
assertThat(healthyResult.isSuccess()).isTrue();
assertThat(healthyResult.getNow()).isEqualTo(RntbdConstants.RntbdHealthCheckResults.SuccessValue);
// Verify under high CPU load, the transitTimeout will be reset
Mockito.verify(timestampsMock, Mockito.times(1)).resetTransitTimeout();
} else {
Future<Boolean> healthyResult = healthChecker.isHealthy(channelMock).sync();
assertThat(healthyResult.isSuccess()).isTrue();
assertThat(healthyResult.getNow()).isTrue();
// Verify under high CPU load, the transitTimeout will be reset
Mockito.verify(timestampsMock, Mockito.times(1)).resetTransitTimeout();
}
}
}
Expand Down Expand Up @@ -394,6 +399,7 @@ public void cancellationPronenessOfChannelWithHighCpuLoad_Test(boolean withFailu
Mockito.when(timestampsMock.lastChannelWriteAttemptTime()).thenReturn(lastChannelWriteTime);
Mockito.when(timestampsMock.transitTimeoutCount()).thenReturn(0);
Mockito.when(timestampsMock.cancellationCount()).thenReturn(config.cancellationCountSinceLastReadThreshold());
Mockito.doNothing().when(timestampsMock).resetCancellationCount();

try(MockedStatic<CpuMemoryMonitor> cpuMemoryMonitorMock = Mockito.mockStatic(CpuMemoryMonitor.class)) {
CpuLoadHistory cpuLoadHistoryMock = Mockito.mock(CpuLoadHistory.class);
Expand All @@ -404,10 +410,14 @@ public void cancellationPronenessOfChannelWithHighCpuLoad_Test(boolean withFailu
Future<String> healthyResult = healthChecker.isHealthyWithFailureReason(channelMock).sync();
assertThat(healthyResult.isSuccess()).isTrue();
assertThat(healthyResult.getNow()).isEqualTo(RntbdConstants.RntbdHealthCheckResults.SuccessValue);
// Verify under high CPU load, the cancellationCount will be reset
Mockito.verify(timestampsMock, Mockito.times(1)).resetCancellationCount();
} else {
Future<Boolean> healthyResult = healthChecker.isHealthy(channelMock).sync();
assertThat(healthyResult.isSuccess()).isTrue();
assertThat(healthyResult.getNow()).isTrue();
// Verify under high CPU load, the transitTimeout will be reset
Mockito.verify(timestampsMock, Mockito.times(1)).resetCancellationCount();
}
}
}
Expand Down
1 change: 1 addition & 0 deletions sdk/cosmos/azure-cosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Fixed a `NullPointerException` issue in `MetadataRequestRetryPolicy` when `locationEndpointToRoute` is not set. - See [PR 38094](https://github.com/Azure/azure-sdk-for-java/pull/38094)

#### Other Changes
* Reset `transitTimeoutCount` and `cancellationCount` in `RntbdClientChannelHealthChecker` when CPU load is above threshold. - See [PR 38157](https://github.com/Azure/azure-sdk-for-java/pull/38157)

### 4.53.1 (2023-12-06)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ public final class RntbdClientChannelHealthChecker implements ChannelHealthCheck
@JsonProperty
private final long writeDelayLimitInNanos;
@JsonProperty
private final long networkRequestTimeoutInNanos;
@JsonProperty
private final boolean timeoutDetectionEnabled;
@JsonProperty
private final double timeoutDetectionDisableCPUThreshold;
Expand All @@ -78,7 +76,6 @@ public final class RntbdClientChannelHealthChecker implements ChannelHealthCheck
@JsonProperty
private final int cancellationCountSinceLastReadThreshold;


// endregion

// region Constructors
Expand All @@ -98,7 +95,6 @@ public RntbdClientChannelHealthChecker(final Config config) {
this.idleConnectionTimeoutInNanos = config.idleConnectionTimeoutInNanos();
this.readDelayLimitInNanos = config.receiveHangDetectionTimeInNanos();
this.writeDelayLimitInNanos = config.sendHangDetectionTimeInNanos();
this.networkRequestTimeoutInNanos = config.tcpNetworkRequestTimeoutInNanos();
this.timeoutDetectionEnabled = config.timeoutDetectionEnabled();
this.timeoutDetectionDisableCPUThreshold = config.timeoutDetectionDisableCPUThreshold();
this.timeoutTimeLimitInNanos = config.timeoutDetectionTimeLimitInNanos();
Expand Down Expand Up @@ -327,6 +323,9 @@ private String transitTimeoutValidation(Timestamps timestamps, Instant currentTi
// When request timeout due to high CPU,
// close the existing the connection and re-establish a new one will not help the issue but rather make it worse, return fast
if (CpuMemoryMonitor.getCpuLoad().isCpuOverThreshold(this.timeoutDetectionDisableCPUThreshold)) {
// reset the transit timeout here
// else when the CPU back to below the threshold, the connection may still trigger a connection close right away
timestamps.resetTransitTimeout();
return transitTimeoutValidationMessage;
}

Expand Down Expand Up @@ -418,6 +417,9 @@ private String isCancellationProneChannel(Timestamps timestamps, Instant current
// When request cancellations are due to high CPU,
// close the existing the connection and re-establish a new one will not help the issue but rather make it worse, return fast
if (CpuMemoryMonitor.getCpuLoad().isCpuOverThreshold(this.timeoutDetectionDisableCPUThreshold)) {
// reset the cancellation count here
// else when the CPU back to below the threshold, the connection may still trigger a connection close right away
timestamps.resetCancellationCount();
return errorMessage;
}

Expand Down

0 comments on commit 2c940cf

Please sign in to comment.