Skip to content

Commit

Permalink
Remove cluster block preflight check from health api (#87520)
Browse files Browse the repository at this point in the history
Removes the logic at the start of the health service that returns UNKNOWN status in case 
the cluster state not recovered block is in place. Instead, we will defer directly to the 
master-is-stable indicator and all following preflight indicators.
  • Loading branch information
jbaiera authored Jun 8, 2022
1 parent 2ec59e7 commit ea5554d
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 189 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/87520.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 87520
summary: Remove cluster block preflight check from health api
area: Health
type: enhancement
issues:
- 87464
69 changes: 18 additions & 51 deletions server/src/main/java/org/elasticsearch/health/HealthService.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
package org.elasticsearch.health;

import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.gateway.GatewayService;

import java.util.Collections;
import java.util.HashSet;
Expand All @@ -37,23 +35,14 @@ public class HealthService {
// Visible for testing
static final String UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED = "Could not determine health status. Check details on critical issues "
+ "preventing the health status from reporting.";
static final String UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED =
"Could not determine health status. This node is not ready to assess the cluster health. Try again later or run the health API "
+ "against a different node.";

/**
* Detail map key that contains the reasons a result was marked as UNKNOWN
*/
private static final String REASON = "reasons";

private static final String CLUSTER_STATE_RECOVERED = "cluster_state_recovered";
private static final SimpleHealthIndicatorDetails DETAILS_UNKNOWN_STATE_NOT_RECOVERED = new SimpleHealthIndicatorDetails(
Map.of(REASON, Map.of(CLUSTER_STATE_RECOVERED, false))
);

private final List<HealthIndicatorService> preflightHealthIndicatorServices;
private final List<HealthIndicatorService> healthIndicatorServices;
private final ClusterService clusterService;

/**
* Creates a new HealthService.
Expand All @@ -68,12 +57,10 @@ public class HealthService {
*/
public HealthService(
List<HealthIndicatorService> preflightHealthIndicatorServices,
List<HealthIndicatorService> healthIndicatorServices,
ClusterService clusterService
List<HealthIndicatorService> healthIndicatorServices
) {
this.preflightHealthIndicatorServices = preflightHealthIndicatorServices;
this.healthIndicatorServices = healthIndicatorServices;
this.clusterService = clusterService;
}

/**
Expand All @@ -91,22 +78,10 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
final boolean shouldDrillDownToIndicatorLevel = indicatorName != null;
final boolean showRolledUpComponentStatus = shouldDrillDownToIndicatorLevel == false;

// Is the cluster state recovered? If not, ALL indicators should return UNKNOWN
boolean clusterStateRecovered = clusterService.state()
.getBlocks()
.hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK) == false;

List<HealthIndicatorResult> preflightResults;
if (clusterStateRecovered) {
// Determine if cluster is stable enough to calculate health before running other indicators
preflightResults = preflightHealthIndicatorServices.stream().map(service -> service.calculate(explain)).toList();
} else {
// Mark preflight indicators as UNKNOWN
HealthIndicatorDetails details = explain ? DETAILS_UNKNOWN_STATE_NOT_RECOVERED : HealthIndicatorDetails.EMPTY;
preflightResults = preflightHealthIndicatorServices.stream()
.map(service -> generateUnknownResult(service, UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED, details))
.toList();
}
// Determine if cluster is stable enough to calculate health before running other indicators
List<HealthIndicatorResult> preflightResults = preflightHealthIndicatorServices.stream()
.map(service -> service.calculate(explain))
.toList();

// If any of these are not GREEN, then we cannot obtain health from other indicators
boolean clusterHealthIsObtainable = preflightResults.isEmpty()
Expand All @@ -118,14 +93,15 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
.filter(service -> indicatorName == null || service.name().equals(indicatorName));

Stream<HealthIndicatorResult> filteredIndicatorResults;
if (clusterStateRecovered && clusterHealthIsObtainable) {
if (clusterHealthIsObtainable) {
// Calculate remaining indicators
filteredIndicatorResults = filteredIndicators.map(service -> service.calculate(explain));
} else {
// Mark remaining indicators as UNKNOWN
String unknownSummary = clusterStateRecovered ? UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED : UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED;
HealthIndicatorDetails unknownDetails = healthUnknownReason(preflightResults, clusterStateRecovered, explain);
filteredIndicatorResults = filteredIndicators.map(service -> generateUnknownResult(service, unknownSummary, unknownDetails));
HealthIndicatorDetails unknownDetails = healthUnknownReason(preflightResults, explain);
filteredIndicatorResults = filteredIndicators.map(
service -> generateUnknownResult(service, UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED, unknownDetails)
);
}

// Filter the cluster indicator results by component name and indicator name if present
Expand Down Expand Up @@ -166,25 +142,16 @@ public List<HealthComponentResult> getHealth(@Nullable String componentName, @Nu
* @param computeDetails If details should be calculated on which indicators are causing the UNKNOWN state.
* @return Details explaining why results are UNKNOWN, or an empty detail set if computeDetails is false.
*/
private HealthIndicatorDetails healthUnknownReason(
List<HealthIndicatorResult> preflightResults,
boolean clusterStateRecovered,
boolean computeDetails
) {
assert clusterStateRecovered == false || preflightResults.isEmpty() == false
: "Requires at least one non-GREEN preflight result or cluster state not recovered";
private HealthIndicatorDetails healthUnknownReason(List<HealthIndicatorResult> preflightResults, boolean computeDetails) {
assert preflightResults.isEmpty() == false : "Requires at least one non-GREEN preflight result";
HealthIndicatorDetails unknownDetails;
if (computeDetails) {
if (clusterStateRecovered) {
// Determine why the cluster is not stable enough for running remaining indicators
Map<String, String> clusterUnstableReasons = preflightResults.stream()
.filter(result -> HealthStatus.GREEN.equals(result.status()) == false)
.collect(toMap(HealthIndicatorResult::name, result -> result.status().xContentValue()));
assert clusterUnstableReasons.isEmpty() == false : "Requires at least one non-GREEN preflight result";
unknownDetails = new SimpleHealthIndicatorDetails(Map.of(REASON, clusterUnstableReasons));
} else {
unknownDetails = DETAILS_UNKNOWN_STATE_NOT_RECOVERED;
}
// Determine why the cluster is not stable enough for running remaining indicators
Map<String, String> clusterUnstableReasons = preflightResults.stream()
.filter(result -> HealthStatus.GREEN.equals(result.status()) == false)
.collect(toMap(HealthIndicatorResult::name, result -> result.status().xContentValue()));
assert clusterUnstableReasons.isEmpty() == false : "Requires at least one non-GREEN preflight result";
unknownDetails = new SimpleHealthIndicatorDetails(Map.of(REASON, clusterUnstableReasons));
} else {
unknownDetails = HealthIndicatorDetails.EMPTY;
}
Expand Down
3 changes: 1 addition & 2 deletions server/src/main/java/org/elasticsearch/node/Node.java
Original file line number Diff line number Diff line change
Expand Up @@ -1046,8 +1046,7 @@ private HealthService createHealthService(
.toList();
return new HealthService(
preflightHealthIndicatorServices,
concatLists(serverHealthIndicatorServices, pluginHealthIndicatorServices),
clusterService
concatLists(serverHealthIndicatorServices, pluginHealthIndicatorServices)
);
}

Expand Down
141 changes: 5 additions & 136 deletions server/src/test/java/org/elasticsearch/health/HealthServiceTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@
package org.elasticsearch.health;

import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlocks;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.test.ESTestCase;

import java.util.Collections;
Expand All @@ -30,9 +25,6 @@
import static org.hamcrest.Matchers.hasItems;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.oneOf;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

public class HealthServiceTests extends ESTestCase {

Expand All @@ -48,8 +40,7 @@ public void testShouldReturnGroupedIndicators() {
createMockHealthIndicatorService(networkLatency),
createMockHealthIndicatorService(slowTasks),
createMockHealthIndicatorService(shardsAvailable)
),
mockEmptyClusterService()
)
);

assertThat(
Expand Down Expand Up @@ -116,8 +107,7 @@ public void testMissingComponentOrIndicator() {
createMockHealthIndicatorService(networkLatency),
createMockHealthIndicatorService(slowTasks),
createMockHealthIndicatorService(shardsAvailable)
),
mockEmptyClusterService()
)
);

expectThrows(
Expand Down Expand Up @@ -147,8 +137,7 @@ public void testPreflightIndicatorResultsPresent() {
createMockHealthIndicatorService(networkLatency),
createMockHealthIndicatorService(slowTasks),
createMockHealthIndicatorService(shardsAvailable)
),
mockEmptyClusterService()
)
);

// Get all indicators returns preflight result mixed in with appropriate component
Expand Down Expand Up @@ -210,10 +199,7 @@ public void testPreflightIndicatorResultsPresent() {

private void assertIndicatorIsUnknownStatus(HealthIndicatorResult result) {
assertThat(result.status(), is(equalTo(UNKNOWN)));
assertThat(
result.summary(),
is(oneOf(HealthService.UNKNOWN_RESULT_SUMMARY_NOT_RECOVERED, HealthService.UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED))
);
assertThat(result.summary(), is(HealthService.UNKNOWN_RESULT_SUMMARY_PREFLIGHT_FAILED));
}

public void testPreflightIndicatorFailureTriggersUnknownResults() {
Expand All @@ -231,8 +217,7 @@ public void testPreflightIndicatorFailureTriggersUnknownResults() {
createMockHealthIndicatorService(networkLatency),
createMockHealthIndicatorService(slowTasks),
createMockHealthIndicatorService(shardsAvailable)
),
mockEmptyClusterService()
)
);

List<HealthComponentResult> health = service.getHealth(null, null, false);
Expand Down Expand Up @@ -321,113 +306,6 @@ public void testPreflightIndicatorFailureTriggersUnknownResults() {
}
}

public void testAllIndicatorsUnknownWhenClusterStateNotRecovered() {
var hasMaster = new HealthIndicatorResult("has_master", "coordination", RED, null, null, null, null, null);
var hasStorage = new HealthIndicatorResult("has_storage", "data", GREEN, null, null, null, null, null);
var networkLatency = new HealthIndicatorResult("network_latency", "coordination", GREEN, null, null, null, null, null);
var slowTasks = new HealthIndicatorResult("slow_task_assignment", "coordination", YELLOW, null, null, null, null, null);
var shardsAvailable = new HealthIndicatorResult("shards_availability", "data", GREEN, null, null, null, null, null);

var service = new HealthService(
List.of(createMockHealthIndicatorService(hasMaster), createMockHealthIndicatorService(hasStorage)),
List.of(
createMockHealthIndicatorService(networkLatency),
createMockHealthIndicatorService(slowTasks),
createMockHealthIndicatorService(shardsAvailable)
),
mockClusterService(
ClusterState.builder(new ClusterName("test-cluster"))
.blocks(ClusterBlocks.builder().addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK).build())
.build()
)
);

List<HealthComponentResult> health = service.getHealth(null, null, false);
assertThat(health.size(), is(equalTo(2)));
{
HealthComponentResult component1 = health.stream()
.filter(result -> result.name().equals("coordination"))
.findAny()
.orElseThrow();
// UNKNOWN because cluster state not recovered
assertThat(component1.status(), is(equalTo(UNKNOWN)));
assertThat(component1.indicators(), is(notNullValue()));
assertThat(component1.indicators().size(), is(equalTo(3)));
// Preflight 1 should be UNKNOWN
HealthIndicatorResult hasMasterResult = component1.findIndicator("has_master");
assertIndicatorIsUnknownStatus(hasMasterResult);
// Indicator 1 should be UNKNOWN
HealthIndicatorResult networkLatencyResult = component1.findIndicator("network_latency");
assertIndicatorIsUnknownStatus(networkLatencyResult);
// Indicator 2 should be UNKNOWN
HealthIndicatorResult slowTasksResult = component1.findIndicator("slow_task_assignment");
assertIndicatorIsUnknownStatus(slowTasksResult);
}
{
HealthComponentResult component2 = health.stream().filter(result -> result.name().equals("data")).findAny().orElseThrow();
// UNKNOWN because cluster state not recovered
assertThat(component2.status(), is(equalTo(UNKNOWN)));
assertThat(component2.indicators(), is(notNullValue()));
assertThat(component2.indicators().size(), is(equalTo(2)));
// Preflight 2 should be UNKNOWN
HealthIndicatorResult hasStorageResult = component2.findIndicator("has_storage");
assertIndicatorIsUnknownStatus(hasStorageResult);
// Indicator 3 should be UNKNOWN
HealthIndicatorResult shardsAvailableResult = component2.findIndicator("shards_availability");
assertIndicatorIsUnknownStatus(shardsAvailableResult);
}

health = service.getHealth("coordination", null, false);
assertThat(health.size(), is(equalTo(1)));
{
HealthComponentResult component1 = health.stream()
.filter(result -> result.name().equals("coordination"))
.findAny()
.orElseThrow();
// UNKNOWN because cluster state not recovered
assertThat(component1.status(), is(equalTo(UNKNOWN)));
assertThat(component1.indicators(), is(notNullValue()));
assertThat(component1.indicators().size(), is(equalTo(3)));
// Preflight 1 should be UNKNOWN
HealthIndicatorResult hasMasterResult = component1.findIndicator("has_master");
assertIndicatorIsUnknownStatus(hasMasterResult);
// Indicator 1 should be UNKNOWN
HealthIndicatorResult networkLatencyResult = component1.findIndicator("network_latency");
assertIndicatorIsUnknownStatus(networkLatencyResult);
// Indicator 2 should be UNKNOWN
HealthIndicatorResult slowTasksResult = component1.findIndicator("slow_task_assignment");
assertIndicatorIsUnknownStatus(slowTasksResult);
}

health = service.getHealth("coordination", "slow_task_assignment", false);
assertThat(health.size(), is(equalTo(1)));
{
HealthComponentResult component1 = health.stream()
.filter(result -> result.name().equals("coordination"))
.findAny()
.orElseThrow();
assertThat(component1.indicators(), is(notNullValue()));
assertThat(component1.indicators().size(), is(equalTo(1)));
// Indicator 2 should be UNKNOWN
HealthIndicatorResult slowTasksResult = component1.findIndicator("slow_task_assignment");
assertIndicatorIsUnknownStatus(slowTasksResult);
}

health = service.getHealth("coordination", "has_master", false);
assertThat(health.size(), is(equalTo(1)));
{
HealthComponentResult component1 = health.stream()
.filter(result -> result.name().equals("coordination"))
.findAny()
.orElseThrow();
assertThat(component1.indicators(), is(notNullValue()));
assertThat(component1.indicators().size(), is(equalTo(1)));
// Preflight 1 should be UNKNOWN
HealthIndicatorResult hasMasterResult = component1.findIndicator("has_master");
assertIndicatorIsUnknownStatus(hasMasterResult);
}
}

private static HealthIndicatorService createMockHealthIndicatorService(HealthIndicatorResult result) {
return new HealthIndicatorService() {
@Override
Expand All @@ -452,13 +330,4 @@ public HealthIndicatorResult calculate(boolean explain) {
};
}

private static ClusterService mockEmptyClusterService() {
return mockClusterService(ClusterState.EMPTY_STATE);
}

private static ClusterService mockClusterService(ClusterState clusterState) {
var clusterService = mock(ClusterService.class);
when(clusterService.state()).thenReturn(clusterState);
return clusterService;
}
}

0 comments on commit ea5554d

Please sign in to comment.