From 1e60ceff3aded0889938fca26f7c20bf6c7b46d0 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Tue, 5 Oct 2021 16:09:35 -0600 Subject: [PATCH 1/6] Increase node shutdown logging level when stalled --- .../TransportGetShutdownStatusAction.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java index 81675b85d1af2..96206091c25b5 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java @@ -262,17 +262,15 @@ static ShutdownShardMigrationStatus shardMigrationStatus( return hasShardCopyOnOtherNode == false; }) .peek(pair -> { - if (logger.isTraceEnabled()) { // don't serialize the decision unless we have to - logger.trace( - "node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]", - nodeId, - shutdownType, - pair.v1().getId(), - pair.v1().primary() ? "primary" : "replica", - pair.v1().shardId().getIndexName(), - Strings.toString(pair.v2()) - ); - } + logger.info( + "node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]", + nodeId, + shutdownType, + pair.v1().getId(), + pair.v1().primary() ? "primary" : "replica", + pair.v1().shardId().getIndexName(), + Strings.toString(pair.v2()) + ); }) .findFirst(); From e781b8196ceaea2564d2cf7101ebd7e0a4f0fd2f Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Tue, 5 Oct 2021 16:10:19 -0600 Subject: [PATCH 2/6] Add allocation explanation to STALLED response --- .../ShutdownShardMigrationStatus.java | 38 +++++++++++++++++-- .../TransportGetShutdownStatusAction.java | 4 +- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java b/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java index bc8f93c0706af..3e6a3c187b248 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java @@ -8,6 +8,8 @@ package org.elasticsearch.cluster.metadata; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision; import org.elasticsearch.common.Strings; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -20,25 +22,42 @@ import java.util.Objects; public class ShutdownShardMigrationStatus implements Writeable, ToXContentObject { + private static final Version ALLOCATION_DECISION_ADDED_VERSION = Version.V_8_0_0; private final SingleNodeShutdownMetadata.Status status; private final long shardsRemaining; @Nullable private final String explanation; + @Nullable private final ShardAllocationDecision allocationDecision; public ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status status, long shardsRemaining) { - this(status, shardsRemaining, null); + this(status, shardsRemaining, null, null); } public ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status status, long shardsRemaining, @Nullable String explanation) { + this(status, shardsRemaining, explanation, null); + } + + public ShutdownShardMigrationStatus( + SingleNodeShutdownMetadata.Status status, + long shardsRemaining, + @Nullable String explanation, + @Nullable ShardAllocationDecision allocationDecision + ) { this.status = Objects.requireNonNull(status, "status must not be null"); this.shardsRemaining = shardsRemaining; this.explanation = explanation; + this.allocationDecision = allocationDecision; } public ShutdownShardMigrationStatus(StreamInput in) throws IOException { this.status = in.readEnum(SingleNodeShutdownMetadata.Status.class); this.shardsRemaining = in.readLong(); this.explanation = in.readOptionalString(); + if (in.getVersion().onOrAfter(ALLOCATION_DECISION_ADDED_VERSION)) { + this.allocationDecision = in.readOptionalWriteable(ShardAllocationDecision::new); + } else { + this.allocationDecision = null; + } } public long getShardsRemaining() { @@ -61,6 +80,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (Objects.nonNull(explanation)) { builder.field("explanation", explanation); } + if (Objects.nonNull(allocationDecision)) { + builder.startObject("node_allocation_decisions"); + { + allocationDecision.toXContent(builder, params); + } + builder.endObject(); + } builder.endObject(); return builder; } @@ -70,6 +96,9 @@ public void writeTo(StreamOutput out) throws IOException { out.writeEnum(status); out.writeLong(shardsRemaining); out.writeOptionalString(explanation); + if (out.getVersion().onOrAfter(ALLOCATION_DECISION_ADDED_VERSION)) { + out.writeOptionalWriteable(allocationDecision); + } } @Override @@ -77,12 +106,15 @@ public boolean equals(Object o) { if (this == o) return true; if ((o instanceof ShutdownShardMigrationStatus) == false) return false; ShutdownShardMigrationStatus that = (ShutdownShardMigrationStatus) o; - return shardsRemaining == that.shardsRemaining && status == that.status && Objects.equals(explanation, that.explanation); + return shardsRemaining == that.shardsRemaining + && status == that.status + && Objects.equals(explanation, that.explanation) + && Objects.equals(allocationDecision, that.allocationDecision); } @Override public int hashCode() { - return Objects.hash(status, shardsRemaining, explanation); + return Objects.hash(status, shardsRemaining, explanation, allocationDecision); } @Override diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java index 96206091c25b5..58a655f52f2ba 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java @@ -285,6 +285,7 @@ static ShutdownShardMigrationStatus shardMigrationStatus( } else if (unmovableShard.isPresent()) { // We found a shard that can't be moved, so shard relocation is stalled. Blame the unmovable shard. ShardRouting shardRouting = unmovableShard.get().v1(); + ShardAllocationDecision decision = unmovableShard.get().v2(); return new ShutdownShardMigrationStatus( SingleNodeShutdownMetadata.Status.STALLED, @@ -294,7 +295,8 @@ static ShutdownShardMigrationStatus shardMigrationStatus( shardRouting.shardId().getId(), shardRouting.primary() ? "primary" : "replica", shardRouting.index().getName() - ).getFormattedMessage() + ).getFormattedMessage(), + decision ); } else { return new ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status.IN_PROGRESS, totalRemainingShards); From e34a5331887617d7bbf00a535409c000caeb64c7 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Wed, 6 Oct 2021 16:23:53 -0600 Subject: [PATCH 3/6] Adjust logging levels per review --- .../xpack/shutdown/TransportDeleteShutdownNodeAction.java | 2 ++ .../xpack/shutdown/TransportGetShutdownStatusAction.java | 2 +- .../xpack/shutdown/TransportPutShutdownNodeAction.java | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportDeleteShutdownNodeAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportDeleteShutdownNodeAction.java index 06777af114911..b45077830b3df 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportDeleteShutdownNodeAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportDeleteShutdownNodeAction.java @@ -71,6 +71,8 @@ protected void masterOperation( public ClusterState execute(ClusterState currentState) throws Exception { NodesShutdownMetadata currentShutdownMetadata = currentState.metadata().custom(NodesShutdownMetadata.TYPE); + logger.info("removing shutdown record for node [{}]", request.getNodeId()); + return ClusterState.builder(currentState) .metadata( Metadata.builder(currentState.metadata()) diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java index 58a655f52f2ba..01fba0a336a63 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java @@ -262,7 +262,7 @@ static ShutdownShardMigrationStatus shardMigrationStatus( return hasShardCopyOnOtherNode == false; }) .peek(pair -> { - logger.info( + logger.debug( "node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]", nodeId, shutdownType, diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java index 6d95c0eac8779..5e80be34aa35e 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java @@ -80,6 +80,13 @@ public ClusterState execute(ClusterState currentState) { request.getType(), request.getReason() ); + } else { + logger.info( + "creating shutdown record for node [{}] of type [{}] with reason [{}]", + request.getNodeId(), + request.getType(), + request.getReason() + ); } final boolean nodeSeen = currentState.getNodes().nodeExists(request.getNodeId()); From 37e0250eca4a6e19252207fd271c6438f7b27668 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Wed, 6 Oct 2021 18:13:49 -0600 Subject: [PATCH 4/6] Include all SingleNodeShutdownMetadata fields in logs --- .../metadata/SingleNodeShutdownMetadata.java | 28 +++++++++++++++++++ .../TransportPutShutdownNodeAction.java | 28 ++++++------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java index d2cf25f219303..ba67125ebd9cd 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java @@ -255,6 +255,34 @@ public int hashCode() { ); } + @Override public String toString() { + final StringBuilder stringBuilder = new StringBuilder(); + stringBuilder + .append("{") + .append("nodeId=[") + .append(nodeId) + .append(']') + .append(", type=[") + .append(type) + .append("], reason=[") + .append(reason) + .append(']'); + if (allocationDelay != null) { + stringBuilder + .append(", allocationDelay=[") + .append(allocationDelay) + .append("]"); + } + if (targetNodeName != null) { + stringBuilder + .append(", targetNodeName=[") + .append(targetNodeName) + .append("]"); + } + stringBuilder.append("}"); + return stringBuilder.toString(); + } + public static Builder builder() { return new Builder(); } diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java index 5e80be34aa35e..01b42a56c2fcb 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java @@ -69,26 +69,6 @@ public ClusterState execute(ClusterState currentState) { currentShutdownMetadata = new NodesShutdownMetadata(new HashMap<>()); } - // Verify that there's not already a shutdown metadata for this node - SingleNodeShutdownMetadata existingRecord = currentShutdownMetadata.getAllNodeMetadataMap().get(request.getNodeId()); - if (existingRecord != null) { - logger.info( - "updating existing shutdown record for node [{}] of type [{}] with reason [{}] with new type [{}] and reason [{}]", - existingRecord.getNodeId(), - existingRecord.getType(), - existingRecord.getReason(), - request.getType(), - request.getReason() - ); - } else { - logger.info( - "creating shutdown record for node [{}] of type [{}] with reason [{}]", - request.getNodeId(), - request.getType(), - request.getReason() - ); - } - final boolean nodeSeen = currentState.getNodes().nodeExists(request.getNodeId()); SingleNodeShutdownMetadata newNodeMetadata = SingleNodeShutdownMetadata.builder() @@ -101,6 +81,14 @@ public ClusterState execute(ClusterState currentState) { .setTargetNodeName(request.getTargetNodeName()) .build(); + // Verify that there's not already a shutdown metadata for this node + SingleNodeShutdownMetadata existingRecord = currentShutdownMetadata.getAllNodeMetadataMap().get(request.getNodeId()); + if (existingRecord != null) { + logger.info("updating existing shutdown record {} with new record {}", existingRecord, newNodeMetadata); + } else { + logger.info("creating shutdown record {}", newNodeMetadata); + } + return ClusterState.builder(currentState) .metadata( Metadata.builder(currentState.metadata()) From b63751443dc53a9da29f23c0415e1f5bd71bd632 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Wed, 6 Oct 2021 18:15:23 -0600 Subject: [PATCH 5/6] Depluralize `node_shutdown_decisions` per review --- .../cluster/metadata/ShutdownShardMigrationStatus.java | 2 +- .../java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java b/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java index 3e6a3c187b248..988e6dde17eef 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java @@ -81,7 +81,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field("explanation", explanation); } if (Objects.nonNull(allocationDecision)) { - builder.startObject("node_allocation_decisions"); + builder.startObject("node_allocation_decision"); { allocationDecision.toXContent(builder, params); } diff --git a/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java b/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java index 421ec0a9be37b..3015c1949b766 100644 --- a/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java +++ b/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java @@ -174,7 +174,7 @@ public void testAllocationPreventedForRemoval() throws Exception { Request allocationExplainRequest = new Request("GET", "_cluster/allocation/explain"); allocationExplainRequest.setJsonEntity("{\"index\": \"" + indexName + "\", \"shard\": 0, \"primary\": false}"); Map allocationExplainMap = entityAsMap(client().performRequest(allocationExplainRequest)); - List> decisions = (List>) allocationExplainMap.get("node_allocation_decisions"); + List> decisions = (List>) allocationExplainMap.get("node_allocation_decision"); assertThat(decisions, notNullValue()); Optional> maybeDecision = decisions.stream() From 069079b3355ad8620f87a3ba732a7add0253aa72 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Thu, 7 Oct 2021 15:17:18 -0600 Subject: [PATCH 6/6] Repluralize node_allocation_decisions when we're actually reading from the Allocation Explain API --- .../java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java b/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java index 3015c1949b766..421ec0a9be37b 100644 --- a/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java +++ b/x-pack/plugin/shutdown/qa/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownIT.java @@ -174,7 +174,7 @@ public void testAllocationPreventedForRemoval() throws Exception { Request allocationExplainRequest = new Request("GET", "_cluster/allocation/explain"); allocationExplainRequest.setJsonEntity("{\"index\": \"" + indexName + "\", \"shard\": 0, \"primary\": false}"); Map allocationExplainMap = entityAsMap(client().performRequest(allocationExplainRequest)); - List> decisions = (List>) allocationExplainMap.get("node_allocation_decision"); + List> decisions = (List>) allocationExplainMap.get("node_allocation_decisions"); assertThat(decisions, notNullValue()); Optional> maybeDecision = decisions.stream()