Skip to content

Commit

Permalink
Fix Stacktraces Taking Memory in ILM Error Step Serialization (#84266)
Browse files Browse the repository at this point in the history
Don't serialize stack-traces to ILM error steps. This will take multiple kb
per index in the CS. In bad cases where an error might affect thousands of
indices this can mean hundreds of MB of heap memory used on every node in
the cluster just to keep the serialized stack-traces around in the CS
that provide limited value anyway.
  • Loading branch information
original-brownbear authored Mar 10, 2022
1 parent 01c5bc0 commit e3a8475
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 39 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/84266.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 84266
summary: Fix Stacktraces Taking Memory in ILM Error Step Serialization
area: ILM+SLM
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ public void handleFailure(Exception e) {
logger.warn(new ParameterizedMessage("policy [{}] for index [{}] failed on step [{}].", policy, index, startStep.getKey()), e);
}

private ClusterState moveToErrorStep(final ClusterState state, Step.StepKey currentStepKey, Exception cause) throws IOException {
private ClusterState moveToErrorStep(final ClusterState state, Step.StepKey currentStepKey, Exception cause) {
this.failure = cause;
logger.warn(
"policy [{}] for index [{}] failed on cluster state step [{}]. Moving to ERROR step",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,10 @@
import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.license.XPackLicenseState;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.ToXContentObject;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.json.JsonXContent;
import org.elasticsearch.xpack.core.ilm.ErrorStep;
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
Expand All @@ -38,8 +34,6 @@
import org.elasticsearch.xpack.core.ilm.Step;
import org.elasticsearch.xpack.core.ilm.TerminalPolicyStep;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
Expand All @@ -48,8 +42,8 @@
import java.util.function.BiFunction;
import java.util.function.LongSupplier;

import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE;
import static org.elasticsearch.cluster.metadata.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY;
import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;

/**
* The {@link IndexLifecycleTransition} class handles cluster state transitions
Expand All @@ -61,9 +55,6 @@
*/
public final class IndexLifecycleTransition {
private static final Logger logger = LogManager.getLogger(IndexLifecycleTransition.class);
private static final ToXContent.Params STACKTRACE_PARAMS = new ToXContent.MapParams(
Collections.singletonMap(REST_EXCEPTION_SKIP_STACK_TRACE, "false")
);

/**
* Validates that the given transition from {@code currentStepKey} to {@code newStepKey} can be accomplished
Expand Down Expand Up @@ -158,14 +149,10 @@ static ClusterState moveClusterStateToErrorStep(
Exception cause,
LongSupplier nowSupplier,
BiFunction<IndexMetadata, Step.StepKey, Step> stepLookupFunction
) throws IOException {
) {
IndexMetadata idxMeta = clusterState.getMetadata().index(index);
IndexLifecycleMetadata ilmMeta = clusterState.metadata().custom(IndexLifecycleMetadata.TYPE);
LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas().get(idxMeta.getLifecyclePolicyName());
XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder();
causeXContentBuilder.startObject();
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
causeXContentBuilder.endObject();
LifecycleExecutionState currentState = idxMeta.getLifecycleExecutionState();
Step.StepKey currentStep;
// if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
Expand All @@ -189,7 +176,10 @@ static ClusterState moveClusterStateToErrorStep(

LifecycleExecutionState.Builder failedState = LifecycleExecutionState.builder(nextStepState);
failedState.setFailedStep(currentStep.getName());
failedState.setStepInfo(BytesReference.bytes(causeXContentBuilder).utf8ToString());
failedState.setStepInfo(Strings.toString(((builder, params) -> {
ElasticsearchException.generateThrowableXContent(builder, EMPTY_PARAMS, cause);
return builder;
})));
Step failedStep = stepLookupFunction.apply(idxMeta, currentStep);

if (failedStep != null) {
Expand Down Expand Up @@ -444,20 +434,15 @@ public static ClusterState.Builder newClusterStateWithLifecycleState(
* @param stepInfo the new step info to update
* @return Updated cluster state with <code>stepInfo</code> if changed, otherwise the same cluster state
* if no changes to step info exist
* @throws IOException if parsing step info fails
*/
static ClusterState addStepInfoToClusterState(Index index, ClusterState clusterState, ToXContentObject stepInfo) throws IOException {
static ClusterState addStepInfoToClusterState(Index index, ClusterState clusterState, ToXContentObject stepInfo) {
IndexMetadata indexMetadata = clusterState.getMetadata().index(index);
if (indexMetadata == null) {
// This index doesn't exist anymore, we can't do anything
return clusterState;
}
LifecycleExecutionState lifecycleState = indexMetadata.getLifecycleExecutionState();
final String stepInfoString;
try (XContentBuilder infoXContentBuilder = JsonXContent.contentBuilder()) {
stepInfo.toXContent(infoXContentBuilder, ToXContent.EMPTY_PARAMS);
stepInfoString = BytesReference.bytes(infoXContentBuilder).utf8ToString();
}
final String stepInfoString = Strings.toString(stepInfo);
if (stepInfoString.equals(lifecycleState.stepInfo())) {
return clusterState;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import org.elasticsearch.index.Index;
import org.elasticsearch.xpack.core.ilm.Step;

import java.io.IOException;
import java.util.function.BiFunction;
import java.util.function.Consumer;
import java.util.function.LongSupplier;
Expand Down Expand Up @@ -56,7 +55,7 @@ public MoveToErrorStepUpdateTask(
}

@Override
public ClusterState execute(ClusterState currentState) throws IOException {
public ClusterState execute(ClusterState currentState) {
IndexMetadata idxMeta = currentState.getMetadata().index(index);
if (idxMeta == null) {
// Index must have been since deleted, ignore it
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ public void testClusterActionStepThrowsException() throws Exception {
assertThat(lifecycleState.phaseTime(), nullValue());
assertThat(lifecycleState.actionTime(), nullValue());
assertThat(lifecycleState.stepInfo(), containsString("""
{"type":"runtime_exception","reason":"error","stack_trace":\""""));
{"type":"runtime_exception","reason":"error\""""));
}

public void testClusterWaitStepThrowsException() throws Exception {
Expand All @@ -322,7 +322,7 @@ public void testClusterWaitStepThrowsException() throws Exception {
assertThat(lifecycleState.phaseTime(), nullValue());
assertThat(lifecycleState.actionTime(), nullValue());
assertThat(lifecycleState.stepInfo(), containsString("""
{"type":"runtime_exception","reason":"error","stack_trace":\""""));
{"type":"runtime_exception","reason":"error\""""));
}

private void setStateToKey(StepKey stepKey) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ public void testMoveClusterStateToErrorStep() throws IOException {
(idxMeta, stepKey) -> new MockStep(stepKey, nextStepKey)
);
assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now, """
{"type":"illegal_argument_exception","reason":"non elasticsearch-exception","stack_trace":\"""");
{"type":"illegal_argument_exception","reason":"non elasticsearch-exception\"""");
}

public void testAddStepInfoToClusterState() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,9 @@
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.json.JsonXContent;
import org.elasticsearch.xpack.core.ilm.ErrorStep;
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
import org.elasticsearch.xpack.core.ilm.LifecyclePolicy;
Expand Down Expand Up @@ -98,13 +94,8 @@ public void testExecuteSuccessfullyMoved() throws IOException {
assertThat(lifecycleState.actionTime(), nullValue());
assertThat(lifecycleState.stepTime(), equalTo(now));

XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder();
causeXContentBuilder.startObject();
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, ToXContent.EMPTY_PARAMS, cause);
causeXContentBuilder.endObject();
String expectedCauseValue = BytesReference.bytes(causeXContentBuilder).utf8ToString();
assertThat(lifecycleState.stepInfo(), containsString("""
{"type":"exception","reason":"THIS IS AN EXPECTED CAUSE","stack_trace":\""""));
{"type":"exception","reason":"THIS IS AN EXPECTED CAUSE\""""));
}

public void testExecuteNoopDifferentStep() throws IOException {
Expand Down

0 comments on commit e3a8475

Please sign in to comment.