From e3a847562c6a43584a003da11917e0ff25a21e90 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 10 Mar 2022 12:16:31 +0100 Subject: [PATCH] Fix Stacktraces Taking Memory in ILM Error Step Serialization (#84266) Don't serialize stack-traces to ILM error steps. This will take multiple kb per index in the CS. In bad cases where an error might affect thousands of indices this can mean hundreds of MB of heap memory used on every node in the cluster just to keep the serialized stack-traces around in the CS that provide limited value anyway. --- docs/changelog/84266.yaml | 5 +++ .../xpack/ilm/ExecuteStepsUpdateTask.java | 2 +- .../xpack/ilm/IndexLifecycleTransition.java | 31 +++++-------------- .../xpack/ilm/MoveToErrorStepUpdateTask.java | 3 +- .../ilm/ExecuteStepsUpdateTaskTests.java | 4 +-- .../ilm/IndexLifecycleTransitionTests.java | 2 +- .../ilm/MoveToErrorStepUpdateTaskTests.java | 11 +------ 7 files changed, 19 insertions(+), 39 deletions(-) create mode 100644 docs/changelog/84266.yaml diff --git a/docs/changelog/84266.yaml b/docs/changelog/84266.yaml new file mode 100644 index 0000000000000..212d63338d98b --- /dev/null +++ b/docs/changelog/84266.yaml @@ -0,0 +1,5 @@ +pr: 84266 +summary: Fix Stacktraces Taking Memory in ILM Error Step Serialization +area: ILM+SLM +type: bug +issues: [] diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java index 188bdc0a16042..444772e03ed3f 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java @@ -271,7 +271,7 @@ public void handleFailure(Exception e) { logger.warn(new ParameterizedMessage("policy [{}] for index [{}] failed on step [{}].", policy, index, startStep.getKey()), e); } - private ClusterState moveToErrorStep(final ClusterState state, Step.StepKey currentStepKey, Exception cause) throws IOException { + private ClusterState moveToErrorStep(final ClusterState state, Step.StepKey currentStepKey, Exception cause) { this.failure = cause; logger.warn( "policy [{}] for index [{}] failed on cluster state step [{}]. Moving to ERROR step", diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java index 6c91c78d3644b..1a7f53b377aa8 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java @@ -17,14 +17,10 @@ import org.elasticsearch.cluster.metadata.LifecycleExecutionState; import org.elasticsearch.cluster.metadata.Metadata; import org.elasticsearch.common.Strings; -import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.license.XPackLicenseState; -import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.ToXContentObject; -import org.elasticsearch.xcontent.XContentBuilder; -import org.elasticsearch.xcontent.json.JsonXContent; import org.elasticsearch.xpack.core.ilm.ErrorStep; import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata; import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep; @@ -38,8 +34,6 @@ import org.elasticsearch.xpack.core.ilm.Step; import org.elasticsearch.xpack.core.ilm.TerminalPolicyStep; -import java.io.IOException; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Objects; @@ -48,8 +42,8 @@ import java.util.function.BiFunction; import java.util.function.LongSupplier; -import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE; import static org.elasticsearch.cluster.metadata.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY; +import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS; /** * The {@link IndexLifecycleTransition} class handles cluster state transitions @@ -61,9 +55,6 @@ */ public final class IndexLifecycleTransition { private static final Logger logger = LogManager.getLogger(IndexLifecycleTransition.class); - private static final ToXContent.Params STACKTRACE_PARAMS = new ToXContent.MapParams( - Collections.singletonMap(REST_EXCEPTION_SKIP_STACK_TRACE, "false") - ); /** * Validates that the given transition from {@code currentStepKey} to {@code newStepKey} can be accomplished @@ -158,14 +149,10 @@ static ClusterState moveClusterStateToErrorStep( Exception cause, LongSupplier nowSupplier, BiFunction stepLookupFunction - ) throws IOException { + ) { IndexMetadata idxMeta = clusterState.getMetadata().index(index); IndexLifecycleMetadata ilmMeta = clusterState.metadata().custom(IndexLifecycleMetadata.TYPE); LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas().get(idxMeta.getLifecyclePolicyName()); - XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder(); - causeXContentBuilder.startObject(); - ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause); - causeXContentBuilder.endObject(); LifecycleExecutionState currentState = idxMeta.getLifecycleExecutionState(); Step.StepKey currentStep; // if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information @@ -189,7 +176,10 @@ static ClusterState moveClusterStateToErrorStep( LifecycleExecutionState.Builder failedState = LifecycleExecutionState.builder(nextStepState); failedState.setFailedStep(currentStep.getName()); - failedState.setStepInfo(BytesReference.bytes(causeXContentBuilder).utf8ToString()); + failedState.setStepInfo(Strings.toString(((builder, params) -> { + ElasticsearchException.generateThrowableXContent(builder, EMPTY_PARAMS, cause); + return builder; + }))); Step failedStep = stepLookupFunction.apply(idxMeta, currentStep); if (failedStep != null) { @@ -444,20 +434,15 @@ public static ClusterState.Builder newClusterStateWithLifecycleState( * @param stepInfo the new step info to update * @return Updated cluster state with stepInfo if changed, otherwise the same cluster state * if no changes to step info exist - * @throws IOException if parsing step info fails */ - static ClusterState addStepInfoToClusterState(Index index, ClusterState clusterState, ToXContentObject stepInfo) throws IOException { + static ClusterState addStepInfoToClusterState(Index index, ClusterState clusterState, ToXContentObject stepInfo) { IndexMetadata indexMetadata = clusterState.getMetadata().index(index); if (indexMetadata == null) { // This index doesn't exist anymore, we can't do anything return clusterState; } LifecycleExecutionState lifecycleState = indexMetadata.getLifecycleExecutionState(); - final String stepInfoString; - try (XContentBuilder infoXContentBuilder = JsonXContent.contentBuilder()) { - stepInfo.toXContent(infoXContentBuilder, ToXContent.EMPTY_PARAMS); - stepInfoString = BytesReference.bytes(infoXContentBuilder).utf8ToString(); - } + final String stepInfoString = Strings.toString(stepInfo); if (stepInfoString.equals(lifecycleState.stepInfo())) { return clusterState; } diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java index c848300f8b3a6..cbf13bcf6104c 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java @@ -20,7 +20,6 @@ import org.elasticsearch.index.Index; import org.elasticsearch.xpack.core.ilm.Step; -import java.io.IOException; import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.LongSupplier; @@ -56,7 +55,7 @@ public MoveToErrorStepUpdateTask( } @Override - public ClusterState execute(ClusterState currentState) throws IOException { + public ClusterState execute(ClusterState currentState) { IndexMetadata idxMeta = currentState.getMetadata().index(index); if (idxMeta == null) { // Index must have been since deleted, ignore it diff --git a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTaskTests.java b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTaskTests.java index 2f8a35645d65b..97a57729735a7 100644 --- a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTaskTests.java +++ b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTaskTests.java @@ -303,7 +303,7 @@ public void testClusterActionStepThrowsException() throws Exception { assertThat(lifecycleState.phaseTime(), nullValue()); assertThat(lifecycleState.actionTime(), nullValue()); assertThat(lifecycleState.stepInfo(), containsString(""" - {"type":"runtime_exception","reason":"error","stack_trace":\"""")); + {"type":"runtime_exception","reason":"error\"""")); } public void testClusterWaitStepThrowsException() throws Exception { @@ -322,7 +322,7 @@ public void testClusterWaitStepThrowsException() throws Exception { assertThat(lifecycleState.phaseTime(), nullValue()); assertThat(lifecycleState.actionTime(), nullValue()); assertThat(lifecycleState.stepInfo(), containsString(""" - {"type":"runtime_exception","reason":"error","stack_trace":\"""")); + {"type":"runtime_exception","reason":"error\"""")); } private void setStateToKey(StepKey stepKey) throws IOException { diff --git a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransitionTests.java b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransitionTests.java index 323c6614d8c94..b58f3659a01a6 100644 --- a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransitionTests.java +++ b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransitionTests.java @@ -346,7 +346,7 @@ public void testMoveClusterStateToErrorStep() throws IOException { (idxMeta, stepKey) -> new MockStep(stepKey, nextStepKey) ); assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now, """ - {"type":"illegal_argument_exception","reason":"non elasticsearch-exception","stack_trace":\""""); + {"type":"illegal_argument_exception","reason":"non elasticsearch-exception\""""); } public void testAddStepInfoToClusterState() throws IOException { diff --git a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java index 76b383cc221f6..af255d6962b63 100644 --- a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java +++ b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java @@ -13,13 +13,9 @@ import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.LifecycleExecutionState; import org.elasticsearch.cluster.metadata.Metadata; -import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.xcontent.ToXContent; -import org.elasticsearch.xcontent.XContentBuilder; -import org.elasticsearch.xcontent.json.JsonXContent; import org.elasticsearch.xpack.core.ilm.ErrorStep; import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata; import org.elasticsearch.xpack.core.ilm.LifecyclePolicy; @@ -98,13 +94,8 @@ public void testExecuteSuccessfullyMoved() throws IOException { assertThat(lifecycleState.actionTime(), nullValue()); assertThat(lifecycleState.stepTime(), equalTo(now)); - XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder(); - causeXContentBuilder.startObject(); - ElasticsearchException.generateThrowableXContent(causeXContentBuilder, ToXContent.EMPTY_PARAMS, cause); - causeXContentBuilder.endObject(); - String expectedCauseValue = BytesReference.bytes(causeXContentBuilder).utf8ToString(); assertThat(lifecycleState.stepInfo(), containsString(""" - {"type":"exception","reason":"THIS IS AN EXPECTED CAUSE","stack_trace":\"""")); + {"type":"exception","reason":"THIS IS AN EXPECTED CAUSE\"""")); } public void testExecuteNoopDifferentStep() throws IOException {