Skip to content

Commit

Permalink
ILM add step retries information to explain api
Browse files Browse the repository at this point in the history
  • Loading branch information
andreidan committed Oct 18, 2019
1 parent 10368db commit 016d1c9
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 18 deletions.
24 changes: 17 additions & 7 deletions docs/reference/ilm/apis/explain.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,11 @@ information for the step that's being performed on the index.

If the index is in the ERROR step, something went wrong while executing a
step in the policy and you will need to take action for the index to proceed
to the next step. To help you diagnose the problem, the explain response shows
the step that failed and the step info provides information about the error.
to the next step. Some steps are safe to retry automatically in certain
circumstances. To help you diagnose the problem, the explain response shows
the step that failed, the step info, which provides information about the
error, and, where applicable, information about the retry attempts executed
for the failed step.

[source,console-result]
--------------------------------------------------
Expand All @@ -262,10 +265,12 @@ the step that failed and the step info provides information about the error.
"step": "ERROR",
"step_time_millis": 1538475653317,
"step_time": "2018-10-15T13:45:22.577Z",
"failed_step": "attempt-rollover", <1>
"step_info": { <2>
"type": "resource_already_exists_exception",
"reason": "index [test-000057/H7lF9n36Rzqa-KfKcnGQMg] already exists",
"failed_step": "check-rollover-ready", <1>
"is_transitive_error": true, <2>
"failed_step_retry_count": 1, <3>
"step_info": { <4>
"type": "cluster_block_exception",
"reason": "index [test-000057/H7lF9n36Rzqa-KfKcnGQMg] blocked by: [FORBIDDEN/5/index read-only (api)];",
"index_uuid": "H7lF9n36Rzqa-KfKcnGQMg",
"index": "test-000057"
},
Expand All @@ -290,4 +295,9 @@ the step that failed and the step info provides information about the error.
// TESTRESPONSE[skip:not possible to get the cluster into this state in a docs test]

<1> The step that caused the error
<2> What went wrong
<2> Indicates whether the error was caused by a transient (temporary)
malfunction. If this is true, ILM will retry the failed step automatically.
<3> Shows the number of retry attempts. ILM stops retrying a failed step after
a configurable number of retries (controlled by the
`index.lifecycle.max_failed_step_retries_count` setting).
<4> What went wrong
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
private static final ParseField ACTION_FIELD = new ParseField("action");
private static final ParseField STEP_FIELD = new ParseField("step");
private static final ParseField FAILED_STEP_FIELD = new ParseField("failed_step");
private static final ParseField IS_TRANSITIVE_ERROR_FIELD = new ParseField("is_transitive_error");
private static final ParseField FAILED_STEP_RETRY_COUNT_FIELD = new ParseField("failed_step_retry_count");
private static final ParseField PHASE_TIME_MILLIS_FIELD = new ParseField("phase_time_millis");
private static final ParseField PHASE_TIME_FIELD = new ParseField("phase_time");
private static final ParseField ACTION_TIME_MILLIS_FIELD = new ParseField("action_time_millis");
Expand All @@ -56,6 +58,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
(String) a[5],
(String) a[6],
(String) a[7],
(Boolean) a[14],
(Integer) a[15],
(Long) (a[8]),
(Long) (a[9]),
(Long) (a[10]),
Expand Down Expand Up @@ -83,6 +87,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> PhaseExecutionInfo.parse(p, ""),
PHASE_EXECUTION_INFO);
PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), AGE_FIELD);
PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), IS_TRANSITIVE_ERROR_FIELD);
PARSER.declareInt(ConstructingObjectParser.optionalConstructorArg(), FAILED_STEP_RETRY_COUNT_FIELD);
}

private final String index;
Expand All @@ -98,21 +104,25 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
private final boolean managedByILM;
private final BytesReference stepInfo;
private final PhaseExecutionInfo phaseExecutionInfo;
private final Boolean isTransitiveError;
private final Integer failedStepRetryCount;

public static IndexLifecycleExplainResponse newManagedIndexResponse(String index, String policyName, Long lifecycleDate,
String phase, String action, String step, String failedStep, Long phaseTime, Long actionTime, Long stepTime,
BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
return new IndexLifecycleExplainResponse(index, true, policyName, lifecycleDate, phase, action, step, failedStep, phaseTime,
actionTime, stepTime, stepInfo, phaseExecutionInfo);
String phase, String action, String step, String failedStep, Boolean isTransitiveError, Integer failedStepRetryCount,
Long phaseTime, Long actionTime, Long stepTime, BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
return new IndexLifecycleExplainResponse(index, true, policyName, lifecycleDate, phase, action, step, failedStep,
isTransitiveError, failedStepRetryCount, phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
}

public static IndexLifecycleExplainResponse newUnmanagedIndexResponse(String index) {
return new IndexLifecycleExplainResponse(index, false, null, null, null, null, null, null, null, null, null, null, null);
return new IndexLifecycleExplainResponse(index, false, null, null, null, null, null, null, null, null, null, null, null, null,
null);
}

private IndexLifecycleExplainResponse(String index, boolean managedByILM, String policyName, Long lifecycleDate,
String phase, String action, String step, String failedStep, Long phaseTime, Long actionTime,
Long stepTime, BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
String phase, String action, String step, String failedStep, Boolean isTransitiveError,
Integer failedStepRetryCount, Long phaseTime, Long actionTime, Long stepTime,
BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
if (managedByILM) {
if (policyName == null) {
throw new IllegalArgumentException("[" + POLICY_NAME_FIELD.getPreferredName() + "] cannot be null for managed index");
Expand Down Expand Up @@ -143,6 +153,8 @@ private IndexLifecycleExplainResponse(String index, boolean managedByILM, String
this.actionTime = actionTime;
this.stepTime = stepTime;
this.failedStep = failedStep;
this.isTransitiveError = isTransitiveError;
this.failedStepRetryCount = failedStepRetryCount;
this.stepInfo = stepInfo;
this.phaseExecutionInfo = phaseExecutionInfo;
}
Expand All @@ -162,13 +174,17 @@ public IndexLifecycleExplainResponse(StreamInput in) throws IOException {
stepTime = in.readOptionalLong();
stepInfo = in.readOptionalBytesReference();
phaseExecutionInfo = in.readOptionalWriteable(PhaseExecutionInfo::new);
isTransitiveError = in.readOptionalBoolean();
failedStepRetryCount = in.readOptionalVInt();
} else {
policyName = null;
lifecycleDate = null;
phase = null;
action = null;
step = null;
failedStep = null;
isTransitiveError = null;
failedStepRetryCount = null;
phaseTime = null;
actionTime = null;
stepTime = null;
Expand All @@ -193,6 +209,8 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalLong(stepTime);
out.writeOptionalBytesReference(stepInfo);
out.writeOptionalWriteable(phaseExecutionInfo);
out.writeOptionalBoolean(isTransitiveError);
out.writeOptionalVInt(failedStepRetryCount);
}
}

Expand Down Expand Up @@ -248,6 +266,14 @@ public PhaseExecutionInfo getPhaseExecutionInfo() {
return phaseExecutionInfo;
}

/**
 * Returns the flag that is serialised under the {@code is_transitive_error} field.
 *
 * @return the stored flag, or {@code null} when no value was supplied for this response
 */
public Boolean isTransitiveError() {
    return this.isTransitiveError;
}

/**
 * Returns the value that is serialised under the {@code failed_step_retry_count} field.
 *
 * @return the stored retry count, or {@code null} when no value was supplied for this response
 */
public Integer getFailedStepRetryCount() {
    return this.failedStepRetryCount;
}

public TimeValue getAge() {
if (lifecycleDate == null) {
return TimeValue.MINUS_ONE;
Expand Down Expand Up @@ -288,6 +314,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
if (Strings.hasLength(failedStep)) {
builder.field(FAILED_STEP_FIELD.getPreferredName(), failedStep);
}
if (isTransitiveError != null) {
builder.field(IS_TRANSITIVE_ERROR_FIELD.getPreferredName(), isTransitiveError);
}
if(failedStepRetryCount != null) {
builder.field(FAILED_STEP_RETRY_COUNT_FIELD.getPreferredName(), failedStepRetryCount);
}
if (stepInfo != null && stepInfo.length() > 0) {
builder.rawField(STEP_INFO_FIELD.getPreferredName(), stepInfo.streamInput(), XContentType.JSON);
}
Expand All @@ -301,8 +333,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws

@Override
public int hashCode() {
return Objects.hash(index, managedByILM, policyName, lifecycleDate, phase, action, step, failedStep, phaseTime, actionTime,
stepTime, stepInfo, phaseExecutionInfo);
return Objects.hash(index, managedByILM, policyName, lifecycleDate, phase, action, step, failedStep, isTransitiveError,
failedStepRetryCount, phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
}

@Override
Expand All @@ -322,6 +354,8 @@ public boolean equals(Object obj) {
Objects.equals(action, other.action) &&
Objects.equals(step, other.step) &&
Objects.equals(failedStep, other.failedStep) &&
Objects.equals(isTransitiveError, other.isTransitiveError) &&
Objects.equals(failedStepRetryCount, other.failedStepRetryCount) &&
Objects.equals(phaseTime, other.phaseTime) &&
Objects.equals(actionTime, other.actionTime) &&
Objects.equals(stepTime, other.stepTime) &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ private static IndexLifecycleExplainResponse randomManagedIndexExplainResponse()
stepNull ? null : randomAlphaOfLength(10),
stepNull ? null : randomAlphaOfLength(10),
randomBoolean() ? null : randomAlphaOfLength(10),
stepNull ? null : randomBoolean(),
stepNull ? null : randomInt(15),
stepNull ? null : randomNonNegativeLong(),
stepNull ? null : randomNonNegativeLong(),
stepNull ? null : randomNonNegativeLong(),
Expand All @@ -69,6 +71,8 @@ public void testInvalidStepDetails() {
(numNull == 2) ? null : randomAlphaOfLength(10),
(numNull == 3) ? null : randomAlphaOfLength(10),
randomBoolean() ? null : randomAlphaOfLength(10),
randomBoolean() ? null : randomBoolean(),
randomBoolean() ? null : randomInt(15),
randomBoolean() ? null : randomNonNegativeLong(),
randomBoolean() ? null : randomNonNegativeLong(),
randomBoolean() ? null : randomNonNegativeLong(),
Expand Down Expand Up @@ -106,6 +110,8 @@ protected IndexLifecycleExplainResponse mutateInstance(IndexLifecycleExplainResp
String action = instance.getAction();
String step = instance.getStep();
String failedStep = instance.getFailedStep();
Boolean isTransitiveError = instance.isTransitiveError();
Integer failedStepRetryCount = instance.getFailedStepRetryCount();
Long policyTime = instance.getLifecycleDate();
Long phaseTime = instance.getPhaseTime();
Long actionTime = instance.getActionTime();
Expand All @@ -114,7 +120,7 @@ protected IndexLifecycleExplainResponse mutateInstance(IndexLifecycleExplainResp
BytesReference stepInfo = instance.getStepInfo();
PhaseExecutionInfo phaseExecutionInfo = instance.getPhaseExecutionInfo();
if (managed) {
switch (between(0, 10)) {
switch (between(0, 11)) {
case 0:
index = index + randomAlphaOfLengthBetween(1, 5);
break;
Expand Down Expand Up @@ -162,11 +168,15 @@ protected IndexLifecycleExplainResponse mutateInstance(IndexLifecycleExplainResp
break;
case 10:
return IndexLifecycleExplainResponse.newUnmanagedIndexResponse(index);
case 11:
isTransitiveError = true;
failedStepRetryCount = randomInt(13);
break;
default:
throw new AssertionError("Illegal randomisation branch");
}
return IndexLifecycleExplainResponse.newManagedIndexResponse(index, policy, policyTime, phase, action, step, failedStep,
phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
isTransitiveError, failedStepRetryCount, phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
} else {
switch (between(0, 1)) {
case 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasKey;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.nullValue;

Expand Down Expand Up @@ -881,8 +882,14 @@ public void testExplainFilters() throws Exception {
assertNotNull(onlyErrorsResponse);
assertThat(onlyErrorsResponse, allOf(hasKey(errorIndex), hasKey(nonexistantPolicyIndex)));
assertThat(onlyErrorsResponse, allOf(not(hasKey(goodIndex)), not(hasKey(unmanagedIndex))));

Map<String, Object> errorIndexResponse = onlyErrorsResponse.get(errorIndex);
assertThat(errorIndex + "should've had the rollover step retried once",
errorIndexResponse.get("failed_step_retry_count"), is(1));
assertThat(errorIndexResponse.get("is_transitive_error"), is(true));
});
}

public void testILMRolloverRetriesOnReadOnlyBlock() throws Exception {
String firstIndex = index + "-000001";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ protected void doMasterOperation(ExplainLifecycleRequest request, String[] concr
lifecycleState.getAction(),
lifecycleState.getStep(),
lifecycleState.getFailedStep(),
lifecycleState.isTransitiveError(),
lifecycleState.getFailedStepRetryCount(),
lifecycleState.getPhaseTime(),
lifecycleState.getActionTime(),
lifecycleState.getStepTime(),
Expand Down

0 comments on commit 016d1c9

Please sign in to comment.