Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: #242 Explicit terminal and retry exceptions for cleaner logging and poison pills #291

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package io.confluent.parallelconsumer;

/*-
* Copyright (C) 2020-2022 Confluent, Inc.
*/

/**
 * Thrown by a user's processing function to tell PC that processing of the message has failed and that the message
 * should be retried at a later time.
 * <p>
 * Throwing this type explicitly keeps the logs quiet: PC will not log an ERROR for it, it is only recorded at DEBUG
 * level. Any other type of exception thrown by the user's function is logged at a more severe level (and the message
 * is still retried later).
 */
public class PCRetriableException extends RuntimeException {

    /**
     * @param cause the underlying failure that makes a retry necessary
     */
    public PCRetriableException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of why processing should be retried
     */
    public PCRetriableException(String message) {
        super(message);
    }

    /**
     * @param message description of why processing should be retried
     * @param cause   the underlying failure that makes a retry necessary
     */
    public PCRetriableException(String message, Throwable cause) {
        super(message, cause);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package io.confluent.parallelconsumer;

/*-
* Copyright (C) 2020-2022 Confluent, Inc.
*/

/**
* A user's processing function can throw this exception, which signals to PC that processing of the message has failed
* terminally - that is, in a way that retrying will not fix.
* <p>
* How PC reacts is controlled by the {@link ParallelConsumerOptions.TerminalFailureReaction} setting in
* {@link ParallelConsumerOptions}: with {@code SKIP}, a warning is logged and the record is skipped as if it had
* succeeded; with {@code SHUTDOWN}, an error is logged and PC shuts down.
* <p>
* To have a record retried at a later time instead, throw {@link PCRetriableException}.
*/
Comment on lines +7 to +16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update java doc

public class PCTerminalException extends RuntimeException {

    /**
     * @param cause the underlying failure that makes further processing of the record pointless
     */
    public PCTerminalException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the terminal failure
     */
    public PCTerminalException(String message) {
        super(message);
    }

    /**
     * @param message description of the terminal failure
     * @param cause   the underlying failure that makes further processing of the record pointless
     */
    public PCTerminalException(String message, Throwable cause) {
        super(message, cause);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,14 @@ public boolean isUsingBatching() {
@Builder.Default
private final int maxFailureHistory = 10;

/**
* How PC reacts when the user's function throws a {@link PCTerminalException}.
*/
// NOTE(review): unlike maxFailureHistory above, this field has no @Builder.Default, so it is presumably null unless
// the builder sets it explicitly. A value that is neither SKIP nor SHUTDOWN is reported as an unsupported reaction
// when a terminal failure is handled - confirm, and consider giving this a default.
private final TerminalFailureReaction terminalFailureReaction;

/**
* The available reactions to a terminal failure in the user's function.
*/
public enum TerminalFailureReaction {
/**
* Log an error and shut PC down.
*/
SHUTDOWN,
/**
* Log a warning and skip the record, as if it had been processed successfully.
*/
SKIP,
// DLQ, TODO
}

/**
* @return the combined target of the desired concurrency by the configured batch size
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
*/

import io.confluent.csid.utils.TimeUtils;
import io.confluent.parallelconsumer.ParallelConsumer;
import io.confluent.parallelconsumer.ParallelConsumerOptions;
import io.confluent.parallelconsumer.PollContextInternal;
import io.confluent.parallelconsumer.*;
import io.confluent.parallelconsumer.state.WorkContainer;
import io.confluent.parallelconsumer.state.WorkManager;
import lombok.*;
Expand Down Expand Up @@ -39,12 +37,16 @@
import static io.confluent.csid.utils.BackportUtils.isEmpty;
import static io.confluent.csid.utils.BackportUtils.toSeconds;
import static io.confluent.csid.utils.StringUtils.msg;
import static io.confluent.parallelconsumer.ParallelConsumerOptions.TerminalFailureReaction.SHUTDOWN;
import static io.confluent.parallelconsumer.ParallelConsumerOptions.TerminalFailureReaction.SKIP;
import static io.confluent.parallelconsumer.internal.State.*;
import static java.time.Duration.ofMillis;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static lombok.AccessLevel.PRIVATE;
import static lombok.AccessLevel.PROTECTED;
import static org.slf4j.event.Level.DEBUG;
import static org.slf4j.event.Level.WARN;

/**
* @see ParallelConsumer
Expand Down Expand Up @@ -1078,11 +1080,11 @@ private void updateLastCommitCheckTime() {
/**
* Run the supplied function.
*/
// todo extract class from this point
protected <R> List<ParallelConsumer.Tuple<ConsumerRecord<K, V>, R>> runUserFunction(Function<PollContextInternal<K, V>, List<R>> usersFunction,
Consumer<R> callback,
List<WorkContainer<K, V>> workContainerBatch) {
// call the user's function
List<R> resultsFromUserFunction;
// catch and process any internal error
try {
if (log.isDebugEnabled()) {
// first offset of the batch
Expand All @@ -1099,37 +1101,101 @@ protected <R> List<ParallelConsumer.Tuple<ConsumerRecord<K, V>, R>> runUserFunct
}

PollContextInternal<K, V> context = new PollContextInternal<>(workContainerBatch);
resultsFromUserFunction = usersFunction.apply(context);

for (final WorkContainer<K, V> kvWorkContainer : workContainerBatch) {
onUserFunctionSuccess(kvWorkContainer, resultsFromUserFunction);
}
List<R> resultsFromUserFunction = runUserFunction(usersFunction, context);

// capture each result, against the input record
var intermediateResults = new ArrayList<Tuple<ConsumerRecord<K, V>, R>>();
for (R result : resultsFromUserFunction) {
log.trace("Running users call back...");
callback.accept(result);
}
return handleUserSuccess(callback, workContainerBatch, resultsFromUserFunction);
} catch (Exception e) {
handleUserRetriableFailure(workContainerBatch, e);

// fail or succeed, either way we're done
for (var kvWorkContainer : workContainerBatch) {
addToMailBoxOnUserFunctionSuccess(kvWorkContainer, resultsFromUserFunction);
}
log.trace("User function future registered");
// throw again to make the future failed
throw e;
}
}

return intermediateResults;
} catch (Exception e) {
// handle fail
log.error("Exception caught in user function running stage, registering WC as failed, returning to mailbox", e);
for (var wc : workContainerBatch) {
wc.onUserFunctionFailure(e);
addToMailbox(wc); // always add on error
/**
 * Completes a batch for which the user function returned normally: registers the success on every work container,
 * passes each result to the callback, and finally hands every work container back to the mailbox.
 *
 * @param callback                invoked once per result returned by the user function
 * @param workContainerBatch      the records that were passed to the user function
 * @param resultsFromUserFunction the results the user function returned for the batch
 * @return the list of intermediate results
 */
private <R> ArrayList<Tuple<ConsumerRecord<K, V>, R>> handleUserSuccess(Consumer<R> callback, List<WorkContainer<K, V>> workContainerBatch, List<R> resultsFromUserFunction) {
    workContainerBatch.forEach(succeeded -> onUserFunctionSuccess(succeeded, resultsFromUserFunction));

    // capture each result, against the input record
    // NOTE(review): nothing in this method adds to the list, so it is returned empty - confirm that is intended
    var intermediateResults = new ArrayList<Tuple<ConsumerRecord<K, V>, R>>();
    resultsFromUserFunction.forEach(userResult -> {
        log.trace("Running users call back...");
        callback.accept(userResult);
    });

    // fail or succeed, either way we're done
    workContainerBatch.forEach(finished -> addToMailBoxOnUserFunctionSuccess(finished, resultsFromUserFunction));
    log.trace("User function future registered");
    return intermediateResults;
}

private <R> List<R> runUserFunction(Function<PollContextInternal<K, V>, List<R>> usersFunction,
PollContextInternal<K, V> context) {
try {
return usersFunction.apply(context);
} catch (PCTerminalException e) {
var reaction = getOptions().getTerminalFailureReaction();

if (reaction == SKIP) {
log.warn("Terminal error in user function, skipping record due to configuration in {} - triggering context: {}",
ParallelConsumerOptions.class.getSimpleName(),
context);

// return empty result to cause system to skip as if it succeeded
return new ArrayList<>();
} else if (reaction == SHUTDOWN) {
log.error("Shutting down upon terminal failure in user function due to {} setting in {} - triggering context: {}",
ParallelConsumerOptions.TerminalFailureReaction.class.getSimpleName(),
ParallelConsumerOptions.class.getSimpleName(),
context);

closeDontDrainFirst();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it shutdown gracefully in this case? i.e. commit all that succeeded by this point, maybe should give some time for inflight processes as well (if it doesnt already) ? - to reduce possible duplicates on restart / rebalance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, it is graceful - closeDontDrainFirst will commit everything that's done first - drain isn't graceful, it means process everything in buffers.
Inflight - good point, not 100% will need to double check.


// throw again to make the future failed
throw e;
} else {
throw new InternalRuntimeError(msg("Unsupported reaction config ({}) - submit a bug report.", reaction));
}
throw e; // trow again to make the future failed
} catch (Exception e) {
log.error("Unknown internal error handling user function dispatch, terminating");

closeDontDrainFirst();

// throw again to make the future failed
throw e;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is current behaviour for user function exceptions ? is it a breaking change in behaviour?
I think i am missing the flow here - wouldn't this just kill PC and not retry the message?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

current = retry :)

wouldn't this just kill PC and not retry the message?

yeah there was a bug - check latest version?

}

/**
* Handles an exception that escaped the user function stage: logs it via {@link #logUserFunctionException} and then
* registers every record of the batch as failed via {@link #markRecordsFailed}.
*
* @param workContainerBatch the records that were being processed when the exception was thrown
* @param e                  the exception that was caught
*/
private void handleUserRetriableFailure(List<WorkContainer<K, V>> workContainerBatch, Exception e) {
logUserFunctionException(e);
markRecordsFailed(workContainerBatch, e);
}

/**
 * Registers the failure on every work container of the batch and hands each one back to the mailbox.
 *
 * @param workContainerBatch the records whose processing failed
 * @param e                  the exception to register against each record
 */
private void markRecordsFailed(List<WorkContainer<K, V>> workContainerBatch, Exception e) {
    workContainerBatch.forEach(failed -> {
        failed.onUserFunctionFailure(e);
        // always add on error
        addToMailbox(failed);
    });
}

/**
 * Logs an exception thrown from the user function before the affected records are registered as failed.
 * <p>
 * If the user explicitly throws the {@link PCRetriableException}, then it is only logged at DEBUG level, as the user
 * is already aware. Any other exception is logged as an ERROR, matching the contract documented on
 * {@link PCRetriableException}.
 * <p>
 * <a href="https://english.stackexchange.com/questions/305273/retriable-or-retryable#305274">Retriable or
 * Retryable?</a> Kafka uses Retriable, so we'll go with that ;)
 *
 * @param e the exception caught from the user function
 */
private void logUserFunctionException(Exception e) {
    boolean explicitlyRetryable = e instanceof PCRetriableException;
    // fully qualified, as only DEBUG and WARN are statically imported in this file
    var level = explicitlyRetryable ? DEBUG : org.slf4j.event.Level.ERROR;
    var prefix = explicitlyRetryable ? "Explicit " + PCRetriableException.class.getSimpleName() + " caught - " : "";
    var message = prefix + "Exception in user function, registering record as failed, returning to queue";
    // attach the throwable explicitly: the message has no placeholder, so it must not be passed as a format argument
    log.atLevel(level).setCause(e).log(message);
}

/**
* Hands a work container back to the mailbox after the user function completed without throwing.
* <p>
* The results are not used by this implementation; presumably the parameter exists so that overriding subclasses can
* make use of them - confirm.
*
* @param wc                      the work container to add to the mailbox
* @param resultsFromUserFunction the results returned by the user function (unused here)
*/
protected void addToMailBoxOnUserFunctionSuccess(WorkContainer<K, V> wc, List<?> resultsFromUserFunction) {
addToMailbox(wc);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package io.confluent.parallelconsumer;

/*-
* Copyright (C) 2020-2021 Confluent, Inc.
* Copyright (C) 2020-2022 Confluent, Inc.
*/

/**
* Used for testing error handling - easier to identify than a plain exception.
*/
public class FakeRuntimeError extends RuntimeException {
public class FakeRuntimeError extends PCRetriableException {
public FakeRuntimeError(String msg) {
super(msg);
}
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
<surefire.version>3.0.0-M6</surefire.version>

<!-- core -->
<slf4j.version>1.7.36</slf4j.version>
<slf4j.version>2.0.0-alpha7</slf4j.version>
<kafka.version>3.1.0</kafka.version>
<version.unij>0.1.3</version.unij>

Expand Down