Fix Kafka with Redistribute #32344

Merged 5 commits on Sep 17, 2024 (changes shown from 1 commit)
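
For orientation, a minimal sketch of the kind of read configuration these changes affect. The broker, topic, deserializers, and key count are illustrative placeholders, not values from this PR; the redistribute-related methods are the ones touched by the diff below.

import com.google.common.collect.ImmutableMap;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.IntegerDeserializer;
import org.apache.kafka.common.serialization.LongDeserializer;

class RedistributeReadSketch {
  // Sketch only: after this change, committing offsets together with withRedistribute()
  // triggers the "may not capture all work" warning only when allowDuplicates is enabled.
  static KafkaIO.Read<Integer, Long> read() {
    return KafkaIO.<Integer, Long>read()
        .withBootstrapServers("localhost:9092") // placeholder broker
        .withTopic("my_topic") // placeholder topic
        .withKeyDeserializer(IntegerDeserializer.class)
        .withValueDeserializer(LongDeserializer.class)
        .withRedistribute()
        .withRedistributeNumKeys(32)
        .withAllowDuplicates(false) // with false, offset commits no longer warn
        .commitOffsetsInFinalize()
        .withConsumerConfigUpdates(
            ImmutableMap.of(ConsumerConfig.GROUP_ID_CONFIG, "group_id"));
  }
}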
@@ -1696,7 +1696,7 @@ public PCollection<KafkaRecord<K, V>> expand(PBegin input) {
}

if (kafkaRead.isRedistributed()) {
if (kafkaRead.isCommitOffsetsInFinalizeEnabled()) {
if (kafkaRead.isCommitOffsetsInFinalizeEnabled() && kafkaRead.isAllowDuplicates()) {
LOG.warn(
"Offsets committed due to usage of commitOffsetsInFinalize() may not capture all work processed due to use of withRedistribute()");
Review comment (Contributor):
update the log to reflect allow duplicates.

Reply (Contributor, author):
Done.
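
For reference, a sketch of how the reworded warning might look after that request; the actual wording landed in a later commit and is not shown in this diff, so the text below is an assumption.

// Hypothetical rewording, not the committed text:
LOG.warn(
    "Offsets committed due to usage of commitOffsetsInFinalize() may not capture all work processed "
        + "due to use of withRedistribute() with allowDuplicates enabled");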

}
@@ -1797,7 +1797,7 @@ public PCollection<KafkaRecord<K, V>> expand(PBegin input) {
return pcol.apply(
"Insert Redistribute with Shards",
Redistribute.<KafkaRecord<K, V>>arbitrarily()
.withAllowDuplicates(true)
.withAllowDuplicates(kafkaRead.isAllowDuplicates())
.withNumBuckets((int) kafkaRead.getRedistributeNumKeys()));
}
}
@@ -2654,10 +2654,10 @@ public PCollection<KafkaRecord<K, V>> expand(PCollection<KafkaSourceDescriptor>
if (getRedistributeNumKeys() == 0) {
LOG.warn("This will create a key per record, which is sub-optimal for most use cases.");
}
if (isCommitOffsetEnabled() || configuredKafkaCommit()) {
if ((isCommitOffsetEnabled() || configuredKafkaCommit()) && isAllowDuplicates()) {
LOG.warn(
"Either auto_commit is set, or commitOffsetEnabled is enabled (or both), but since "
+ "withRestribute() is enabled, the runner may have additional work processed that "
+ "withRestribute() is enabled with allow duplicates, the runner may have additional work processed that "
+ "is ahead of the current checkpoint");
}
}
@@ -2717,7 +2717,7 @@ public PCollection<KafkaRecord<K, V>> expand(PCollection<KafkaSourceDescriptor>
< 0) {
// Redistribute is not allowed with commits prior to 2.59.0, since there is a Reshuffle
// prior to the redistribute. The reshuffle will occur before commits are offsetted and
// before outputting KafkaRecords. Adding a redistrube then afterwards doesn't provide
// before outputting KafkaRecords. Adding a redistribute then afterwards doesn't provide
// additional performance benefit.
checkArgument(
!isRedistribute(),
@@ -108,7 +108,13 @@ private PipelineResult testReadTransformCreationWithImplementationBoundPropertie
Function<KafkaIO.Read<Integer, Long>, KafkaIO.Read<Integer, Long>> kafkaReadDecorator) {
p.apply(
kafkaReadDecorator.apply(
mkKafkaReadTransform(1000, null, new ValueAsTimestampFn(), false, 0)));
mkKafkaReadTransform(
1000,
null,
new ValueAsTimestampFn(),
false, /*redistribute*/
false, /*allowDuplicates*/
0)));
return p.run();
}

@@ -382,7 +382,13 @@ public Consumer<byte[], byte[]> apply(Map<String, Object> config) {

static KafkaIO.Read<Integer, Long> mkKafkaReadTransform(
int numElements, @Nullable SerializableFunction<KV<Integer, Long>, Instant> timestampFn) {
return mkKafkaReadTransform(numElements, numElements, timestampFn, false, 0);
return mkKafkaReadTransform(
numElements,
numElements,
timestampFn,
false, /*redistribute*/
false, /*allowDuplicates*/
0);
}

/**
@@ -394,6 +400,7 @@ static KafkaIO.Read<Integer, Long> mkKafkaReadTransform(
@Nullable Integer maxNumRecords,
@Nullable SerializableFunction<KV<Integer, Long>, Instant> timestampFn,
@Nullable Boolean redistribute,
@Nullable Boolean withAllowDuplicates,
@Nullable Integer numKeys) {

KafkaIO.Read<Integer, Long> reader =
@@ -409,13 +416,21 @@ static KafkaIO.Read<Integer, Long> mkKafkaReadTransform(
reader = reader.withMaxNumRecords(maxNumRecords);
}

if (withAllowDuplicates == null) {
withAllowDuplicates = false;
}

if (timestampFn != null) {
reader = reader.withTimestampFn(timestampFn);
}

if (redistribute) {
if (numKeys != null) {
reader = reader.withRedistribute().withRedistributeNumKeys(numKeys);
reader =
reader
.withRedistribute()
.withAllowDuplicates(withAllowDuplicates)
.withRedistributeNumKeys(numKeys);
}
reader = reader.withRedistribute();
}
@@ -629,12 +644,18 @@ public void testRiskyConfigurationWarnsProperly() {
}

@Test
public void testCommitOffsetsInFinalizeAndRedistributeWarnings() {
public void testCommitOffsetsInFinalizeAndRedistributeWarningsWithAllowDuplicates() {
int numElements = 1000;

PCollection<Long> input =
p.apply(
mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn(), true, 0)
mkKafkaReadTransform(
numElements,
numElements,
new ValueAsTimestampFn(),
true, /*redistribute*/
true, /*allowDuplicates*/
0)
.commitOffsetsInFinalize()
.withConsumerConfigUpdates(
ImmutableMap.of(ConsumerConfig.GROUP_ID_CONFIG, "group_id"))
@@ -648,6 +669,29 @@ public void testCommitOffsetsInFinalizeAndRedistributeWarnings() {
"Offsets committed due to usage of commitOffsetsInFinalize() may not capture all work processed due to use of withRedistribute()");
}

@Test
public void testCommitOffsetsInFinalizeAndRedistributeNoWarningsWithAllowDuplicates() {
Review comment (Contributor):
seems like it should be NoWarningsWithNoAllowDuplicates

Reply (Contributor, author):
I do want to mention commits are enabled, since if we do not commit offsets, enabling allow duplicates also has no warning.
Updated the test name to NoWarningsWithNoAllowDuplicatesAndCommitOffsets, and updated the other test name so it's clearer that the two are testing behaviour with and without allowDuplicates=true.

int numElements = 1000;

PCollection<Long> input =
p.apply(
mkKafkaReadTransform(
numElements,
numElements,
new ValueAsTimestampFn(),
true, /*redistribute*/
false, /*allowDuplicates*/
0)
.commitOffsetsInFinalize()
.withConsumerConfigUpdates(
ImmutableMap.of(ConsumerConfig.GROUP_ID_CONFIG, "group_id"))
.withoutMetadata())
.apply(Values.create());

addCountingAsserts(input, numElements);
p.run();
}
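
A compact restatement of the warning condition exercised by the two tests above, derived from the expand(PBegin) changes earlier in this diff (a sketch, not a verbatim excerpt of the IO code):

// Returns true exactly when the "offsets may not capture all work" warning is logged:
// redistribute, offset commits in finalize, and allowDuplicates must all be enabled.
static boolean warnsAboutCommittedOffsets(
    boolean redistributed, boolean commitOffsetsInFinalize, boolean allowDuplicates) {
  return redistributed && commitOffsetsInFinalize && allowDuplicates;
}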

@Test
public void testNumKeysIgnoredWithRedistributeNotEnabled() {
thrown.expect(Exception.class);
@@ -658,7 +702,13 @@ public void testNumKeysIgnoredWithRedistributeNotEnabled() {

PCollection<Long> input =
p.apply(
mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn(), false, 0)
mkKafkaReadTransform(
numElements,
numElements,
new ValueAsTimestampFn(),
false, /*redistribute*/
false, /*allowDuplicates*/
0)
.withRedistributeNumKeys(100)
.commitOffsetsInFinalize()
.withConsumerConfigUpdates(
@@ -2016,7 +2066,13 @@ public void testUnboundedSourceStartReadTime() {

PCollection<Long> input =
p.apply(
mkKafkaReadTransform(numElements, maxNumRecords, new ValueAsTimestampFn(), false, 0)
mkKafkaReadTransform(
numElements,
maxNumRecords,
new ValueAsTimestampFn(),
false, /*redistribute*/
false, /*allowDuplicates*/
0)
.withStartReadTime(new Instant(startTime))
.withoutMetadata())
.apply(Values.create());
@@ -2040,7 +2096,13 @@ public void testUnboundedSourceStartReadTimeException() {
int startTime = numElements / 20;

p.apply(
mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn(), false, 0)
mkKafkaReadTransform(
numElements,
numElements,
new ValueAsTimestampFn(),
false, /*redistribute*/
false, /*allowDuplicates*/
0)
.withStartReadTime(new Instant(startTime))
.withoutMetadata())
.apply(Values.create());