From cfe8feee7c5e180bd2671d15330bc46e228ea384 Mon Sep 17 00:00:00 2001 From: Jack McCluskey <34928439+jrmccluskey@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:03:03 -0400 Subject: [PATCH] Improve BatchElements documentation (#32082) * Improve BatchElements documentation * Add link to new documentation * Update sdks/python/apache_beam/transforms/util.py Co-authored-by: Jonathan Sabbagh <108473809+jbsabbagh@users.noreply.github.com> * linting * Apply suggestions from code review Co-authored-by: tvalentyn * line-too-long * Update sdks/python/apache_beam/transforms/util.py --------- Co-authored-by: Jonathan Sabbagh <108473809+jbsabbagh@users.noreply.github.com> Co-authored-by: tvalentyn --- sdks/python/apache_beam/transforms/util.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sdks/python/apache_beam/transforms/util.py b/sdks/python/apache_beam/transforms/util.py index 750d98f0789c..a27c7aca9e20 100644 --- a/sdks/python/apache_beam/transforms/util.py +++ b/sdks/python/apache_beam/transforms/util.py @@ -802,6 +802,20 @@ class BatchElements(PTransform): corresponding to its contents. Each batch is emitted with a timestamp at the end of their window. + When the max_batch_duration_secs arg is provided, a stateful implementation + of BatchElements is used to batch elements across bundles. This is most + impactful in streaming applications where many bundles only contain one + element. Larger max_batch_duration_secs values might reduce the throughput + of the transform, while smaller values might improve the throughput but + make it more likely that batches are smaller than the target batch size. + + As a general recommendation, start with low values (e.g. 0.005, i.e. 5 ms) and + increase as needed to get the desired tradeoff between target batch size + and latency or throughput. 
+ + For more information on tuning parameters to this transform, see + https://beam.apache.org/documentation/patterns/batch-elements + Args: min_batch_size: (optional) the smallest size of a batch max_batch_size: (optional) the largest size of a batch