diff --git a/.test-infra/jenkins/job_PreCommit_Java_InfluxDb_IO_Direct.groovy b/.test-infra/jenkins/job_PreCommit_Java_InfluxDb_IO_Direct.groovy index cd2a0bb52929..86efaf8a1edd 100644 --- a/.test-infra/jenkins/job_PreCommit_Java_InfluxDb_IO_Direct.groovy +++ b/.test-infra/jenkins/job_PreCommit_Java_InfluxDb_IO_Direct.groovy @@ -16,28 +16,48 @@ * limitations under the License. */ -import PrecommitJobBuilder +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.InternalTimeServiceManager; +import org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl; +import org.apache.flink.streaming.api.operators.sorted.state.BatchExecutionInternalTimeServiceManager; -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Java_InfluxDb_IO_Direct', - gradleTasks: [ - ':sdks:java:io:influxdb:build', - ], - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true' - ], // spotless checked in separate pre-commit - triggerPathPatterns: [ - '^sdks/java/core/src/main/.*$', - '^sdks/java/io/common/.*$', - '^sdks/java/io/influxdb/.*$', - ], - timeoutMins: 60, - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') +/** Compatibility layer for {@link AbstractStreamOperator} breaking changes. */ +public abstract class AbstractStreamOperatorCompat + extends AbstractStreamOperator { + + /** + * Getter for timeServiceManager, which has been made private in Flink 1.11. + * + * @return Time service manager. + */ + protected InternalTimeServiceManager getTimeServiceManagerCompat() { + return getTimeServiceManager() + .orElseThrow(() -> new IllegalStateException("Time service manager is not set.")); + } + + /** + * This call has been removed from {@link AbstractStreamOperator} in Flink 1.12. + * + *

{@link InternalTimeServiceManagerImpl#numProcessingTimeTimers()} + */ + protected int numProcessingTimeTimers() { + return getTimeServiceManager() + .map( + manager -> { + InternalTimeServiceManager tsm = getTimeServiceManagerCompat(); + if (tsm instanceof InternalTimeServiceManagerImpl) { + final InternalTimeServiceManagerImpl cast = + (InternalTimeServiceManagerImpl) getTimeServiceManagerCompat(); + return cast.numProcessingTimeTimers(); + } else if (tsm instanceof BatchExecutionInternalTimeServiceManager) { + return 0; + } else { + throw new IllegalStateException( + String.format( + "Unknown implementation of InternalTimerServiceManager. %s", tsm)); + } + }) + .orElse(0); } /** Release all of the operator's resources. */ diff --git a/CHANGES.md b/CHANGES.md index 42332557caeb..62f746f7c2cd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -56,9 +56,317 @@ * Added FlinkRunner for Flink 1.18. ([#30789](https://github.com/apache/beam/issues/30789)) * New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)). * New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)). +* Previously deprecated Avro-dependent code (Beam Release 2.46.0) has been finally removed from Java SDK "core" package. +Please, use `beam-sdks-java-extensions-avro` instead. This will allow to easily update Avro version in user code without +potential breaking changes in Beam "core" since the Beam Avro extension already supports the latest Avro versions and +should handle this. ([#25252](https://github.com/apache/beam/issues/25252)). +* Publishing Java 21 SDK container images now supported as part of Apache Beam release process. ([#28120](https://github.com/apache/beam/issues/28120)) + * Direct Runner and Dataflow Runner V2 support running pipeline on Java21. Support for other runners are planned in upcoming versions. ## I/Os +* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). + +## New Features / Improvements + +* Add `UseDataStreamForBatch` pipeline option to the Flink runner. When it is set to true, Flink runner will run batch + jobs using the DataStream API. By default the option is set to false, so the batch jobs are still executed + using the DataSet API. +* `upload_graph` as one of the Experiments options for DataflowRunner is no longer required when the graph is larger than 10MB for Java SDK ([PR#28621](https://github.com/apache/beam/pull/28621). + +## Breaking Changes + +* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). +* `org.apache.beam.sdk.io.CountingSource.CounterMark` uses custom `CounterMarkCoder` as a default coder since all Avro-dependent +classes finally moved to `extensions/avro`. In case if it's still required to use `AvroCoder` for `CounterMark`, then, +as a workaround, a copy of "old" `CountingSource` class should be placed into a project code and used directly +([#25252](https://github.com/apache/beam/issues/25252)). + +## Deprecations + +* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)). + +## Bugfixes + +* Fixed "Desired bundle size 0 bytes must be greater than 0" in Java SDK's BigtableIO.BigtableSource when you have more cores than bytes to read (Java) [#28793](https://github.com/apache/beam/issues/28793). +* `watch_file_pattern` arg of the [RunInference](https://github.com/apache/beam/blob/104c10b3ee536a9a3ea52b4dbf62d86b669da5d9/sdks/python/apache_beam/ml/inference/base.py#L997) arg had no effect prior to 2.52.0. To use the behavior of arg `watch_file_pattern` prior to 2.52.0, follow the documentation at https://beam.apache.org/documentation/ml/side-input-updates/ and use `WatchFilePattern` PTransform as a SideInput. ([#28948](https://github.com/apache/beam/pulls/28948)) +* `MLTransform` doesn't output artifacts such as min, max and quantiles. Instead, `MLTransform` will add a feature to output these artifacts as human readable format - [#29017](https://github.com/apache/beam/issues/29017). For now, to use the artifacts such as min and max that were produced by the eariler `MLTransform`, use `read_artifact_location` of `MLTransform`, which reads artifacts that were produced earlier in a different `MLTransform` ([#29016](https://github.com/apache/beam/pull/29016/)) + +## Security Fixes +* Fixed [CVE-YYYY-NNNN](https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN) (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). +* Fixed [CVE-2023-39325](https://www.cve.org/CVERecord?id=CVE-2023-39325) (Java/Python/Go) ([#29118](https://github.com/apache/beam/issues/29118)). + +## Known Issues + +* ([#X](https://github.com/apache/beam/issues/X)). + +# [2.51.0] - 2023-10-03 + +## New Features / Improvements + +* In Python, [RunInference](https://beam.apache.org/documentation/sdks/python-machine-learning/#why-use-the-runinference-api) now supports loading many models in the same transform using a [KeyedModelHandler](https://beam.apache.org/documentation/sdks/python-machine-learning/#use-a-keyed-modelhandler) ([#27628](https://github.com/apache/beam/issues/27628)). +* In Python, the [VertexAIModelHandlerJSON](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.vertex_ai_inference.html#apache_beam.ml.inference.vertex_ai_inference.VertexAIModelHandlerJSON) now supports passing in inference_args. These will be passed through to the Vertex endpoint as parameters. +* Added support to run `mypy` on user pipelines ([#27906](https://github.com/apache/beam/issues/27906)) + + +## Breaking Changes + +* Removed fastjson library dependency for Beam SQL. Table property is changed to be based on jackson ObjectNode (Java) ([#24154](https://github.com/apache/beam/issues/24154)). +* Removed TensorFlow from Beam Python container images [PR](https://github.com/apache/beam/pull/28424). If you have been negatively affected by this change, please comment on [#20605](https://github.com/apache/beam/issues/20605). +* Removed the parameter `t reflect.Type` from `parquetio.Write`. The element type is derived from the input PCollection (Go) ([#28490](https://github.com/apache/beam/issues/28490)) +* Refactor BeamSqlSeekableTable.setUp adding a parameter joinSubsetType. [#28283](https://github.com/apache/beam/issues/28283) + + +## Bugfixes + +* Fixed exception chaining issue in GCS connector (Python) ([#26769](https://github.com/apache/beam/issues/26769#issuecomment-1700422615)). +* Fixed streaming inserts exception handling, GoogleAPICallErrors are now retried according to retry strategy and routed to failed rows where appropriate rather than causing a pipeline error (Python) ([#21080](https://github.com/apache/beam/issues/21080)). +* Fixed a bug in Python SDK's cross-language Bigtable sink that mishandled records that don't have an explicit timestamp set: [#28632](https://github.com/apache/beam/issues/28632). + + +## Security Fixes +* Python containers updated, fixing [CVE-2021-30474](https://nvd.nist.gov/vuln/detail/CVE-2021-30474), [CVE-2021-30475](https://nvd.nist.gov/vuln/detail/CVE-2021-30475), [CVE-2021-30473](https://nvd.nist.gov/vuln/detail/CVE-2021-30473), [CVE-2020-36133](https://nvd.nist.gov/vuln/detail/CVE-2020-36133), [CVE-2020-36131](https://nvd.nist.gov/vuln/detail/CVE-2020-36131), [CVE-2020-36130](https://nvd.nist.gov/vuln/detail/CVE-2020-36130), and [CVE-2020-36135](https://nvd.nist.gov/vuln/detail/CVE-2020-36135) +* Used go 1.21.1 to build, fixing [CVE-2023-39320](https://security-tracker.debian.org/tracker/CVE-2023-39320) + +## Known Issues + +* Python pipelines using BigQuery Storage Read API must pin `fastavro` + dependency to 1.8.3 or earlier: [#28811](https://github.com/apache/beam/issues/28811) + +# [2.50.0] - 2023-08-30 + +## Highlights + +* Spark 3.2.2 is used as default version for Spark runner ([#23804](https://github.com/apache/beam/issues/23804)). +* The Go SDK has a new default local runner, called Prism ([#24789](https://github.com/apache/beam/issues/24789)). +* All Beam released container images are now [multi-arch images](https://cloud.google.com/kubernetes-engine/docs/how-to/build-multi-arch-for-arm#what_is_a_multi-arch_image) that support both x86 and ARM CPU architectures. + +## I/Os + +* Java KafkaIO now supports picking up topics via topicPattern ([#26948](https://github.com/apache/beam/pull/26948)) +* Support for read from Cosmos DB Core SQL API ([#23604](https://github.com/apache/beam/issues/23604)) +* Upgraded to HBase 2.5.5 for HBaseIO. (Java) ([#27711](https://github.com/apache/beam/issues/19554)) +* Added support for GoogleAdsIO source (Java) ([#27681](https://github.com/apache/beam/pull/27681)). + +## New Features / Improvements + +* The Go SDK now requires Go 1.20 to build. ([#27558](https://github.com/apache/beam/issues/27558)) +* The Go SDK has a new default local runner, Prism. ([#24789](https://github.com/apache/beam/issues/24789)). + * Prism is a portable runner that executes each transform independantly, ensuring coders. + * At this point it supercedes the Go direct runner in functionality. The Go direct runner is now deprecated. + * See https://github.com/apache/beam/blob/master/sdks/go/pkg/beam/runners/prism/README.md for the goals and features of Prism. +* Hugging Face Model Handler for RunInference added to Python SDK. ([#26632](https://github.com/apache/beam/pull/26632)) +* Hugging Face Pipelines support for RunInference added to Python SDK. ([#27399](https://github.com/apache/beam/pull/27399)) +* Vertex AI Model Handler for RunInference now supports private endpoints ([#27696](https://github.com/apache/beam/pull/27696)) +* MLTransform transform added with support for common ML pre/postprocessing operations ([#26795](https://github.com/apache/beam/pull/26795)) +* Upgraded the Kryo extension for the Java SDK to Kryo 5.5.0. This brings in bug fixes, performance improvements, and serialization of Java 14 records. ([#27635](https://github.com/apache/beam/issues/27635)) +* All Beam released container images are now [multi-arch images](https://cloud.google.com/kubernetes-engine/docs/how-to/build-multi-arch-for-arm#what_is_a_multi-arch_image) that support both x86 and ARM CPU architectures. ([#27674](https://github.com/apache/beam/issues/27674)). The multi-arch container images include: + * All versions of Go, Python, Java and Typescript SDK containers. + * All versions of Flink job server containers. + * Java and Python expansion service containers. + * Transform service controller container. + * Spark3 job server container. +* Added support for batched writes to AWS SQS for improved throughput (Java, AWS 2).([#21429](https://github.com/apache/beam/issues/21429)) + +## Breaking Changes + +* Python SDK: Legacy runner support removed from Dataflow, all pipelines must use runner v2. +* Python SDK: Dataflow Runner will no longer stage Beam SDK from PyPI in the `--staging_location` at pipeline submission. Custom container images that are not based on Beam's default image must include Apache Beam installation.([#26996](https://github.com/apache/beam/issues/26996)) + +## Deprecations + +* The Go Direct Runner is now Deprecated. It remains available to reduce migration churn. + * Tests can be set back to the direct runner by overriding TestMain: `func TestMain(m *testing.M) { ptest.MainWithDefault(m, "direct") }` + * It's recommended to fix issues seen in tests using Prism, as they can also happen on any portable runner. + * Use the generic register package for your pipeline DoFns to ensure pipelines function on portable runners, like prism. + * Do not rely on closures or using package globals for DoFn configuration. They don't function on portable runners. + +## Bugfixes + +* Fixed DirectRunner bug in Python SDK where GroupByKey gets empty PCollection and fails when pipeline option `direct_num_workers!=1`.([#27373](https://github.com/apache/beam/pull/27373)) +* Fixed BigQuery I/O bug when estimating size on queries that utilize row-level security ([#27474](https://github.com/apache/beam/pull/27474)) + +## Known Issues + +* Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). +* Python Pipelines using BigQuery IO or `orjson` dependency might experience segmentation faults or get stuck: [#28318](https://github.com/apache/beam/issues/28318). +* Beam Python containers rely on a version of Debian/aom that has several security vulnerabilities: [CVE-2021-30474](https://nvd.nist.gov/vuln/detail/CVE-2021-30474), [CVE-2021-30475](https://nvd.nist.gov/vuln/detail/CVE-2021-30475), [CVE-2021-30473](https://nvd.nist.gov/vuln/detail/CVE-2021-30473), [CVE-2020-36133](https://nvd.nist.gov/vuln/detail/CVE-2020-36133), [CVE-2020-36131](https://nvd.nist.gov/vuln/detail/CVE-2020-36131), [CVE-2020-36130](https://nvd.nist.gov/vuln/detail/CVE-2020-36130), and [CVE-2020-36135](https://nvd.nist.gov/vuln/detail/CVE-2020-36135) +* Python SDK's cross-language Bigtable sink mishandles records that don't have an explicit timestamp set: [#28632](https://github.com/apache/beam/issues/28632). To avoid this issue, set explicit timestamps for all records before writing to Bigtable. + + +# [2.49.0] - 2023-07-17 + + +## I/Os + +* Support for Bigtable Change Streams added in Java `BigtableIO.ReadChangeStream` ([#27183](https://github.com/apache/beam/issues/27183)) + +## New Features / Improvements + +* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch` ([#27023](https://github.com/apache/beam/pull/27023)). +* Disabled `pip` cache when installing packages on the workers. This reduces the size of prebuilt Python container images ([#27035](https://github.com/apache/beam/pull/27035)). +* Select dedicated avro datum reader and writer (Java) ([#18874](https://github.com/apache/beam/issues/18874)). +* Timer API for the Go SDK (Go) ([#22737](https://github.com/apache/beam/issues/22737)). + +## Deprecations + +* Removed Python 3.7 support. ([#26447](https://github.com/apache/beam/issues/26447)) + +## Bugfixes + +* Fixed KinesisIO `NullPointerException` when a progress check is made before the reader is started (IO) ([#23868](https://github.com/apache/beam/issues/23868)) + +## Known Issues + +* Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). + + +# [2.48.0] - 2023-05-31 + +## Highlights + +* "Experimental" annotation cleanup: the annotation and concept have been removed from Beam to avoid + the misperception of code as "not ready". Any proposed breaking changes will be subject to + case-by-case pro/con decision making (and generally avoided) rather than using the "Experimental" + to allow them. + +## I/Os + +* Added rename for GCS and copy for local filesystem (Go) ([#25779](https://github.com/apache/beam/issues/26064)). +* Added support for enhanced fan-out in KinesisIO.Read (Java) ([#19967](https://github.com/apache/beam/issues/19967)). + * This change is not compatible with Flink savepoints created by Beam 2.46.0 applications which had KinesisIO sources. +* Added textio.ReadWithFilename transform (Go) ([#25812](https://github.com/apache/beam/issues/25812)). +* Added fileio.MatchContinuously transform (Go) ([#26186](https://github.com/apache/beam/issues/26186)). + +## New Features / Improvements + +* Allow passing service name for google-cloud-profiler (Python) ([#26280](https://github.com/apache/beam/issues/26280)). +* Dead letter queue support added to RunInference in Python ([#24209](https://github.com/apache/beam/issues/24209)). +* Support added for defining pre/postprocessing operations on the RunInference transform ([#26308](https://github.com/apache/beam/issues/26308)) +* Adds a Docker Compose based transform service that can be used to discover and use portable Beam transforms ([#26023](https://github.com/apache/beam/pull/26023)). + +## Breaking Changes + +* Passing a tag into MultiProcessShared is now required in the Python SDK ([#26168](https://github.com/apache/beam/issues/26168)). +* CloudDebuggerOptions is removed (deprecated in Beam v2.47.0) for Dataflow runner as the Google Cloud Debugger service is [shutting down](https://cloud.google.com/debugger/docs/deprecations). (Java) ([#25959](https://github.com/apache/beam/issues/25959)). +* AWS 2 client providers (deprecated in Beam [v2.38.0](#2380---2022-04-20)) are finally removed ([#26681](https://github.com/apache/beam/issues/26681)). +* AWS 2 SnsIO.writeAsync (deprecated in Beam v2.37.0 due to risk of data loss) was finally removed ([#26710](https://github.com/apache/beam/issues/26710)). +* AWS 2 coders (deprecated in Beam v2.43.0 when adding Schema support for AWS Sdk Pojos) are finally removed ([#23315](https://github.com/apache/beam/issues/23315)). + +## Deprecations + + +## Bugfixes + +* Fixed Java bootloader failing with Too Long Args due to long classpaths, with a pathing jar. (Java) ([#25582](https://github.com/apache/beam/issues/25582)). + +## Known Issues + +* PubsubIO writes will throw *SizeLimitExceededException* for any message above 100 bytes, when used in batch (bounded) mode. (Java) ([#27000](https://github.com/apache/beam/issues/27000)). +* Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). +* Python SDK's cross-language Bigtable sink mishandles records that don't have an explicit timestamp set: [#28632](https://github.com/apache/beam/issues/28632). To avoid this issue, set explicit timestamps for all records before writing to Bigtable. + + +# [2.47.0] - 2023-05-10 + +## Highlights + +* Apache Beam adds Python 3.11 support ([#23848](https://github.com/apache/beam/issues/23848)). + +## I/Os + +* BigQuery Storage Write API is now available in Python SDK via cross-language ([#21961](https://github.com/apache/beam/issues/21961)). +* Added HbaseIO support for writing RowMutations (ordered by rowkey) to Hbase (Java) ([#25830](https://github.com/apache/beam/issues/25830)). +* Added fileio transforms MatchFiles, MatchAll and ReadMatches (Go) ([#25779](https://github.com/apache/beam/issues/25779)). +* Add integration test for JmsIO + fix issue with multiple connections (Java) ([#25887](https://github.com/apache/beam/issues/25887)). + +## New Features / Improvements + +* The Flink runner now supports Flink 1.16.x ([#25046](https://github.com/apache/beam/issues/25046)). +* Schema'd PTransforms can now be directly applied to Beam dataframes just like PCollections. + (Note that when doing multiple operations, it may be more efficient to explicitly chain the operations + like `df | (Transform1 | Transform2 | ...)` to avoid excessive conversions.) +* The Go SDK adds new transforms periodic.Impulse and periodic.Sequence that extends support + for slowly updating side input patterns. ([#23106](https://github.com/apache/beam/issues/23106)) +* Several Google client libraries in Python SDK dependency chain were updated to latest available major versions. ([#24599](https://github.com/apache/beam/pull/24599)) + +## Breaking Changes + +* If a main session fails to load, the pipeline will now fail at worker startup. ([#25401](https://github.com/apache/beam/issues/25401)). +* Python pipeline options will now ignore unparsed command line flags prefixed with a single dash. ([#25943](https://github.com/apache/beam/issues/25943)). +* The SmallestPerKey combiner now requires keyword-only arguments for specifying optional parameters, such as `key` and `reverse`. ([#25888](https://github.com/apache/beam/issues/25888)). + +## Deprecations + +* Cloud Debugger support and its pipeline options are deprecated and will be removed in the next Beam version, + in response to the Google Cloud Debugger service [turning down](https://cloud.google.com/debugger/docs/deprecations). (Java) ([#25959](https://github.com/apache/beam/issues/25959)). + +## Bugfixes + +* BigQuery sink in STORAGE_WRITE_API mode in batch pipelines could result in data consistency issues during the handling of other unrelated transient errors for Beam SDKs 2.35.0 - 2.46.0 (inclusive). For more details see: https://github.com/apache/beam/issues/26521 + +## Known Issues + +* The google-cloud-profiler dependency was accidentally removed from Beam's Python Docker + Image [#26998](https://github.com/apache/beam/issues/26698). [Dataflow Docker images](https://cloud.google.com/dataflow/docs/concepts/sdk-worker-dependencies) still preinstall this dependency. +* Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). + +# [2.46.0] - 2023-03-10 + +## Highlights + +* Java SDK containers migrated to [Eclipse Temurin](https://hub.docker.com/_/eclipse-temurin) + as a base. This change migrates away from the deprecated [OpenJDK](https://hub.docker.com/_/openjdk) + container. Eclipse Temurin is currently based upon Ubuntu 22.04 while the OpenJDK + container was based upon Debian 11. +* RunInference PTransform will accept model paths as SideInputs in Python SDK. ([#24042](https://github.com/apache/beam/issues/24042)) +* RunInference supports ONNX runtime in Python SDK ([#22972](https://github.com/apache/beam/issues/22972)) +* Tensorflow Model Handler for RunInference in Python SDK ([#25366](https://github.com/apache/beam/issues/25366)) +* Java SDK modules migrated to use `:sdks:java:extensions:avro` ([#24748](https://github.com/apache/beam/issues/24748)) + +## I/Os + +* Added in JmsIO a retry policy for failed publications (Java) ([#24971](https://github.com/apache/beam/issues/24971)). +* Support for `LZMA` compression/decompression of text files added to the Python SDK ([#25316](https://github.com/apache/beam/issues/25316)) +* Added ReadFrom/WriteTo Csv/Json as top-level transforms to the Python SDK. + +## New Features / Improvements + +* Add UDF metrics support for Samza portable mode. +* Option for SparkRunner to avoid the need of SDF output to fit in memory ([#23852](https://github.com/apache/beam/issues/23852)). + This helps e.g. with ParquetIO reads. Turn the feature on by adding experiment `use_bounded_concurrent_output_for_sdf`. +* Add `WatchFilePattern` transform, which can be used as a side input to the RunInference PTransfrom to watch for model updates using a file pattern. ([#24042](https://github.com/apache/beam/issues/24042)) +* Add support for loading TorchScript models with `PytorchModelHandler`. The TorchScript model path can be + passed to PytorchModelHandler using `torch_script_model_path=`. ([#25321](https://github.com/apache/beam/pull/25321)) +* The Go SDK now requires Go 1.19 to build. ([#25545](https://github.com/apache/beam/pull/25545)) +* The Go SDK now has an initial native Go implementation of a portable Beam Runner called Prism. ([#24789](https://github.com/apache/beam/pull/24789)) + * For more details and current state see https://github.com/apache/beam/tree/master/sdks/go/pkg/beam/runners/prism. + +## Breaking Changes + +* The deprecated SparkRunner for Spark 2 (see [2.41.0](#2410---2022-08-23)) was removed ([#25263](https://github.com/apache/beam/pull/25263)). +* Python's BatchElements performs more aggressive batching in some cases, + capping at 10 second rather than 1 second batches by default and excluding + fixed cost in this computation to better handle cases where the fixed cost + is larger than a single second. To get the old behavior, one can pass + `target_batch_duration_secs_including_fixed_cost=1` to BatchElements. +* Dataflow runner enables sibling SDK protocol for Python pipelines using custom containers on Beam 2.46.0 and newer SDKs. + If your Python pipeline starts to stall after you switch to 2.46.0 and you use a custom container, please verify + that your custom container does not include artifacts from older Beam SDK releases. In particular, check in your `Dockerfile` + that the Beam container entrypoint and/or Beam base image version match the Beam SDK version used at job submission. + +## Deprecations + +* Avro related classes are deprecated in module `beam-sdks-java-core` and will be eventually removed. Please, migrate to a new module `beam-sdks-java-extensions-avro` instead by importing the classes from `org.apache.beam.sdk.extensions.avro` package. + For the sake of migration simplicity, the relative package path and the whole class hierarchy of Avro related classes in new module is preserved the same as it was before. + For example, import `org.apache.beam.sdk.extensions.avro.coders.AvroCoder` class instead of`org.apache.beam.sdk.coders.AvroCoder`. ([#24749](https://github.com/apache/beam/issues/24749)). + +## Bugfixes + + +# [2.45.0] - 2023-02-15 +## I/Os + * Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). * MongoDB IO connector added (Go) ([#24575](https://github.com/apache/beam/issues/24575)). diff --git a/runners/flink/flink_runner.gradle b/runners/flink/flink_runner.gradle index f4a38b96217e..817355294de0 100644 --- a/runners/flink/flink_runner.gradle +++ b/runners/flink/flink_runner.gradle @@ -331,9 +331,6 @@ def createValidatesRunnerTask(Map m) { if (!config.streaming) { // FlinkBatchExecutionInternalTimeService does not support timer registration on timer firing. excludeTestsMatching 'org.apache.beam.sdk.transforms.ParDoTest$TimestampTests.testOnTimerTimestampSkew' - } else { - // https://github.com/apache/beam/issues/25485 - excludeTestsMatching 'org.apache.beam.sdk.transforms.GroupByKeyTest$BasicTests.testAfterProcessingTimeContinuationTriggerUsingState' } } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java index 34b0c2bfb307..5726c11330b4 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java @@ -262,6 +262,7 @@ public void translateNode( FlinkUnboundedSource unboundedSource = FlinkSource.unbounded( + transform.getName(), rawSource, new SerializablePipelineOptions(context.getPipelineOptions()), parallelism); @@ -419,6 +420,7 @@ public void translateNode( FlinkBoundedSource flinkBoundedSource = FlinkSource.bounded( + transform.getName(), rawSource, new SerializablePipelineOptions(context.getPipelineOptions()), parallelism);