Merge branch 'apache:master' into master
xianhualiu authored Apr 10, 2024
2 parents ffb52f3 + 0658874 commit 2986a82
Showing 150 changed files with 6,207 additions and 378 deletions.
118 changes: 118 additions & 0 deletions .github/workflows/IO_Iceberg.yml
@@ -0,0 +1,118 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: IcebergIO Unit Tests

on:
  push:
    tags: ['v*']
    branches: ['master', 'release-*']
    paths:
      - "sdks/java/io/iceberg/**"
      - ".github/workflows/IO_Iceberg.yml"
  pull_request_target:
    branches: ['master', 'release-*']
    paths:
      - "sdks/java/io/iceberg/**"
      - 'release/trigger_all_tests.json'
      - '.github/trigger_files/IO_Iceberg.json'
  issue_comment:
    types: [created]
  schedule:
    - cron: '15 1/6 * * *'
  workflow_dispatch:

# Set explicit permissions for this workflow to avoid the default `write-all` permissions granted for the pull_request_target event
permissions:
  actions: write
  pull-requests: write
  checks: write
  contents: read
  deployments: read
  id-token: none
  issues: write
  discussions: read
  packages: read
  pages: read
  repository-projects: read
  security-events: read
  statuses: read

# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
  cancel-in-progress: true

env:
  GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
  GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }}
  GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }}

jobs:
  IO_Iceberg:
    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
    strategy:
      matrix:
        job_name: ["IO_Iceberg"]
        # job_phrase must match the comment trigger checked in the `if` condition below.
        job_phrase: ["Run IcebergIO Unit Tests"]
    timeout-minutes: 60
    if: |
      github.event_name == 'push' ||
      github.event_name == 'pull_request_target' ||
      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
      github.event_name == 'workflow_dispatch' ||
      github.event.comment.body == 'Run IcebergIO Unit Tests'
    runs-on: [self-hosted, ubuntu-20.04, main]
    steps:
      - uses: actions/checkout@v4
      - name: Setup repository
        uses: ./.github/actions/setup-action
        with:
          comment_phrase: ${{ matrix.job_phrase }}
          github_token: ${{ secrets.GITHUB_TOKEN }}
          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
      - name: Setup environment
        uses: ./.github/actions/setup-environment-action
      - name: run IcebergIO build script
        uses: ./.github/actions/gradle-command-self-hosted-action
        with:
          gradle-command: :sdks:java:io:iceberg:build
          arguments: |
            -PdisableSpotlessCheck=true \
            -PdisableCheckStyle=true
      - name: Archive JUnit Test Results
        uses: actions/upload-artifact@v4
        if: ${{ !success() }}
        with:
          name: JUnit Test Results
          path: "**/build/reports/tests/"
      - name: Publish JUnit Test Results
        uses: EnricoMi/publish-unit-test-result-action@v2
        if: always()
        with:
          commit: '${{ env.prsha || env.GITHUB_SHA }}'
          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
          files: '**/build/test-results/**/*.xml'
      - name: Archive SpotBugs Results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: SpotBugs Results
          path: '**/build/reports/spotbugs/*.html'
      - name: Publish SpotBugs Results
        uses: jwgmeligmeyling/[email protected]
        if: always()
        with:
          name: Publish SpotBugs
          path: '**/build/reports/spotbugs/*.html'
8 changes: 8 additions & 0 deletions CHANGES.md
@@ -52,6 +52,7 @@
* ([#X](https://github.com/apache/beam/issues/X)).
-->

# [2.56.0] - Unreleased

## Highlights
@@ -89,6 +90,12 @@

* ([#X](https://github.com/apache/beam/issues/X)).

# [2.55.1]

## Bugfixes

* Fixed issue that broke WriteToJson in languages other than Java (X-lang) ([#30776](https://github.com/apache/beam/issues/30776)).

# [2.55.0] - 2024-03-25

## Highlights
@@ -134,6 +141,7 @@
## Known Issues

* In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)).
* WriteToJson broken in languages other than Java (X-lang) ([#30776](https://github.com/apache/beam/issues/30776)).

# [2.54.0] - 2024-02-14

1 change: 1 addition & 0 deletions build.gradle.kts
@@ -303,6 +303,7 @@ tasks.register("javaPreCommit") {
dependsOn(":sdks:java:io:synthetic:build")
dependsOn(":sdks:java:io:xml:build")
dependsOn(":sdks:java:javadoc:allJavadoc")
dependsOn(":sdks:java:managed:build")
dependsOn(":sdks:java:testing:expansion-service:build")
dependsOn(":sdks:java:testing:jpms-tests:build")
dependsOn(":sdks:java:testing:load-tests:build")
@@ -601,16 +601,16 @@ class BeamModulePlugin implements Plugin<Project> {
def dbcp2_version = "2.9.0"
def errorprone_version = "2.10.0"
// [bomupgrader] determined by: com.google.api:gax, consistent with: google_cloud_platform_libraries_bom
def gax_version = "2.42.0"
def gax_version = "2.46.1"
def google_ads_version = "26.0.0"
def google_clients_version = "2.0.0"
def google_cloud_bigdataoss_version = "2.2.16"
// [bomupgrader] determined by: com.google.cloud:google-cloud-spanner, consistent with: google_cloud_platform_libraries_bom
def google_cloud_spanner_version = "6.58.0"
def google_cloud_spanner_version = "6.62.0"
def google_code_gson_version = "2.10.1"
def google_oauth_clients_version = "1.34.1"
// [bomupgrader] determined by: io.grpc:grpc-netty, consistent with: google_cloud_platform_libraries_bom
def grpc_version = "1.61.0"
def grpc_version = "1.62.2"
def guava_version = "32.1.2-jre"
def hadoop_version = "2.10.2"
def hamcrest_version = "2.1"
@@ -640,7 +640,7 @@ class BeamModulePlugin implements Plugin<Project> {
def spotbugs_version = "4.0.6"
def testcontainers_version = "1.17.3"
// [bomupgrader] determined by: org.apache.arrow:arrow-memory-core, consistent with: google_cloud_platform_libraries_bom
def arrow_version = "15.0.0"
def arrow_version = "15.0.1"
def jmh_version = "1.34"
def jupiter_version = "5.7.0"

@@ -730,12 +730,12 @@ class BeamModulePlugin implements Plugin<Project> {
google_api_client_jackson2 : "com.google.api-client:google-api-client-jackson2:$google_clients_version",
google_api_client_java6 : "com.google.api-client:google-api-client-java6:$google_clients_version",
google_api_common : "com.google.api:api-common", // google_cloud_platform_libraries_bom sets version
google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20240124-2.0.0", // [bomupgrader] sets version
google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20240128-2.0.0", // [bomupgrader] sets version
google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20240229-2.0.0", // [bomupgrader] sets version
google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20240310-2.0.0", // [bomupgrader] sets version
google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20240218-$google_clients_version",
google_api_services_healthcare : "com.google.apis:google-api-services-healthcare:v1-rev20240130-$google_clients_version",
google_api_services_pubsub : "com.google.apis:google-api-services-pubsub:v1-rev20220904-$google_clients_version",
google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20240205-2.0.0", // [bomupgrader] sets version
google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20240311-2.0.0", // [bomupgrader] sets version
google_auth_library_credentials : "com.google.auth:google-auth-library-credentials", // google_cloud_platform_libraries_bom sets version
google_auth_library_oauth2_http : "com.google.auth:google-auth-library-oauth2-http", // google_cloud_platform_libraries_bom sets version
google_cloud_bigquery : "com.google.cloud:google-cloud-bigquery", // google_cloud_platform_libraries_bom sets version
@@ -747,13 +747,13 @@ class BeamModulePlugin implements Plugin<Project> {
google_cloud_core_grpc : "com.google.cloud:google-cloud-core-grpc", // google_cloud_platform_libraries_bom sets version
google_cloud_datacatalog_v1beta1 : "com.google.cloud:google-cloud-datacatalog", // google_cloud_platform_libraries_bom sets version
google_cloud_dataflow_java_proto_library_all: "com.google.cloud.dataflow:google-cloud-dataflow-java-proto-library-all:0.5.160304",
google_cloud_datastore_v1_proto_client : "com.google.cloud.datastore:datastore-v1-proto-client:2.18.3", // [bomupgrader] sets version
google_cloud_datastore_v1_proto_client : "com.google.cloud.datastore:datastore-v1-proto-client:2.19.0", // [bomupgrader] sets version
google_cloud_firestore : "com.google.cloud:google-cloud-firestore", // google_cloud_platform_libraries_bom sets version
google_cloud_pubsub : "com.google.cloud:google-cloud-pubsub", // google_cloud_platform_libraries_bom sets version
google_cloud_pubsublite : "com.google.cloud:google-cloud-pubsublite", // google_cloud_platform_libraries_bom sets version
// [bomupgrader] the BOM version is set by scripts/tools/bomupgrader.py. If update manually, also update
// libraries-bom version on sdks/java/container/license_scripts/dep_urls_java.yaml
google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.32.0",
google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.36.0",
google_cloud_spanner : "com.google.cloud:google-cloud-spanner", // google_cloud_platform_libraries_bom sets version
google_cloud_spanner_test : "com.google.cloud:google-cloud-spanner:$google_cloud_spanner_version:tests",
google_cloud_vertexai : "com.google.cloud:google-cloud-vertexai", // google_cloud_platform_libraries_bom sets version
@@ -1329,7 +1329,7 @@ class BeamModulePlugin implements Plugin<Project> {
"com.google.auto.service:auto-service-annotations:$autoservice_version",
"com.google.auto.value:auto-value-annotations:$autovalue_version",
"com.google.code.findbugs:jsr305:$jsr305_version",
"com.google.j2objc:j2objc-annotations:2.8",
"com.google.j2objc:j2objc-annotations:3.0.0",
// These dependencies are needed to avoid error-prone warnings on package-info.java files,
// also to include the annotations to suppress warnings.
//
6 changes: 2 additions & 4 deletions examples/notebooks/get-started/try-apache-beam-yaml.ipynb
@@ -533,7 +533,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Beam YAML has EXPERIMENTAL ability to do aggregations to group and combine values across records. The is accomplished via the `Combine` transform type. Currently `Combine` needs to be in the `yaml_experimental_features` option (see the bottom of the pipeline) to use this transform.\n",
"Beam YAML has EXPERIMENTAL ability to do aggregations to group and combine values across records. The is accomplished via the `Combine` transform type.\n",
"\n",
"In this example we'll aggregate our records based on the `is_adult` classification. We'll calculate an average age for each of the groups."
]
@@ -566,9 +566,7 @@
" total:\n",
" value: age\n",
" fn: mean\n",
" - type: LogForTesting\n",
"options:\n",
" yaml_experimental_features: Combine"
" - type: LogForTesting",
]
},
{
@@ -0,0 +1,16 @@
Prompt:
What is Apache Beam?

Response:
Apache Beam is an open-source unified programming model for data processing pipelines. It provides a set of language-specific SDKs that allow you to define and execute data processing workflows.
Apache Beam lets you write pipelines in any of the supported languages (Java, Python, Go, and Scala) and execute them on any of the supported runners (Apache Flink, Apache Spark, and Google Cloud Dataflow, among others).

Learning Apache Beam is a valuable skill for data processing and analysis. Here are some resources to get you started:
* The 'Get Started' section of the Apache Beam website is a perfect place to start. It provides an overview of Apache Beam and its key concepts.
* For a hands-on introduction, you can start by exploring the WordCount example provided in the 'Get Started' section, which demonstrates the basic concepts of Apache Beam.
* Tour of Beam is an interactive tour that teaches you core Beam concepts. It provides a sandbox environment to write and run pipelines while exploring different topics.
* Beam Playground offers an interactive space to experiment with the Apache Beam transforms and examples without installing anything on your system.

After you have a basic understanding of Apache Beam, you can start to build your own pipelines. The Apache Beam website has a number of resources that can help you get started, including Apache Beam Documentation and Apache Beam Community. For those interested in the source code, you can explore the Apache Beam repository on GitHub.

By leveraging these resources, you can build a strong foundation in Apache Beam and start constructing your own data processing pipelines. Additionally, Apache Beam is an open-source project, and contributions are welcome. Whether you're fixing bugs, improving documentation, or adding new features, your work can benefit the entire Beam community. To learn more about how you can contribute, visit the 'Contribute' section of the Apache Beam website.
@@ -0,0 +1,18 @@
Prompt:
What is a pipeline in Apache Beam?

Response:
In Apache Beam, a pipeline encapsulates the entirety of a data processing task, including all the data and each step of the process. Essentially, it's a Directed Acyclic Graph of transformations (known as `PTransform`) applied to data collections (`PCollection`).

The simplest pipelines in Apache Beam follow a linear flow of operations, typically adhering to a read-process-write pattern. However, pipelines can also be significantly more complex, featuring multiple input sources, output sinks, and transforms that both read from and write to multiple `PCollection` objects (see the branching sketch below). For more information about pipeline design and best practices, see the 'Common Pipeline Patterns' section in the Apache Beam documentation.

To use Apache Beam, your driver program must first create an instance of the Beam SDK class `Pipeline` (typically in the `main()` function).

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    pass  # Build your pipeline here.
```
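
For a sense of the non-linear shapes mentioned above, here is a minimal sketch of a branching pipeline that reads one source and writes to two sinks. The file names and the length-based filters are illustrative assumptions, not part of the original example:

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    # One input source feeding two independent branches (paths are placeholders).
    lines = pipeline | 'Read' >> beam.io.ReadFromText('input.txt')
    short = lines | 'KeepShort' >> beam.Filter(lambda line: len(line) < 80)
    long_lines = lines | 'KeepLong' >> beam.Filter(lambda line: len(line) >= 80)
    short | 'WriteShort' >> beam.io.WriteToText('short_lines')
    long_lines | 'WriteLong' >> beam.io.WriteToText('long_lines')
```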

When you create your pipeline, you also set its configuration options. You can set the options programmatically, or you can define them ahead of time (or read them from the command line) and pass them to the `Pipeline` object when you construct it. The latter approach is often easier than setting the options programmatically.
@@ -0,0 +1,31 @@
Prompt:
How do I configure pipeline in Apache Beam?

Response:
Configuring pipeline options in Apache Beam is crucial for specifying the execution environment, managing resources, and tailoring the pipeline's behavior to meet specific data processing needs and performance requirements.

You can set pipeline options programmatically or pass them in using the command line. These options include runner-specific settings, job name, project ID (for cloud runners), machine types, number of workers, and more.

Apache Beam offers a variety of standard pipeline options that allow you to customize and optimize your data processing pipelines.

Beam SDKs include a command-line parser that you can use to set pipeline options. Use command-line arguments in the format `--<option>=<value>`. For example, the following command sets the `--runner` option to `DirectRunner` and the `--project` option to `my-project-id`:

```bash
python my-pipeline.py --runner=DirectRunner --project=my-project-id
```

To set the pipeline options programmatically, use the `PipelineOptions` class. For example, the following code sets the `--runner` option to `DirectRunner` and the `--project` option to `my-project-id`:

```python
from apache_beam import Pipeline
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(
    project='my-project-id',
    runner='DirectRunner'
)

# Pass the options to the pipeline when you construct it.
pipeline = Pipeline(options=options)
```

In addition to the standard pipeline options, you can add custom pipeline options. For a common pattern for configuring pipeline options, see the 'Pipeline option patterns' section in the Apache Beam documentation.
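
As a brief sketch of that pattern, custom options can be declared by subclassing `PipelineOptions`; the option names `--input` and `--output` here are illustrative assumptions:

```python
from apache_beam.options.pipeline_options import PipelineOptions

class CustomOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # Custom options are registered on the standard argparse parser.
        parser.add_argument('--input', help='Path of the input file')
        parser.add_argument('--output', help='Path prefix for output files')

# View the standard options as the custom subclass to access the new fields.
options = PipelineOptions().view_as(CustomOptions)
```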

The WordCount example pipeline in the 'Get Started' section of the Apache Beam documentation demonstrates how to set pipeline options at runtime by using command-line options.
@@ -0,0 +1,25 @@
Prompt:
What is a `PCollection` in Apache Beam?

Response:
A `PCollection` in Apache Beam is an unordered bag of elements. Each `PCollection` is a potentially distributed, homogeneous data set or data stream, and is owned by the specific `Pipeline` object for which it is created. It’s the primary data structure used in Apache Beam pipelines to handle large-scale data processing in batch and streaming modes.

The following example shows how to create a `PCollection` using the `Create` transform:

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    pcollection = pipeline | beam.Create([...])  # Create a PCollection
```

A `PCollection` can either be bounded or unbounded, making it versatile for different data source types. Bounded `PCollection` objects represent a finite data set, such as files or databases, ideal for batch processing. Unbounded `PCollection` objects, on the other hand, represent data streams that continuously grow over time, such as real-time event logs, suitable for stream processing.
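
For illustration, here is a minimal sketch contrasting the two; the file path and Pub/Sub topic are placeholder assumptions:

```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions(streaming=True)) as pipeline:
    # Bounded: a finite, file-based source (placeholder path).
    bounded = pipeline | 'ReadFile' >> beam.io.ReadFromText('gs://my-bucket/data.txt')
    # Unbounded: a continuously growing stream (placeholder topic).
    unbounded = pipeline | 'ReadStream' >> beam.io.ReadFromPubSub(
        topic='projects/my-project/topics/my-topic')
```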

Beam’s computational patterns and transforms are focused on situations where distributed data-parallel computation is required. Therefore, a `PCollection` has the following key characteristics:
- All elements must be of the same type (with support for structured types).
- Every `PCollection` has a coder, which is a specification of the binary format of the elements.
- Elements are immutable. They can't be altered after creation (see the sketch after this list).
- Random access to individual elements of the collection isn't allowed.
- Beam encodes each element for distributed processing.
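
To illustrate the immutability point above, transforms never modify their input `PCollection`; they produce new ones (a minimal sketch):

```python
import apache_beam as beam

with beam.Pipeline() as pipeline:
    numbers = pipeline | beam.Create([1, 2, 3])
    # Map does not alter `numbers`; it yields a new, independent PCollection.
    doubled = numbers | beam.Map(lambda x: x * 2)
```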

For more information about `PCollection` objects in Apache Beam and usage patterns, see the 'Beam Programming Guide: PCollections' section in the Apache Beam documentation.