diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml index 67f8b21445dc..a2fbeae1319c 100644 --- a/.github/ISSUE_TEMPLATE/bug.yml +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -50,6 +50,7 @@ body: - "Priority: 2 (default / most bugs should be filed as P2)" - "Priority: 1 (data loss / total loss of function)" - "Priority: 0 (outage / urgent vulnerability)" + default: 1 validations: required: true - type: checkboxes @@ -68,6 +69,7 @@ body: - label: "Component: Beam playground" - label: "Component: Beam katas" - label: "Component: Website" + - label: "Component: Infrastructure" - label: "Component: Spark Runner" - label: "Component: Flink Runner" - label: "Component: Samza Runner" diff --git a/.github/ISSUE_TEMPLATE/failing_test.yml b/.github/ISSUE_TEMPLATE/failing_test.yml index 44e1cd720745..4295624995fb 100644 --- a/.github/ISSUE_TEMPLATE/failing_test.yml +++ b/.github/ISSUE_TEMPLATE/failing_test.yml @@ -56,6 +56,7 @@ body: - "Priority: 2 (backlog / disabled test but we think the product is healthy)" - "Priority: 1 (unhealthy code / failing or flaky postcommit so we cannot be sure the product is healthy)" - "Priority: 0 (outage / failing precommit test impacting development)" + default: 1 validations: required: true - type: checkboxes @@ -74,6 +75,7 @@ body: - label: "Component: Beam playground" - label: "Component: Beam katas" - label: "Component: Website" + - label: "Component: Infrastructure" - label: "Component: Spark Runner" - label: "Component: Flink Runner" - label: "Component: Samza Runner" diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml index 11234a5e1501..e47c7c0751ce 100644 --- a/.github/ISSUE_TEMPLATE/feature.yml +++ b/.github/ISSUE_TEMPLATE/feature.yml @@ -44,6 +44,7 @@ body: options: - "Priority: 3 (nice-to-have improvement)" - "Priority: 2 (default / most feature requests should be filed as P2)" + default: 1 validations: required: true - type: checkboxes @@ -62,6 +63,7 @@ body: - label: "Component: Beam playground" - label: "Component: Beam katas" - label: "Component: Website" + - label: "Component: Infrastructure" - label: "Component: Spark Runner" - label: "Component: Flink Runner" - label: "Component: Samza Runner" diff --git a/.github/ISSUE_TEMPLATE/task.yml b/.github/ISSUE_TEMPLATE/task.yml index 477b91b181be..8da74a65d8f2 100644 --- a/.github/ISSUE_TEMPLATE/task.yml +++ b/.github/ISSUE_TEMPLATE/task.yml @@ -45,6 +45,7 @@ body: - "Priority: 3 (nice-to-have improvement)" - "Priority: 2 (default / most normal work should be filed as P2)" - "Priority: 1 (urgent / mostly reserved for critical bugs)" + default: 1 validations: required: true - type: checkboxes @@ -63,6 +64,7 @@ body: - label: "Component: Beam playground" - label: "Component: Beam katas" - label: "Component: Website" + - label: "Component: Infrastructure" - label: "Component: Spark Runner" - label: "Component: Flink Runner" - label: "Component: Samza Runner" diff --git a/.github/issue-rules.yml b/.github/issue-rules.yml index b01a22dafd78..c4acb2945575 100644 --- a/.github/issue-rules.yml +++ b/.github/issue-rules.yml @@ -46,6 +46,8 @@ rules: addLabels: ['katas'] - contains: '[x] Component: Website' addLabels: ['website'] +- contains: '[x] Component: Infrastructure' + addLabels: ['infra'] - contains: '[x] Component: Spark' addLabels: ['spark'] - contains: '[x] Component: Flink' diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 1efc8e9e4405..bbdc3a3910ef 100644 --- 
a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 1 + "modification": 3 } diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Direct.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Direct.json index b970762c8397..38ae94aee2fa 100644 --- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Direct.json +++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Direct.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test" + "https://github.com/apache/beam/pull/31761": "noting that PR #31761 should run this test" } diff --git a/.github/workflows/IO_Iceberg_Integration_Tests.yml b/.github/workflows/IO_Iceberg_Integration_Tests.yml index d7c9c6d95746..20d1f4bb60fd 100644 --- a/.github/workflows/IO_Iceberg_Integration_Tests.yml +++ b/.github/workflows/IO_Iceberg_Integration_Tests.yml @@ -72,12 +72,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Run IcebergIO Integration Test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/IO_Iceberg_Performance_Tests.yml b/.github/workflows/IO_Iceberg_Performance_Tests.yml index 40bd43aa17ed..976fbedeadad 100644 --- a/.github/workflows/IO_Iceberg_Performance_Tests.yml +++ b/.github/workflows/IO_Iceberg_Performance_Tests.yml @@ -72,12 +72,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Run IcebergIO Performance Test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/IO_Iceberg_Unit_Tests.yml b/.github/workflows/IO_Iceberg_Unit_Tests.yml index 1787756ab68b..0d72b0da8597 100644 --- a/.github/workflows/IO_Iceberg_Unit_Tests.yml +++ b/.github/workflows/IO_Iceberg_Unit_Tests.yml @@ -91,12 +91,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: run IcebergIO build script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_CancelStaleDataflowJobs.yml b/.github/workflows/beam_CancelStaleDataflowJobs.yml index b568b91dd34a..e8dfee525e31 100644 --- a/.github/workflows/beam_CancelStaleDataflowJobs.yml +++ b/.github/workflows/beam_CancelStaleDataflowJobs.yml @@ -73,12 +73,6 @@ jobs: uses: ./.github/actions/setup-environment-action with: disable-cache: true - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ 
secrets.GCP_PROJECT_ID }} - name: run cancel stale dataflow jobs uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_CleanUpGCPResources.yml b/.github/workflows/beam_CleanUpGCPResources.yml index cf77dd68a92e..29d602357d6f 100644 --- a/.github/workflows/beam_CleanUpGCPResources.yml +++ b/.github/workflows/beam_CleanUpGCPResources.yml @@ -73,13 +73,8 @@ jobs: uses: ./.github/actions/setup-environment-action with: disable-cache: true - - name: Authenticate on GCP - id: auth - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} + - name: Setup gcloud + uses: google-github-actions/setup-gcloud@v2 - name: Install gcloud bigtable cli run: gcloud components install cbt - name: run cleanup GCP resources diff --git a/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml b/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml index 14e5f9783b61..20de04854282 100644 --- a/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml +++ b/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml @@ -73,12 +73,6 @@ jobs: uses: ./.github/actions/setup-environment-action with: disable-cache: true - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: GCloud Docker credential helper run: | gcloud auth configure-docker gcr.io && \ diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml index 5ded71a7652a..e2afb2e2cfd7 100644 --- a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml @@ -48,7 +48,7 @@ env: INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a - CLUSTER_NAME: beam-loadtests-python-cogbk-flink-batch-${{ github.run_id }} + CLUSTER_NAME: beam-loadtests-py-cogbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml index 061a1b9e210e..bae2f9f82ee1 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml @@ -48,7 +48,7 @@ env: INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a - CLUSTER_NAME: beam-loadtests-python-pardo-flink-batch-${{ github.run_id }} + CLUSTER_NAME: beam-loadtests-py-pardo-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml index bec926ab9656..4485b7187f80 
100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml @@ -48,7 +48,7 @@ env: INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a - CLUSTER_NAME: beam-loadtests-python-pardo-flink-stream-${{ github.run_id }} + CLUSTER_NAME: beam-loadtests-py-pardo-flink-stream-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar diff --git a/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml index 5fde65bdccd1..6dfb0634d7f5 100644 --- a/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_Cdap.yml b/.github/workflows/beam_PerformanceTests_Cdap.yml index 2848b555185d..3b32129b761c 100644 --- a/.github/workflows/beam_PerformanceTests_Cdap.yml +++ b/.github/workflows/beam_PerformanceTests_Cdap.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml index 58b96f7e3526..071577ade0f1 100644 --- a/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_HadoopFormat.yml b/.github/workflows/beam_PerformanceTests_HadoopFormat.yml index 03c2732ce4ff..a30ef9aab510 100644 --- a/.github/workflows/beam_PerformanceTests_HadoopFormat.yml +++ b/.github/workflows/beam_PerformanceTests_HadoopFormat.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: 
./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_JDBC.yml b/.github/workflows/beam_PerformanceTests_JDBC.yml index 2305a779a09c..c65be423d156 100644 --- a/.github/workflows/beam_PerformanceTests_JDBC.yml +++ b/.github/workflows/beam_PerformanceTests_JDBC.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_Kafka_IO.yml b/.github/workflows/beam_PerformanceTests_Kafka_IO.yml index 2b620043c37b..39e49db09196 100644 --- a/.github/workflows/beam_PerformanceTests_Kafka_IO.yml +++ b/.github/workflows/beam_PerformanceTests_Kafka_IO.yml @@ -73,12 +73,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml index 63b70cb810e9..6329c8ce8f5e 100644 --- a/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml b/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml index 655c61ea373a..e2ce7ed94e5b 100644 --- a/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml +++ b/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml index d76dd3061ac8..929d214bd676 100644 --- a/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: 
./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml b/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml index f0f7ec1d373f..f842d31fba0c 100644 --- a/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml +++ b/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml @@ -72,12 +72,6 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml b/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml index 5872cc332417..ec3bc1a23fd9 100644 --- a/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml +++ b/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml index 103775b034ff..f6c0ddece2f6 100644 --- a/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml @@ -73,12 +73,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml index 5808ddad6572..362cbdaedd64 100644 --- a/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml b/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml index d4204ab09451..8abc8a3199dd 100644 --- a/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml +++ b/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml @@ -73,12 +73,6 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: default - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: 
./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml b/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml index bfbfc6c04119..353f86e082c5 100644 --- a/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml +++ b/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml @@ -74,12 +74,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml b/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml index b6662f3d6595..3925bd924714 100644 --- a/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml +++ b/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml @@ -76,25 +76,19 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: cluster_name: io-datastores k8s_namespace: ${{ matrix.job_name }}-${{ github.run_id }} remove_finalizer: memsqlclusters.memsql.com/sdb-cluster - - name: Install Singlestore operator + - name: Install SingleStore operator run: | kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-rbac.yaml kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-cluster-crd.yaml kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-operator.yaml kubectl wait --for=condition=Ready pod -l name=sdb-operator --timeout=120s - - name: Install Singlestore cluster + - name: Install SingleStore cluster id: install_singlestore run: | kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-cluster.yaml diff --git a/.github/workflows/beam_StressTests_Java_KafkaIO.yml b/.github/workflows/beam_StressTests_Java_KafkaIO.yml index e84c49f01478..9e4550338992 100644 --- a/.github/workflows/beam_StressTests_Java_KafkaIO.yml +++ b/.github/workflows/beam_StressTests_Java_KafkaIO.yml @@ -71,12 +71,6 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment uses: ./.github/actions/setup-environment-action - - name: Authenticate on GCP - id: auth - uses: google-github-actions/auth@v1 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: diff --git a/.github/workflows/build_runner_image.yml b/.github/workflows/build_runner_image.yml index 0492622f8847..0f17a9073daf 100644 --- a/.github/workflows/build_runner_image.yml +++ b/.github/workflows/build_runner_image.yml @@ -41,14 +41,6 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - - name: Authenticate on GCP - if: github.ref == 'refs/heads/master' - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - project_id: ${{ secrets.GCP_PROJECT_ID }} - 
export_default_credentials: true - name: GCloud Docker credential helper run: | gcloud auth configure-docker ${{env.docker_registry}} diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 5142b0b22c30..1275b38b9d23 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -45,7 +45,7 @@ jobs: check_env_variables: timeout-minutes: 5 name: "Check environment variables" - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] env: EVENT_NAME: ${{ github.event_name }} PY_VERSIONS_FULL: "cp38-* cp39-* cp310-* cp311-* cp312-*" @@ -59,8 +59,8 @@ jobs: run: "./scripts/ci/ci_check_are_gcp_variables_set.sh" id: check_gcp_variables env: - GCP_SA_EMAIL: ${{ secrets.GCP_SA_EMAIL }} - GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }} + GCP_SA_EMAIL: "not used by self hosted runner" + GCP_SA_KEY: "not used by self hosted runner" GCP_PYTHON_WHEELS_BUCKET: ${{ secrets.GCP_PYTHON_WHEELS_BUCKET }} GCP_PROJECT_ID: "not-needed-here" GCP_REGION: "not-needed-here" @@ -80,7 +80,7 @@ jobs: echo "py-versions-full=$PY_VERSIONS_FULL" >> $GITHUB_OUTPUT build_source: - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] name: Build python source distribution outputs: is_rc: ${{ steps.is_rc.outputs.is_rc }} @@ -190,14 +190,9 @@ jobs: needs: - build_source - check_env_variables - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] if: needs.check_env_variables.outputs.gcp-variables-set == 'true' && github.event_name != 'pull_request' steps: - - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - name: Remove existing files on GCS bucket run: gsutil rm -r ${{ env.GCP_PATH }} || true @@ -206,7 +201,7 @@ jobs: needs: - prepare_gcs - check_env_variables - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] if: needs.check_env_variables.outputs.gcp-variables-set == 'true' steps: - name: Download compressed sources from artifacts @@ -215,11 +210,6 @@ jobs: with: name: source_zip path: source/ - - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - name: Copy sources to GCS bucket run: gsutil cp -r -a public-read source/* ${{ env.GCP_PATH }} @@ -230,19 +220,20 @@ jobs: - build_source env: CIBW_ARCHS_LINUX: ${{matrix.arch}} - runs-on: ${{ matrix.os_python.os }} + runs-on: ${{ matrix.os_python.runner }} + timeout-minutes: 480 strategy: matrix: os_python: [ - {"os": "ubuntu-latest", "python": "${{ needs.check_env_variables.outputs.py-versions-full }}" }, + {"os": "ubuntu-20.04", "runner": [self-hosted, ubuntu-20.04, main], "python": "${{ needs.check_env_variables.outputs.py-versions-full }}" }, # Temporarily pin to macos-13 because macos-latest breaks this build # TODO(https://github.com/apache/beam/issues/31114) - {"os": "macos-13", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}" }, - {"os": "windows-latest", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}" }, + {"os": "macos-13", "runner": "macos-13", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}" }, + {"os": "windows-latest", "runner": "windows-latest", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}" }, ] arch: [auto] include: - - os_python: {"os": "ubuntu-latest", "python": "${{ 
needs.check_env_variables.outputs.py-versions-test }}" } + - os_python: {"os": "ubuntu-20.04", "runner": [self-hosted, ubuntu-20.04, main], "python": "${{ needs.check_env_variables.outputs.py-versions-test }}" } arch: aarch64 steps: - name: Download python source distribution from artifacts @@ -324,16 +315,16 @@ jobs: needs: - build_wheels - check_env_variables - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] if: needs.check_env_variables.outputs.gcp-variables-set == 'true' && github.event_name != 'pull_request' strategy: matrix: # Temporarily pin to macos-13 because macos-latest breaks this build # TODO(https://github.com/apache/beam/issues/31114) - os : [ubuntu-latest, macos-13, windows-latest] + os : [ubuntu-20.04, macos-13, windows-latest] arch: [auto] include: - - os: "ubuntu-latest" + - os: ubuntu-20.04 arch: aarch64 steps: - name: Download wheels from artifacts @@ -342,11 +333,6 @@ jobs: with: name: wheelhouse-${{ matrix.os }}${{ (matrix.arch == 'aarch64' && '-aarch64') || '' }} path: wheelhouse/ - - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - name: Copy wheels to GCS bucket run: gsutil cp -r -a public-read wheelhouse/* ${{ env.GCP_PATH }} - name: Create github action information file on GCS bucket @@ -375,14 +361,9 @@ jobs: needs: - upload_wheels_to_gcs - check_env_variables - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] if: needs.check_env_variables.outputs.gcp-variables-set == 'true' && github.event_name != 'pull_request' steps: - - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} - name: List file on Google Cloud Storage Bucket run: gsutil ls "${{ env.GCP_PATH }}*" @@ -393,7 +374,7 @@ jobs: needs: - build_source - build_wheels - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 60 if: github.repository_owner == 'apache' && github.event_name == 'schedule' steps: diff --git a/.github/workflows/java_tests.yml b/.github/workflows/java_tests.yml index 0e9d862f91a3..1d6441b24681 100644 --- a/.github/workflows/java_tests.yml +++ b/.github/workflows/java_tests.yml @@ -162,11 +162,10 @@ jobs: fail-fast: false matrix: os: [[self-hosted, ubuntu-20.04, main], windows-latest] + # TODO(https://github.com/apache/beam/issues/31848) run on Dataflow after fixes credential on macOS/win GHA runner if: | - needs.check_gcp_variables.outputs.gcp-variables-set == 'true' && ( - (github.event_name == 'push' || github.event_name == 'schedule') || + needs.check_gcp_variables.outputs.gcp-variables-set == 'true' && (github.event_name == 'workflow_dispatch' && github.event.inputs.runDataflow == 'true') - ) steps: - name: Check out code uses: actions/checkout@v4 @@ -179,12 +178,10 @@ jobs: java-version: 11 go-version: default - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 + uses: google-github-actions/auth@v1 with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} + credentials_json: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} - export_default_credentials: true - name: Run WordCount uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/playground_backend_precommit.yml 
b/.github/workflows/playground_backend_precommit.yml index 4c45547f4698..79517e705c27 100644 --- a/.github/workflows/playground_backend_precommit.yml +++ b/.github/workflows/playground_backend_precommit.yml @@ -60,7 +60,7 @@ jobs: sudo apt-get install sbt --yes sudo wget https://codeload.github.com/spotify/scio.g8/zip/7c1ba7c1651dfd70976028842e721da4107c0d6d -O scio.g8.zip && unzip scio.g8.zip && mv scio.g8-7c1ba7c1651dfd70976028842e721da4107c0d6d /opt/scio.g8 - name: Set up Cloud SDK and its components - uses: google-github-actions/setup-gcloud@v0 + uses: google-github-actions/setup-gcloud@v2 with: install_components: 'beta,cloud-datastore-emulator' version: '${{ env.DATASTORE_EMULATOR_VERSION }}' diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index efc5af84c4bc..a65b26645533 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -153,6 +153,8 @@ jobs: python_wordcount_dataflow: name: 'Python Wordcount Dataflow' + # TODO(https://github.com/apache/beam/issues/31848) run on Dataflow after fixes credential on macOS/win GHA runner + if: (github.event_name == 'workflow_dispatch' && github.event.inputs.runDataflow == 'true') needs: - build_python_sdk_source runs-on: ${{ matrix.os }} @@ -175,12 +177,11 @@ jobs: name: python_sdk_source path: apache-beam-source - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 + id: auth + uses: google-github-actions/auth@v1 with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} + credentials_json: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} - export_default_credentials: true - name: Install requirements working-directory: ./sdks/python run: pip install setuptools --upgrade && pip install -e ".[gcp]" diff --git a/.github/workflows/run_perf_alert_tool.yml b/.github/workflows/run_perf_alert_tool.yml index 1f623571acde..4bb5df41dcfb 100644 --- a/.github/workflows/run_perf_alert_tool.yml +++ b/.github/workflows/run_perf_alert_tool.yml @@ -40,12 +40,6 @@ jobs: uses: actions/setup-python@v5 with: python-version: 3.8 - - name: Authenticate on GCP - if: github.event_name != 'pull_request' - uses: google-github-actions/setup-gcloud@v0 - with: - service_account_key: ${{ secrets.GCP_SA_KEY }} - export_default_credentials: true - name: Install Apache Beam working-directory: ./sdks/python run: pip install -e .[gcp,test] diff --git a/.github/workflows/typescript_tests.yml b/.github/workflows/typescript_tests.yml index 0fdcfb070a22..1b45ea67b5c6 100644 --- a/.github/workflows/typescript_tests.yml +++ b/.github/workflows/typescript_tests.yml @@ -147,12 +147,10 @@ jobs: pip install 'pandas>=1.0,<1.5' pip install -e ".[gcp]" - name: Authenticate on GCP - uses: google-github-actions/setup-gcloud@v0 + uses: google-github-actions/auth@v1 with: - service_account_email: ${{ secrets.GCP_SA_EMAIL }} - service_account_key: ${{ secrets.GCP_SA_KEY }} + credentials_json: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} - export_default_credentials: true - run: npm ci working-directory: ./sdks/typescript - run: npm run build diff --git a/.test-infra/mock-apis/poetry.lock b/.test-infra/mock-apis/poetry.lock index b36baff7a74b..98985df7ea4a 100644 --- a/.test-infra/mock-apis/poetry.lock +++ b/.test-infra/mock-apis/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "beautifulsoup4" @@ -188,19 +188,18 @@ files = [ [[package]] name = "setuptools" -version = "68.2.2" +version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-68.2.2-py3-none-any.whl", hash = "sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a"}, - {file = "setuptools-68.2.2.tar.gz", hash = "sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87"}, + {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, + {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "soupsieve" diff --git a/.test-infra/tools/stale_bq_datasets_cleaner.sh b/.test-infra/tools/stale_bq_datasets_cleaner.sh index c4afabe11e9a..326000fdc754 100755 --- a/.test-infra/tools/stale_bq_datasets_cleaner.sh +++ b/.test-infra/tools/stale_bq_datasets_cleaner.sh @@ -18,7 +18,7 @@ # Deletes stale and old BQ datasets that are left after tests. 
# -set -exuo pipefail +set -euo pipefail PROJECT=apache-beam-testing MAX_RESULT=1500 @@ -51,7 +51,7 @@ for dataset in ${BQ_DATASETS[@]}; do # date command usage depending on OS echo "Deleted $dataset (modified `date -d @$LAST_MODIFIED`)" elif [[ $OSTYPE == "darwin"* ]]; then - echo "Deleted $dataset (modified `date -r @$LAST_MODIFIED`)" + echo "Deleted $dataset (modified `date -r $LAST_MODIFIED`)" fi else echo "Tried and failed to delete $dataset" diff --git a/CHANGES.md b/CHANGES.md index 96c436d89ecd..d082f03fd310 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -53,7 +53,7 @@ * ([#X](https://github.com/apache/beam/issues/X)). --> -# [2.58.0] - Unreleased +# [2.59.0] - Unreleased ## Highlights @@ -62,19 +62,18 @@ ## I/Os -* Support for [Solace](https://solace.com/) source (`SolaceIO.Read`) added (Java) ([#31440](https://github.com/apache/beam/issues/31440)). +* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Improvements to the performance of BigqueryIO when using withPropagateSuccessfulStorageApiWrites(true) method (Java) ([#31840](https://github.com/apache/beam/pull/31840)). ## New Features / Improvements -* Multiple RunInference instances can now share the same model instance by setting the model_identifier parameter (Python) ([#31665](https://github.com/apache/beam/issues/31665)). -* [IcebergIO] All specified catalog properties are passed through to the connector ([#31726](https://github.com/apache/beam/pull/31726)) -* Removed a 3rd party LGPL dependency from the Go SDK ([#31765](https://github.com/apache/beam/issues/31765)). -* Support for MapState and SetState when using Dataflow Runner v1 with Streaming Engine (Java) ([[#18200](https://github.com/apache/beam/issues/18200)]) +* X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Go SDK Minimum Go Version updated to 1.21 ([#32092](https://github.com/apache/beam/pull/32092)). +* Updated Go protobuf package to new version (Go) ([#21515](https://github.com/apache/beam/issues/21515)). ## Breaking Changes * X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). -* [IcebergIO] IcebergCatalogConfig was changed to support specifying catalog properties in a key-store fashion ([#31726](https://github.com/apache/beam/pull/31726)) ## Deprecations @@ -82,7 +81,7 @@ ## Bugfixes -* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Fixed incorrect service account impersonation flow for Python pipelines using BigQuery IOs ([#32030](https://github.com/apache/beam/issues/32030)). ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). @@ -91,6 +90,31 @@ * ([#X](https://github.com/apache/beam/issues/X)). +# [2.58.0] - 2024-08-06 + +## Highlights + +* Support for [Solace](https://solace.com/) source (`SolaceIO.Read`) added (Java) ([#31440](https://github.com/apache/beam/issues/31440)). + +## New Features / Improvements + +* Multiple RunInference instances can now share the same model instance by setting the model_identifier parameter (Python) ([#31665](https://github.com/apache/beam/issues/31665)). 
+* Added options to control the number of Storage API multiplexing connections ([#31721](https://github.com/apache/beam/pull/31721)) +* [BigQueryIO] Better handling for batch Storage Write API when it hits AppendRows throughput quota ([#31837](https://github.com/apache/beam/pull/31837)) +* [IcebergIO] All specified catalog properties are passed through to the connector ([#31726](https://github.com/apache/beam/pull/31726)) +* Removed a 3rd party LGPL dependency from the Go SDK ([#31765](https://github.com/apache/beam/issues/31765)). +* Support for MapState and SetState when using Dataflow Runner v1 with Streaming Engine (Java) ([[#18200](https://github.com/apache/beam/issues/18200)]) + +## Breaking Changes + +* [IcebergIO] IcebergCatalogConfig was changed to support specifying catalog properties in a key-store fashion ([#31726](https://github.com/apache/beam/pull/31726)) +* [SpannerIO] Added validation that query and table cannot be specified at the same time for SpannerIO.read(). Previously withQuery overrides withTable, if set ([#24956](https://github.com/apache/beam/issues/24956)). + +## Bugfixes + +* [BigQueryIO] Fixed a bug in batch Storage Write API that frequently exhausted concurrent connections quota ([#31710](https://github.com/apache/beam/pull/31710)) +* Fixed a logging issue where Python worker dependency installation logs sometimes were not emitted in a timely manner ([#31977](https://github.com/apache/beam/pull/31977)) + # [2.57.0] - 2024-06-26 ## Highlights @@ -487,6 +511,7 @@ as a workaround, a copy of "old" `CountingSource` class should be placed into a ## Known Issues * Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). +* Python pipelines using the `--impersonate_service_account` option with BigQuery IOs might fail on Dataflow ([#32030](https://github.com/apache/beam/issues/32030)). This is fixed in 2.59.0 release. # [2.48.0] - 2023-05-31 diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index bbd23a08bcd6..ee116423e4b0 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -758,6 +758,7 @@ class BeamModulePlugin implements Plugin { // [bomupgrader] the BOM version is set by scripts/tools/bomupgrader.py. 
If update manually, also update // libraries-bom version on sdks/java/container/license_scripts/dep_urls_java.yaml google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.39.0", + google_cloud_secret_manager : "com.google.cloud:google-cloud-secretmanager", // google_cloud_platform_libraries_bom sets version google_cloud_spanner : "com.google.cloud:google-cloud-spanner", // google_cloud_platform_libraries_bom sets version google_cloud_spanner_test : "com.google.cloud:google-cloud-spanner:$google_cloud_spanner_version:tests", google_cloud_vertexai : "com.google.cloud:google-cloud-vertexai", // google_cloud_platform_libraries_bom sets version @@ -858,6 +859,7 @@ class BeamModulePlugin implements Plugin { proto_google_cloud_firestore_v1 : "com.google.api.grpc:proto-google-cloud-firestore-v1", // google_cloud_platform_libraries_bom sets version proto_google_cloud_pubsub_v1 : "com.google.api.grpc:proto-google-cloud-pubsub-v1", // google_cloud_platform_libraries_bom sets version proto_google_cloud_pubsublite_v1 : "com.google.api.grpc:proto-google-cloud-pubsublite-v1", // google_cloud_platform_libraries_bom sets version + proto_google_cloud_secret_manager_v1 : "com.google.api.grpc:proto-google-cloud-secretmanager-v1", // google_cloud_platform_libraries_bom sets version proto_google_cloud_spanner_v1 : "com.google.api.grpc:proto-google-cloud-spanner-v1", // google_cloud_platform_libraries_bom sets version proto_google_cloud_spanner_admin_database_v1: "com.google.api.grpc:proto-google-cloud-spanner-admin-database-v1", // google_cloud_platform_libraries_bom sets version proto_google_common_protos : "com.google.api.grpc:proto-google-common-protos", // google_cloud_platform_libraries_bom sets version @@ -1976,8 +1978,8 @@ class BeamModulePlugin implements Plugin { def dependencyNode = dependenciesNode.appendNode('dependency') def appendClassifier = { dep -> dep.artifacts.each { art -> - if (art.hasProperty('archiveClassifier')) { - dependencyNode.appendNode('archiveClassifier', art.archiveClassifier) + if (art.hasProperty('classifier')) { + dependencyNode.appendNode('classifier', art.classifier) } } } diff --git a/contributor-docs/code-change-guide.md b/contributor-docs/code-change-guide.md index 935a2c6276c5..f0785d3509d0 100644 --- a/contributor-docs/code-change-guide.md +++ b/contributor-docs/code-change-guide.md @@ -286,14 +286,27 @@ Integration tests differ from standard pipelines in the following ways: * They have a default timeout of 15 minutes. * The pipeline options are set in the system property `beamTestPipelineOptions`. -To configure the test, you need to set the property `-DbeamTestPipelineOptions=[...]`. This property sets the runner that the test uses. - -The following example demonstrates how to run an integration test by using the command line. This example includes the options required to run the pipeline on the Dataflow runner. +To configure the test pipeline, you need to set the property `-DbeamTestPipelineOptions=[...]`. This property sets the pipeline options that the test uses, for example: ``` -DbeamTestPipelineOptions='["--runner=TestDataflowRunner","--project=mygcpproject","--region=us-central1","--stagingLocation=gs://mygcsbucket/path"]' ``` +For some projects, `beamTestPipelineOptions` is explicitly configured in `build.gradle`. +Check out the source of the corresponding build file to see how it is set.
For example, +`sdks/java/io/google-cloud-platform/build.gradle` sets `beamTestPipelineOptions` +from the project properties 'gcpProject', 'gcpTempRoot', etc.; when these are not assigned, +the test defaults to the `apache-beam-testing` GCP project. To run the test in your own project, +assign these project properties on the command line: + +``` +./gradlew :sdks:java:io:google-cloud-platform:integrationTest -PgcpProject= -PgcpTempRoot= +``` + +Some other projects (e.g. `sdks/java/io/jdbc`, `sdks/java/io/kafka`) do not +assemble (overwrite) `beamTestPipelineOptions` in `build.gradle`; for those, set +it explicitly with `-DbeamTestPipelineOptions='[...]'`, as described above. + #### Write integration tests To set up a `TestPipeline` object in an integration test, use the following code: @@ -423,6 +436,17 @@ If you're using Dataflow Runner v2 and `sdks/java/harness` or its dependencies ( --sdkContainerImage="us.gcr.io/apache-beam-testing/beam_java11_sdk:2.49.0-custom" ``` +#### Snapshot Version Containers + +By default, a Snapshot version for an SDK under development will use the containers published to the [apache-beam-testing project's container registry](https://us.gcr.io/apache-beam-testing/github-actions). For example, the most recent snapshot container for Java 17 can be found [here](https://us.gcr.io/apache-beam-testing/github-actions/beam_java17_sdk). + +When a version is entering the [release candidate stage](https://github.com/apache/beam/blob/master/contributor-docs/release-guide.md), one final SNAPSHOT version will be published. +This SNAPSHOT version will use the final containers published on [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam). + +**NOTE:** During the release process, there may be some downtime where a container is not available for use for a SNAPSHOT version. To avoid this, it is recommended to either switch to the latest SNAPSHOT version available or to use [custom containers](https://beam.apache.org/documentation/runtime/environments/#custom-containers). You should also only rely on snapshot versions for important workloads if absolutely necessary. + +Certain runners may override this snapshot behavior; for example, the Dataflow runner overrides all SNAPSHOT containers into a [single registry](https://console.cloud.google.com/gcr/images/cloud-dataflow/GLOBAL/v1beta3). The same downtime will still be incurred, however, when switching to the final container. + ## Python guide The Beam Python SDK is distributed as a single wheel, which is more straightforward than the Java SDK. diff --git a/contributor-docs/release-guide.md b/contributor-docs/release-guide.md index c0e8e7c67ce7..b3d3c77d25df 100644 --- a/contributor-docs/release-guide.md +++ b/contributor-docs/release-guide.md @@ -1006,18 +1006,18 @@ write to BigQuery, and create a cluster of machines for running containers (for In comment area, type in `Run Python ReleaseCandidate` to trigger validation.
* **Python Leaderboard & GameStats** - * **Get staging RC** `wget https://dist.apache.org/repos/dist/dev/beam/2.5.0/* ` + * **Get staging RC** `wget https://dist.apache.org/repos/dist/dev/beam/2.XX.0/* ` * **Verify the hashes** ``` - sha512sum -c apache-beam-2.5.0-python.tar.gz.sha512 - sha512sum -c apache-beam-2.5.0-source-release.tar.gz.sha512 + sha512sum -c apache_beam-2.XX.0-python.tar.gz.sha512 + sha512sum -c apache_beam-2.XX.0-source-release.tar.gz.sha512 ``` * **Build SDK** ``` sudo apt-get install unzip - unzip apache-beam-2.5.0-source-release.tar.gz + unzip apache_beam-2.XX.0-source-release.tar.gz python setup.py sdist ``` * **Setup virtual environment** @@ -1030,8 +1030,8 @@ write to BigQuery, and create a cluster of machines for running containers (for * **Install SDK** ``` - pip install dist/apache-beam-2.5.0.tar.gz - pip install dist/apache-beam-2.5.0.tar.gz[gcp] + pip install dist/apache_beam-2.XX.0.tar.gz + pip install dist/apache_beam-2.XX.0.tar.gz[gcp] ``` * **Setup GCP** diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java b/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java index 2d9089fcd29a..296d7e7d2409 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java @@ -17,6 +17,7 @@ */ package org.apache.beam.examples.complete.kafkatopubsub.transforms; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import org.apache.beam.examples.complete.kafkatopubsub.avro.AvroDataClass; @@ -37,7 +38,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PDone; import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.kafka.common.serialization.StringDeserializer; @@ -120,7 +120,8 @@ public PDone expand(PCollection input) { MapElements.into(TypeDescriptor.of(PubsubMessage.class)) .via( (String json) -> - new PubsubMessage(json.getBytes(Charsets.UTF_8), ImmutableMap.of()))) + new PubsubMessage( + json.getBytes(StandardCharsets.UTF_8), ImmutableMap.of()))) .apply( "writePubsubMessagesToPubSub", PubsubIO.writeMessages().to(options.getOutputTopic())); } diff --git a/examples/notebooks/beam-ml/bigtable_enrichment_transform.ipynb b/examples/notebooks/beam-ml/bigtable_enrichment_transform.ipynb index 7510831bac40..95be8b1d957c 100644 --- a/examples/notebooks/beam-ml/bigtable_enrichment_transform.ipynb +++ b/examples/notebooks/beam-ml/bigtable_enrichment_transform.ipynb @@ -139,41 +139,33 @@ }, "source": [ "### Authenticate with Google Cloud\n", - "This notebook reads data from Pub/Sub and Bigtable. To use your Google Cloud account, authenticate this notebook." + "This notebook reads data from Pub/Sub and Bigtable. To use your Google Cloud account, authenticate this notebook.\n", + "To prepare for this step, replace `<PROJECT_ID>`, `<INSTANCE_ID>`, and `<TABLE_ID>` with the appropriate values for your setup. These fields are used with Bigtable."
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "Kz9sccyGBqz3" + "id": "wEXucyi2liij" }, "outputs": [], "source": [ - "from google.colab import auth\n", - "auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nAmGgUMt48o9" - }, - "source": [ - "Replace ``, ``, and `` with the appropriate values for your setup. These fields are used with Bigtable." + "PROJECT_ID = \"\"\n", + "INSTANCE_ID = \"\"\n", + "TABLE_ID = \"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "wEXucyi2liij" + "id": "Kz9sccyGBqz3" }, "outputs": [], "source": [ - "PROJECT_ID = \"\"\n", - "INSTANCE_ID = \"\"\n", - "TABLE_ID = \"\"" + "from google.colab import auth\n", + "auth.authenticate_user(project_id=PROJECT_ID)" ] }, { @@ -879,4 +871,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/release/src/main/scripts/run_rc_validation.sh b/release/src/main/scripts/run_rc_validation.sh index 0f2bfe4aaec2..91bfa9e2f8bb 100755 --- a/release/src/main/scripts/run_rc_validation.sh +++ b/release/src/main/scripts/run_rc_validation.sh @@ -300,14 +300,14 @@ if [[ ("$python_leaderboard_direct" = true \ cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 - if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache_beam-${RELEASE_VER}.tar.gz ]]; then { echo "Fail to download Python Staging RC files." 
;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 + sha512sum -c apache_beam-${RELEASE_VER}.tar.gz.sha512 echo "--------------------------Updating ~/.m2/settings.xml-------------------------" cd ~ @@ -378,7 +378,7 @@ if [[ ("$python_leaderboard_direct" = true \ pip install --upgrade pip setuptools wheel echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.tar.gz[gcp] + pip install apache_beam-${RELEASE_VER}.tar.gz[gcp] echo "----------------Starting Leaderboard with DirectRunner-----------------------" if [[ "$python_leaderboard_direct" = true ]]; then @@ -434,7 +434,7 @@ if [[ ("$python_leaderboard_direct" = true \ --dataset ${LEADERBOARD_DF_DATASET} \ --runner DataflowRunner \ --temp_location=${USER_GCS_BUCKET}/temp/ \ - --sdk_location apache-beam-${RELEASE_VER}.tar.gz; \ + --sdk_location apache_beam-${RELEASE_VER}.tar.gz; \ exec bash" echo "***************************************************************" @@ -509,7 +509,7 @@ if [[ ("$python_leaderboard_direct" = true \ --dataset ${GAMESTATS_DF_DATASET} \ --runner DataflowRunner \ --temp_location=${USER_GCS_BUCKET}/temp/ \ - --sdk_location apache-beam-${RELEASE_VER}.tar.gz \ + --sdk_location apache_beam-${RELEASE_VER}.tar.gz \ --fixed_window_duration ${FIXED_WINDOW_DURATION}; exec bash" echo "***************************************************************" @@ -566,14 +566,14 @@ if [[ ("$python_xlang_quickstart" = true) \ cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 - if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache_beam-${RELEASE_VER}.tar.gz ]]; then { echo "Failed to download Python Staging RC files." ;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 + sha512sum -c apache_beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools @@ -593,7 +593,7 @@ if [[ ("$python_xlang_quickstart" = true) \ ln -s ${LOCAL_BEAM_DIR}/sdks beam_env_${py_version}/lib/sdks echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.tar.gz + pip install apache_beam-${RELEASE_VER}.tar.gz echo '************************************************************'; echo '* Running Python Multi-language Quickstart with DirectRunner'; @@ -672,14 +672,14 @@ if [[ ("$java_xlang_quickstart" = true) \ cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 - if [[ ! 
-f apache-beam-${RELEASE_VER}.tar.gz ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache_beam-${RELEASE_VER}.tar.gz ]]; then { echo "Failed to download Python Staging RC files." ;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 + sha512sum -c apache_beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools @@ -699,7 +699,7 @@ if [[ ("$java_xlang_quickstart" = true) \ ln -s ${LOCAL_BEAM_DIR}/sdks beam_env_${py_version}/lib/sdks echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.tar.gz[dataframe] + pip install apache_beam-${RELEASE_VER}.tar.gz[dataframe] # Deacrivating in the main shell. We will reactivate the virtual environment new shells # for the expansion service and the job server. @@ -768,14 +768,14 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 - if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache_beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache_beam-${RELEASE_VER}.tar.gz ]]; then { echo "Fail to download Python Staging RC files." 
;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 + sha512sum -c apache_beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools @@ -807,7 +807,7 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true ln -s ${LOCAL_BEAM_DIR}/sdks beam_env_${py_version}/lib/sdks echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.tar.gz[gcp] + pip install apache_beam-${RELEASE_VER}.tar.gz[gcp] echo "----------------Starting XLang Kafka Taxi with DataflowRunner---------------------" if [[ "$python_xlang_kafka_taxi_dataflow" = true ]]; then @@ -837,7 +837,7 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true --temp_location=${USER_GCS_BUCKET}/temp/ \ --with_metadata \ --beam_services=\"{\\\"sdks:java:io:expansion-service:shadowJar\\\": \\\"${KAFKA_EXPANSION_SERVICE_JAR}\\\"}\" \ - --sdk_location apache-beam-${RELEASE_VER}.tar.gz; \ + --sdk_location apache_beam-${RELEASE_VER}.tar.gz; \ exec bash" echo "***************************************************************" @@ -882,7 +882,7 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true --temp_location=${USER_GCS_BUCKET}/temp/ \ --output_topic projects/${USER_GCP_PROJECT}/topics/${SQL_TAXI_TOPIC} \ --beam_services=\"{\\\":sdks:java:extensions:sql:expansion-service:shadowJar\\\": \\\"${SQL_EXPANSION_SERVICE_JAR}\\\"}\" \ - --sdk_location apache-beam-${RELEASE_VER}.tar.gz; \ + --sdk_location apache_beam-${RELEASE_VER}.tar.gz; \ exec bash" echo "***************************************************************" diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java index 93dfb8e3ebc8..466d4ad46eb6 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java @@ -19,7 +19,6 @@ import com.google.auto.value.AutoValue; import java.io.Serializable; -import java.util.HashSet; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -50,12 +49,16 @@ public static StringSetData empty() { * Combines this {@link StringSetData} with other, both original StringSetData are left intact. 
*/ public StringSetData combine(StringSetData other) { - // do not merge other on this as this StringSetData might hold an immutable set like in case - // of EmptyStringSetData - Set combined = new HashSet<>(); - combined.addAll(this.stringSet()); - combined.addAll(other.stringSet()); - return StringSetData.create(combined); + if (this.stringSet().isEmpty()) { + return other; + } else if (other.stringSet().isEmpty()) { + return this; + } else { + ImmutableSet.Builder combined = ImmutableSet.builder(); + combined.addAll(this.stringSet()); + combined.addAll(other.stringSet()); + return StringSetData.create(combined.build()); + } } /** diff --git a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ExecutorServiceParallelExecutor.java b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ExecutorServiceParallelExecutor.java index 59dc736693d0..95cadef7afdb 100644 --- a/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ExecutorServiceParallelExecutor.java +++ b/runners/direct-java/src/main/java/org/apache/beam/runners/direct/ExecutorServiceParallelExecutor.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.direct; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.io.Closeable; import java.util.ArrayList; import java.util.Collection; import java.util.Map; @@ -30,11 +31,13 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; +import org.apache.beam.runners.core.metrics.MetricsContainerImpl; import org.apache.beam.runners.local.ExecutionDriver; import org.apache.beam.runners.local.ExecutionDriver.DriverState; import org.apache.beam.runners.local.PipelineMessageReceiver; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult.State; +import org.apache.beam.sdk.metrics.MetricsEnvironment; import org.apache.beam.sdk.runners.AppliedPTransform; import org.apache.beam.sdk.util.UserCodeException; import org.apache.beam.sdk.values.PCollection; @@ -155,13 +158,21 @@ public void start(DirectGraph graph, RootProviderRegistry rootProviderRegistry) ImmutableMap.Builder, Queue>> pendingRootBundles = ImmutableMap.builder(); for (AppliedPTransform root : graph.getRootTransforms()) { + MetricsContainerImpl metricsContainer = new MetricsContainerImpl(root.getFullName()); Queue> pending = Queues.newArrayDeque(); - try { + try (Closeable metricsScope = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) { Collection> initialInputs = rootProviderRegistry.getInitialInputs(root, numTargetSplits); pending.addAll(initialInputs); } catch (Exception e) { throw UserCodeException.wrap(e); + } finally { + // Metrics emitted initial split are reported along with the first bundle + if (pending.peek() != null) { + evaluationContext + .getMetrics() + .commitPhysical(pending.peek(), metricsContainer.getCumulative()); + } } pendingRootBundles.put(root, pending); } diff --git a/runners/flink/flink_runner.gradle b/runners/flink/flink_runner.gradle index da3dbe08b503..c8f492a901d3 100644 --- a/runners/flink/flink_runner.gradle +++ b/runners/flink/flink_runner.gradle @@ -335,6 +335,8 @@ def createValidatesRunnerTask(Map m) { // Extremely flaky: https://github.com/apache/beam/issues/19814 excludeTestsMatching 'org.apache.beam.sdk.transforms.ParDoLifecycleTest.testTeardownCalledAfterExceptionInProcessElementStateful' excludeTestsMatching 'org.apache.beam.sdk.transforms.ParDoLifecycleTest.testTeardownCalledAfterExceptionInStartBundleStateful' + // 
TODO(https://github.com/apache/beam/issues/29972) due to runtimeContext initialized after initial split + excludeTestsMatching 'org.apache.beam.sdk.metrics.MetricsTest$AttemptedMetricTests.testBoundedSourceMetricsInSplit' } } } diff --git a/runners/flink/job-server/flink_job_server.gradle b/runners/flink/job-server/flink_job_server.gradle index 9b565f119a62..56a58df4fb09 100644 --- a/runners/flink/job-server/flink_job_server.gradle +++ b/runners/flink/job-server/flink_job_server.gradle @@ -171,7 +171,6 @@ def portableValidatesRunnerTask(String name, boolean streaming, boolean checkpoi excludeCategories 'org.apache.beam.sdk.testing.UsesCustomWindowMerging' excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' - excludeCategories 'org.apache.beam.sdk.testing.UsesStringSetMetrics' excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' excludeCategories 'org.apache.beam.sdk.testing.UsesMultimapState' diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java index 8f21e42d61e6..871d7a5a3989 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java @@ -17,8 +17,8 @@ */ package org.apache.beam.runners.flink.translation.wrappers.streaming.io; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.util.WindowedValue; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,7 +60,8 @@ public void run(SourceContext> ctx) { while (running && (messageCount == 0 || count < subtaskCount)) { synchronized (ctx.getCheckpointLock()) { ctx.collect( - WindowedValue.valueInGlobalWindow(String.valueOf(count).getBytes(Charsets.UTF_8))); + WindowedValue.valueInGlobalWindow( + String.valueOf(count).getBytes(StandardCharsets.UTF_8))); count++; } diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java index 4a628eeb4fdf..22516cbc9633 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java @@ -25,7 +25,7 @@ import java.io.ByteArrayOutputStream; import java.io.PrintStream; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; /** Tests for {@link FlinkJobServerDriver}. 
*/ @@ -104,7 +104,7 @@ public void testJobServerDriver() throws Exception { boolean success = false; while (!success) { newErr.flush(); - String output = baos.toString(Charsets.UTF_8.name()); + String output = baos.toString(StandardCharsets.UTF_8.name()); if (output.contains("JobService started on localhost:") && output.contains("ArtifactStagingService started on localhost:") && output.contains("ExpansionService started on localhost:")) { @@ -114,7 +114,8 @@ public void testJobServerDriver() throws Exception { } } assertThat(driver.getJobServerUrl(), is(not(nullValue()))); - assertThat(baos.toString(Charsets.UTF_8.name()), containsString(driver.getJobServerUrl())); + assertThat( + baos.toString(StandardCharsets.UTF_8.name()), containsString(driver.getJobServerUrl())); assertThat(driverThread.isAlive(), is(true)); } catch (Throwable t) { // restore to print exception @@ -149,7 +150,7 @@ public void testJobServerDriverWithoutExpansionService() throws Exception { boolean success = false; while (!success) { newErr.flush(); - String output = baos.toString(Charsets.UTF_8.name()); + String output = baos.toString(StandardCharsets.UTF_8.name()); if (output.contains("JobService started on localhost:") && output.contains("ArtifactStagingService started on localhost:")) { success = true; @@ -161,7 +162,8 @@ public void testJobServerDriverWithoutExpansionService() throws Exception { } } assertThat(driver.getJobServerUrl(), is(not(nullValue()))); - assertThat(baos.toString(Charsets.UTF_8.name()), containsString(driver.getJobServerUrl())); + assertThat( + baos.toString(StandardCharsets.UTF_8.name()), containsString(driver.getJobServerUrl())); assertThat(driverThread.isAlive(), is(true)); } catch (Throwable t) { // restore to print exception diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java index 9d898ed53a89..3b92c282c38a 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java @@ -38,6 +38,7 @@ import java.lang.reflect.Method; import java.net.MalformedURLException; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -56,7 +57,6 @@ import org.apache.beam.sdk.util.construction.PTransformMatchers; import org.apache.beam.sdk.util.construction.PTransformTranslation; import org.apache.beam.sdk.util.construction.resources.PipelineResources; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.RemoteEnvironment; @@ -373,7 +373,7 @@ public void processElement(ProcessContext ctx) { } replacementStdErr.flush(); assertThat( - new String(byteArrayOutputStream.toByteArray(), Charsets.UTF_8), + new String(byteArrayOutputStream.toByteArray(), StandardCharsets.UTF_8), containsString( "UnboundedSources present which rely on checkpointing, but checkpointing is disabled.")); } diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java index 22a9ce4f39ab..cf860717def3 100644 --- 
a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java @@ -20,6 +20,7 @@ import java.io.File; import java.lang.reflect.Field; import java.lang.reflect.Modifier; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.security.Permission; import java.util.Collection; @@ -30,7 +31,6 @@ import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.util.construction.resources.PipelineResources; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -200,7 +200,7 @@ private static void prepareEnvironment() throws Exception { RestOptions.PORT.key(), flinkCluster.getRestPort()); - Files.write(file.toPath(), config.getBytes(Charsets.UTF_8)); + Files.write(file.toPath(), config.getBytes(StandardCharsets.UTF_8)); // Create a new environment with the location of the Flink config for CliFrontend ImmutableMap newEnv = diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java index cf5b2b555124..2eb0545b7794 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java @@ -103,7 +103,6 @@ import org.apache.beam.sdk.values.WindowingStrategy; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.Struct; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -818,7 +817,7 @@ private void testEnsureDeferredStateCleanupTimerFiring(boolean withCheckpointing BagState state = // State from the SDK Harness is stored as ByteStrings operator.keyedStateInternals.state( stateNamespace, StateTags.bag(stateId, ByteStringCoder.of())); - state.add(ByteString.copyFrom("userstate".getBytes(Charsets.UTF_8))); + state.add(ByteString.copyFrom("userstate".getBytes(StandardCharsets.UTF_8))); assertThat(testHarness.numKeyedStateEntries(), is(1)); // user timer that fires after the end of the window and after state cleanup @@ -966,7 +965,7 @@ public void testEnsureStateCleanupOnFinalWatermark() throws Exception { BagState state = // State from the SDK Harness is stored as ByteStrings operator.keyedStateInternals.state( stateNamespace, StateTags.bag(stateId, ByteStringCoder.of())); - state.add(ByteString.copyFrom("userstate".getBytes(Charsets.UTF_8))); + state.add(ByteString.copyFrom("userstate".getBytes(StandardCharsets.UTF_8))); // No timers have been set for cleanup assertThat(testHarness.numEventTimeTimers(), is(0)); // State has been created 
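Aside (illustrative, not part of the patch): the Flink runner hunks above and below all apply the same mechanical migration, replacing the vendored Guava `Charsets` constants with the JDK's `java.nio.charset.StandardCharsets`. A minimal sketch of the pattern, assuming only the JDK; the class name below is hypothetical and the encoded bytes are expected to be identical before and after the swap.

import java.nio.charset.StandardCharsets;

public class StandardCharsetsMigrationSketch {
  public static void main(String[] args) {
    // Before (vendored Guava): "userstate".getBytes(Charsets.UTF_8)
    // After (JDK):             "userstate".getBytes(StandardCharsets.UTF_8)
    // Both resolve to the UTF-8 charset, so the resulting byte array is unchanged
    // and the vendored Guava dependency is no longer needed for this call.
    byte[] encoded = "userstate".getBytes(StandardCharsets.UTF_8);
    System.out.println(encoded.length); // prints 9
  }
}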
@@ -988,8 +987,8 @@ public void testCacheTokenHandling() throws Exception { new ExecutableStageDoFnOperator.BagUserStateFactory<>( test, stateBackend, NoopLock.get(), null); - ByteString key1 = ByteString.copyFrom("key1", Charsets.UTF_8); - ByteString key2 = ByteString.copyFrom("key2", Charsets.UTF_8); + ByteString key1 = ByteString.copyFrom("key1", StandardCharsets.UTF_8); + ByteString key2 = ByteString.copyFrom("key2", StandardCharsets.UTF_8); Map> userStateMapMock = Mockito.mock(Map.class); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java index cab45632ac55..cdf461b5fde8 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java @@ -22,12 +22,12 @@ import static org.hamcrest.core.Is.is; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.coders.VoidCoder; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.junit.Test; /** Tests for {@link FlinkKeyUtils}. */ @@ -66,7 +66,7 @@ public void testCoderContext() throws Exception { @Test @SuppressWarnings("ByteBufferBackingArray") public void testFromEncodedKey() { - ByteString input = ByteString.copyFrom("hello world".getBytes(Charsets.UTF_8)); + ByteString input = ByteString.copyFrom("hello world".getBytes(StandardCharsets.UTF_8)); ByteBuffer encodedKey = FlinkKeyUtils.fromEncodedKey(input); assertThat(encodedKey.array(), is(input.toByteArray())); } diff --git a/runners/google-cloud-dataflow-java/arm/build.gradle b/runners/google-cloud-dataflow-java/arm/build.gradle index 4771ee5efe82..ae63cdf8bdb7 100644 --- a/runners/google-cloud-dataflow-java/arm/build.gradle +++ b/runners/google-cloud-dataflow-java/arm/build.gradle @@ -84,19 +84,19 @@ def javaVer = "java8" if (project.hasProperty('testJavaVersion')) { javaVer = "java${project.getProperty('testJavaVersion')}" } -def dataflowProject = project.findProperty('dataflowProject') ?: 'apache-beam-testing' -def dataflowRegion = project.findProperty('dataflowRegion') ?: 'us-central1' -def dataflowValidatesTempRoot = project.findProperty('dataflowTempRoot') ?: 'gs://temp-storage-for-validates-runner-tests' +def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' +def gcpRegion = project.findProperty('gcpRegion') ?: 'us-central1' +def dataflowValidatesTempRoot = project.findProperty('gcpTempRoot') ?: 'gs://temp-storage-for-validates-runner-tests' def firestoreDb = project.findProperty('firestoreDb') ?: 'firestoredb' -def dockerImageRoot = project.findProperty('docker-repository-root') ?: "us.gcr.io/${dataflowProject}/java-postcommit-it" +def dockerImageRoot = project.findProperty('docker-repository-root') ?: "us.gcr.io/${gcpProject}/java-postcommit-it" def DockerJavaMultiarchImageContainer = "${dockerImageRoot}/${project.docker_image_default_repo_prefix}${javaVer}_sdk" def dockerTag = project.findProperty('docker-tag') ?: new Date().format('yyyyMMddHHmmss') ext.DockerJavaMultiarchImageName = 
"${DockerJavaMultiarchImageContainer}:${dockerTag}" as String def runnerV2PipelineOptionsARM = [ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowValidatesTempRoot}", "--sdkContainerImage=${project.ext.DockerJavaMultiarchImageName}", "--experiments=use_unified_worker,use_runner_v2", diff --git a/runners/google-cloud-dataflow-java/build.gradle b/runners/google-cloud-dataflow-java/build.gradle index c9fa85e41655..5d898bb57d86 100644 --- a/runners/google-cloud-dataflow-java/build.gradle +++ b/runners/google-cloud-dataflow-java/build.gradle @@ -51,8 +51,8 @@ evaluationDependsOn(":sdks:java:container:java11") ext.dataflowLegacyEnvironmentMajorVersion = '8' ext.dataflowFnapiEnvironmentMajorVersion = '8' -ext.dataflowLegacyContainerVersion = 'beam-master-20240306' -ext.dataflowFnapiContainerVersion = 'beam-master-20240306' +ext.dataflowLegacyContainerVersion = 'beam-master-20240718' +ext.dataflowFnapiContainerVersion = 'beam-master-20240718' ext.dataflowContainerBaseRepository = 'gcr.io/cloud-dataflow/v1beta3' processResources { @@ -138,18 +138,18 @@ dependencies { googleCloudPlatformIntegrationTest project(path: ":sdks:java:io:google-cloud-platform", configuration: "testRuntimeMigration") } -def dataflowProject = project.findProperty('dataflowProject') ?: 'apache-beam-testing' -def dataflowRegion = project.findProperty('dataflowRegion') ?: 'us-central1' -def dataflowValidatesTempRoot = project.findProperty('dataflowTempRoot') ?: 'gs://temp-storage-for-validates-runner-tests' -def dataflowPostCommitTempRoot = project.findProperty('dataflowTempRoot') ?: 'gs://temp-storage-for-end-to-end-tests' -def dataflowPostCommitTempRootKms = project.findProperty('dataflowTempRootKms') ?: 'gs://temp-storage-for-end-to-end-tests-cmek' -def dataflowUploadTemp = project.findProperty('dataflowTempRoot') ?: 'gs://temp-storage-for-upload-tests' +def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' +def gcpRegion = project.findProperty('gcpRegion') ?: 'us-central1' +def dataflowValidatesTempRoot = project.findProperty('gcpTempRoot') ?: 'gs://temp-storage-for-validates-runner-tests' +def dataflowPostCommitTempRoot = project.findProperty('gcpTempRoot') ?: 'gs://temp-storage-for-end-to-end-tests' +def dataflowPostCommitTempRootKms = project.findProperty('gcpTempRootKms') ?: 'gs://temp-storage-for-end-to-end-tests-cmek' +def dataflowUploadTemp = project.findProperty('gcpTempRoot') ?: 'gs://temp-storage-for-upload-tests' def testFilesToStage = project.findProperty('filesToStage') ?: 'test.txt' def dataflowLegacyWorkerJar = project.findProperty('dataflowWorkerJar') ?: project(":runners:google-cloud-dataflow-java:worker").shadowJar.archivePath def dataflowKmsKey = project.findProperty('dataflowKmsKey') ?: "projects/apache-beam-testing/locations/global/keyRings/beam-it/cryptoKeys/test" def firestoreDb = project.findProperty('firestoreDb') ?: 'firestoredb' -def dockerImageRoot = project.findProperty('dockerImageRoot') ?: "us.gcr.io/${dataflowProject}/java-postcommit-it" +def dockerImageRoot = project.findProperty('dockerImageRoot') ?: "us.gcr.io/${gcpProject.replaceAll(':', '/')}/java-postcommit-it" def dockerJavaImageContainer = "${dockerImageRoot}/java" def dockerPythonImageContainer = "${dockerImageRoot}/python" def dockerTag = new Date().format('yyyyMMddHHmmss') @@ -158,8 +158,8 @@ ext.dockerPythonImageName = "${dockerPythonImageContainer}:${dockerTag}" def 
legacyPipelineOptions = [ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowValidatesTempRoot}", "--dataflowWorkerJar=${dataflowLegacyWorkerJar}", "--workerHarnessContainerImage=", @@ -167,8 +167,8 @@ def legacyPipelineOptions = [ def runnerV2PipelineOptions = [ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowValidatesTempRoot}", "--sdkContainerImage=${dockerJavaImageContainer}:${dockerTag}", "--experiments=use_unified_worker,use_runner_v2", @@ -183,7 +183,6 @@ def commonLegacyExcludeCategories = [ 'org.apache.beam.sdk.testing.UsesExternalService', 'org.apache.beam.sdk.testing.UsesDistributionMetrics', 'org.apache.beam.sdk.testing.UsesGaugeMetrics', - 'org.apache.beam.sdk.testing.UsesStringSetMetrics', 'org.apache.beam.sdk.testing.UsesMultimapState', 'org.apache.beam.sdk.testing.UsesTestStream', 'org.apache.beam.sdk.testing.UsesParDoLifecycle', @@ -435,14 +434,14 @@ createCrossLanguageValidatesRunnerTask( semiPersistDir: "/var/opt/google", pythonPipelineOptions: [ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--sdk_harness_container_image_overrides=.*java.*,${dockerJavaImageContainer}:${dockerTag}", ], javaPipelineOptions: [ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowValidatesTempRoot}", "--sdkContainerImage=${dockerJavaImageContainer}:${dockerTag}", "--sdkHarnessContainerImageOverrides=.*python.*,${dockerPythonImageContainer}:${dockerTag}", @@ -455,9 +454,9 @@ createCrossLanguageValidatesRunnerTask( ], goScriptOptions: [ "--runner dataflow", - "--project ${dataflowProject}", - "--dataflow_project ${dataflowProject}", - "--region ${dataflowRegion}", + "--project ${gcpProject}", + "--dataflow_project ${gcpProject}", + "--region ${gcpRegion}", "--tests \"./test/integration/xlang ./test/integration/io/xlang/...\"", "--sdk_overrides \".*java.*,${dockerJavaImageContainer}:${dockerTag}\"", ], @@ -554,8 +553,8 @@ task googleCloudPlatformLegacyWorkerIntegrationTest(type: Test, dependsOn: copyG dependsOn ":runners:google-cloud-dataflow-java:worker:shadowJar" systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowPostCommitTempRoot}", "--dataflowWorkerJar=${dataflowLegacyWorkerJar}", "--workerHarnessContainerImage=", @@ -582,8 +581,8 @@ task googleCloudPlatformLegacyWorkerKmsIntegrationTest(type: Test) { dependsOn ":runners:google-cloud-dataflow-java:worker:shadowJar" systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowPostCommitTempRootKms}", "--dataflowWorkerJar=${dataflowLegacyWorkerJar}", "--workerHarnessContainerImage=", @@ -668,8 +667,8 @@ task coreSDKJavaLegacyWorkerIntegrationTest(type: Test) { systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - 
"--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--tempRoot=${dataflowPostCommitTempRoot}", "--dataflowWorkerJar=${dataflowLegacyWorkerJar}", "--workerHarnessContainerImage=", @@ -714,8 +713,6 @@ task postCommitRunnerV2 { dependsOn coreSDKJavaRunnerV2IntegrationTest } -def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' -def gcpRegion = project.findProperty('gcpRegion') ?: 'us-central1' def gcsBucket = project.findProperty('gcsBucket') ?: 'temp-storage-for-release-validation-tests/nightly-snapshot-validation' def bqDataset = project.findProperty('bqDataset') ?: 'beam_postrelease_mobile_gaming' def pubsubTopic = project.findProperty('pubsubTopic') ?: 'java_mobile_gaming_topic' @@ -745,7 +742,7 @@ createJavaExamplesArchetypeValidationTask(type: 'MobileGaming', bqDataset: bqDataset, pubsubTopic: pubsubTopic) -// Standalone task for testing GCS upload, use with -PfilesToStage and -PdataflowTempRoot. +// Standalone task for testing GCS upload, use with -PfilesToStage and -PgcpTempRoot. task GCSUpload(type: JavaExec) { mainClass = 'org.apache.beam.runners.dataflow.util.GCSUploadMain' classpath = sourceSets.test.runtimeClasspath diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowMetrics.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowMetrics.java index 1fad140717f6..46fdce507c3d 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowMetrics.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowMetrics.java @@ -24,11 +24,11 @@ import com.google.api.services.dataflow.model.JobMetrics; import com.google.api.services.dataflow.model.MetricUpdate; import java.io.IOException; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Set; import org.apache.beam.model.pipeline.v1.RunnerApi; import org.apache.beam.sdk.metrics.DistributionResult; import org.apache.beam.sdk.metrics.GaugeResult; @@ -191,7 +191,7 @@ private StringSetResult getStringSetValue(MetricUpdate metricUpdate) { if (metricUpdate.getSet() == null) { return StringSetResult.empty(); } - return StringSetResult.create(ImmutableSet.copyOf(((Set) metricUpdate.getSet()))); + return StringSetResult.create(ImmutableSet.copyOf(((Collection) metricUpdate.getSet()))); } private DistributionResult getDistributionValue(MetricUpdate metricUpdate) { diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java index f905e136e83b..1fedcd8f3a29 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java @@ -42,6 +42,7 @@ import com.google.api.services.dataflow.model.Job; import com.google.api.services.dataflow.model.Step; import com.google.api.services.dataflow.model.WorkerPool; +import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; @@ -110,7 +111,6 @@ import org.apache.beam.sdk.values.WindowingStrategy; import 
org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.net.PercentCodec; @@ -618,7 +618,7 @@ static class StepTranslator implements StepTranslationContext { // For compatibility with URL encoding implementations that represent space as +, // always encode + as %2b even though we don't encode space as +. private final PercentCodec percentCodec = - new PercentCodec("+".getBytes(Charsets.US_ASCII), false); + new PercentCodec("+".getBytes(StandardCharsets.US_ASCII), false); private StepTranslator(Translator translator, Step step) { this.translator = translator; @@ -764,7 +764,8 @@ private void addResourceHints(ResourceHints hints) { try { urlEncodedHints.put( entry.getKey(), - new String(percentCodec.encode(entry.getValue().toBytes()), Charsets.US_ASCII)); + new String( + percentCodec.encode(entry.getValue().toBytes()), StandardCharsets.US_ASCII)); } catch (EncoderException e) { // Should never happen. throw new RuntimeException("Invalid value for resource hint: " + entry.getKey(), e); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContext.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContext.java index 8c038189ae62..aeef7784c2c3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContext.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContext.java @@ -19,7 +19,6 @@ import com.google.api.services.dataflow.model.CounterUpdate; import com.google.api.services.dataflow.model.SideInputInfo; -import java.util.Collections; import java.util.Objects; import java.util.concurrent.TimeUnit; import org.apache.beam.runners.core.InMemoryStateInternals; @@ -40,6 +39,7 @@ import org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.ProfileScope; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.state.TimeDomain; @@ -68,9 +68,6 @@ public class BatchModeExecutionContext private Object key; private final MetricsContainerRegistry containerRegistry; - - // TODO(https://github.com/apache/beam/issues/19632): Move throttle time Metric to a dedicated - // namespace. protected static final String DATASTORE_THROTTLE_TIME_NAMESPACE = "org.apache.beam.sdk.io.gcp.datastore.DatastoreV1$DatastoreWriterFn"; protected static final String HTTP_CLIENT_API_THROTTLE_TIME_NAMESPACE = @@ -79,10 +76,6 @@ public class BatchModeExecutionContext "org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl$DatasetServiceImpl"; protected static final String BIGQUERY_READ_THROTTLE_TIME_NAMESPACE = "org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl$StorageClientImpl"; - protected static final String THROTTLE_TIME_COUNTER_NAME = "throttling-msecs"; - - // TODO(BEAM-31814): Remove once Dataflow legacy runner supports this. 
- private final boolean populateStringSetMetrics; private BatchModeExecutionContext( CounterFactory counterFactory, @@ -91,8 +84,7 @@ private BatchModeExecutionContext( ReaderFactory readerFactory, PipelineOptions options, DataflowExecutionStateTracker executionStateTracker, - DataflowExecutionStateRegistry executionStateRegistry, - boolean populateStringSetMetrics) { + DataflowExecutionStateRegistry executionStateRegistry) { super( counterFactory, createMetricsContainerRegistry(), @@ -105,7 +97,6 @@ private BatchModeExecutionContext( this.dataCache = dataCache; this.containerRegistry = (MetricsContainerRegistry) getMetricsContainerRegistry(); - this.populateStringSetMetrics = populateStringSetMetrics; } private static MetricsContainerRegistry createMetricsContainerRegistry() { @@ -141,8 +132,7 @@ public static BatchModeExecutionContext forTesting( counterFactory, options, "test-work-item-id"), - stateRegistry, - true); + stateRegistry); } public static BatchModeExecutionContext forTesting(PipelineOptions options, String stageName) { @@ -255,8 +245,7 @@ public static BatchModeExecutionContext create( counterFactory, options, workItemId), - executionStateRegistry, - false); + executionStateRegistry); } /** Create a new {@link StepContext}. */ @@ -526,10 +515,7 @@ public Iterable extractMetricUpdates(boolean isFinalUpdate) { update -> MetricsToCounterUpdateConverter.fromDistribution( update.getKey(), true, update.getUpdate())), - FluentIterable.from( - populateStringSetMetrics - ? updates.stringSetUpdates() - : Collections.emptyList()) + FluentIterable.from(updates.stringSetUpdates()) .transform( update -> MetricsToCounterUpdateConverter.fromStringSet( @@ -550,11 +536,18 @@ public Iterable extractMsecCounters(boolean isFinalUpdate) { public Long extractThrottleTime() { long totalThrottleMsecs = 0L; for (MetricsContainerImpl container : containerRegistry.getContainers()) { - // TODO(https://github.com/apache/beam/issues/19632): Update throttling counters to use - // generic throttling-msecs metric. 
+ CounterCell userThrottlingTime = + container.tryGetCounter( + MetricName.named( + Metrics.THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME)); + if (userThrottlingTime != null) { + totalThrottleMsecs += userThrottlingTime.getCumulative(); + } + CounterCell dataStoreThrottlingTime = container.tryGetCounter( - MetricName.named(DATASTORE_THROTTLE_TIME_NAMESPACE, THROTTLE_TIME_COUNTER_NAME)); + MetricName.named( + DATASTORE_THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME)); if (dataStoreThrottlingTime != null) { totalThrottleMsecs += dataStoreThrottlingTime.getCumulative(); } @@ -562,7 +555,7 @@ public Long extractThrottleTime() { CounterCell httpClientApiThrottlingTime = container.tryGetCounter( MetricName.named( - HTTP_CLIENT_API_THROTTLE_TIME_NAMESPACE, THROTTLE_TIME_COUNTER_NAME)); + HTTP_CLIENT_API_THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME)); if (httpClientApiThrottlingTime != null) { totalThrottleMsecs += httpClientApiThrottlingTime.getCumulative(); } @@ -570,14 +563,16 @@ public Long extractThrottleTime() { CounterCell bigqueryStreamingInsertThrottleTime = container.tryGetCounter( MetricName.named( - BIGQUERY_STREAMING_INSERT_THROTTLE_TIME_NAMESPACE, THROTTLE_TIME_COUNTER_NAME)); + BIGQUERY_STREAMING_INSERT_THROTTLE_TIME_NAMESPACE, + Metrics.THROTTLE_TIME_COUNTER_NAME)); if (bigqueryStreamingInsertThrottleTime != null) { totalThrottleMsecs += bigqueryStreamingInsertThrottleTime.getCumulative(); } CounterCell bigqueryReadThrottleTime = container.tryGetCounter( - MetricName.named(BIGQUERY_READ_THROTTLE_TIME_NAMESPACE, THROTTLE_TIME_COUNTER_NAME)); + MetricName.named( + BIGQUERY_READ_THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME)); if (bigqueryReadThrottleTime != null) { totalThrottleMsecs += bigqueryReadThrottleTime.getCumulative(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java index 640febc616ba..c5a24df192eb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java @@ -20,15 +20,14 @@ import org.apache.beam.runners.dataflow.worker.counters.CounterName; import org.apache.beam.runners.dataflow.worker.counters.NameContext; import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; /** This holds system metrics related constants used in Batch and Streaming. */ public class DataflowSystemMetrics { public static final MetricName THROTTLING_MSECS_METRIC_NAME = - MetricName.named("dataflow-throttling-metrics", "throttling-msecs"); - - // TODO: Provide an utility in SDK 'ThrottlingReporter' to update throttling time. + MetricName.named("dataflow-throttling-metrics", Metrics.THROTTLE_TIME_COUNTER_NAME); /** System counters populated by streaming dataflow workers. 
*/ public enum StreamingSystemCounterNames { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java deleted file mode 100644 index d808d4f4ab58..000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker; - -import com.google.auto.value.AutoBuilder; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.atomic.AtomicInteger; -import javax.annotation.concurrent.GuardedBy; -import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; -import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.Duration; - -/** - * Wrapper around a {@link WindmillServerStub} that tracks metrics for the number of in-flight - * requests and throttles requests when memory pressure is high. - * - *

External API: individual worker threads request state for their computation via {@link - * #getStateData}. However, requests are either issued using a pool of streaming rpcs or possibly - * batched requests. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public class MetricTrackingWindmillServerStub { - - private static final int MAX_READS_PER_BATCH = 60; - private static final int MAX_ACTIVE_READS = 10; - private static final Duration STREAM_TIMEOUT = Duration.standardSeconds(30); - private final AtomicInteger activeSideInputs = new AtomicInteger(); - private final AtomicInteger activeStateReads = new AtomicInteger(); - private final AtomicInteger activeHeartbeats = new AtomicInteger(); - private final WindmillServerStub server; - private final MemoryMonitor gcThrashingMonitor; - private final boolean useStreamingRequests; - - private final WindmillStreamPool getDataStreamPool; - - // This may be the same instance as getDataStreamPool based upon options. - private final WindmillStreamPool heartbeatStreamPool; - - @GuardedBy("this") - private final List pendingReadBatches; - - @GuardedBy("this") - private int activeReadThreads = 0; - - @Internal - @AutoBuilder(ofClass = MetricTrackingWindmillServerStub.class) - public abstract static class Builder { - - abstract Builder setServer(WindmillServerStub server); - - abstract Builder setGcThrashingMonitor(MemoryMonitor gcThrashingMonitor); - - abstract Builder setUseStreamingRequests(boolean useStreamingRequests); - - abstract Builder setUseSeparateHeartbeatStreams(boolean useSeparateHeartbeatStreams); - - abstract Builder setNumGetDataStreams(int numGetDataStreams); - - abstract MetricTrackingWindmillServerStub build(); - } - - public static Builder builder(WindmillServerStub server, MemoryMonitor gcThrashingMonitor) { - return new AutoBuilder_MetricTrackingWindmillServerStub_Builder() - .setServer(server) - .setGcThrashingMonitor(gcThrashingMonitor) - .setUseStreamingRequests(false) - .setUseSeparateHeartbeatStreams(false) - .setNumGetDataStreams(1); - } - - MetricTrackingWindmillServerStub( - WindmillServerStub server, - MemoryMonitor gcThrashingMonitor, - boolean useStreamingRequests, - boolean useSeparateHeartbeatStreams, - int numGetDataStreams) { - this.server = server; - this.gcThrashingMonitor = gcThrashingMonitor; - this.useStreamingRequests = useStreamingRequests; - if (useStreamingRequests) { - getDataStreamPool = - WindmillStreamPool.create( - Math.max(1, numGetDataStreams), STREAM_TIMEOUT, this.server::getDataStream); - if (useSeparateHeartbeatStreams) { - heartbeatStreamPool = - WindmillStreamPool.create(1, STREAM_TIMEOUT, this.server::getDataStream); - } else { - heartbeatStreamPool = getDataStreamPool; - } - } else { - getDataStreamPool = heartbeatStreamPool = null; - } - // This is used as a queue but is expected to be less than 10 batches. - this.pendingReadBatches = new ArrayList<>(); - } - - // Adds the entry to a read batch for sending to the windmill server. If a non-null batch is - // returned, this thread will be responsible for sending the batch and should wait for the batch - // startRead to be notified. - // If null is returned, the entry was added to a read batch that will be issued by another thread. 
- private @Nullable ReadBatch addToReadBatch(QueueEntry entry) { - synchronized (this) { - ReadBatch batch; - if (activeReadThreads < MAX_ACTIVE_READS) { - assert (pendingReadBatches.isEmpty()); - activeReadThreads += 1; - // fall through to below synchronized block - } else if (pendingReadBatches.isEmpty() - || pendingReadBatches.get(pendingReadBatches.size() - 1).reads.size() - >= MAX_READS_PER_BATCH) { - // This is the first read of a batch, it will be responsible for sending the batch. - batch = new ReadBatch(); - pendingReadBatches.add(batch); - batch.reads.add(entry); - return batch; - } else { - // This fits within an existing batch, it will be sent by the first blocking thread in the - // batch. - pendingReadBatches.get(pendingReadBatches.size() - 1).reads.add(entry); - return null; - } - } - ReadBatch batch = new ReadBatch(); - batch.reads.add(entry); - batch.startRead.set(true); - return batch; - } - - private void issueReadBatch(ReadBatch batch) { - try { - boolean read = batch.startRead.get(); - assert (read); - } catch (InterruptedException e) { - // We don't expect this thread to be interrupted. To simplify handling, we just fall through - // to issuing - // the call. - assert (false); - Thread.currentThread().interrupt(); - } catch (ExecutionException e) { - // startRead is a SettableFuture so this should never occur. - throw new AssertionError("Should not have exception on startRead", e); - } - Map> pendingResponses = - new HashMap<>(batch.reads.size()); - Map computationBuilders = new HashMap<>(); - for (QueueEntry entry : batch.reads) { - Windmill.ComputationGetDataRequest.Builder computationBuilder = - computationBuilders.computeIfAbsent( - entry.computation, - k -> Windmill.ComputationGetDataRequest.newBuilder().setComputationId(k)); - - computationBuilder.addRequests(entry.request); - pendingResponses.put( - WindmillComputationKey.create( - entry.computation, entry.request.getKey(), entry.request.getShardingKey()), - entry.response); - } - - // Build the full GetDataRequest from the KeyedGetDataRequests pulled from the queue. - Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Windmill.ComputationGetDataRequest.Builder computationBuilder : - computationBuilders.values()) { - builder.addRequests(computationBuilder); - } - - try { - Windmill.GetDataResponse response = server.getData(builder.build()); - - // Dispatch the per-key responses back to the waiting threads. - for (Windmill.ComputationGetDataResponse computationResponse : response.getDataList()) { - for (Windmill.KeyedGetDataResponse keyResponse : computationResponse.getDataList()) { - pendingResponses - .get( - WindmillComputationKey.create( - computationResponse.getComputationId(), - keyResponse.getKey(), - keyResponse.getShardingKey())) - .set(keyResponse); - } - } - } catch (RuntimeException e) { - // Fan the exception out to the reads. - for (QueueEntry entry : batch.reads) { - entry.response.setException(e); - } - } finally { - synchronized (this) { - assert (activeReadThreads >= 1); - if (pendingReadBatches.isEmpty()) { - activeReadThreads--; - } else { - // Notify the thread responsible for issuing the next batch read. 
- ReadBatch startBatch = pendingReadBatches.remove(0); - startBatch.startRead.set(true); - } - } - } - } - - public Windmill.KeyedGetDataResponse getStateData( - String computation, Windmill.KeyedGetDataRequest request) { - gcThrashingMonitor.waitForResources("GetStateData"); - activeStateReads.getAndIncrement(); - - try { - if (useStreamingRequests) { - GetDataStream stream = getDataStreamPool.getStream(); - try { - return stream.requestKeyedData(computation, request); - } finally { - getDataStreamPool.releaseStream(stream); - } - } else { - SettableFuture response = SettableFuture.create(); - ReadBatch batch = addToReadBatch(new QueueEntry(computation, request, response)); - if (batch != null) { - issueReadBatch(batch); - } - return response.get(); - } - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - activeStateReads.getAndDecrement(); - } - } - - public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { - gcThrashingMonitor.waitForResources("GetSideInputData"); - activeSideInputs.getAndIncrement(); - try { - if (useStreamingRequests) { - GetDataStream stream = getDataStreamPool.getStream(); - try { - return stream.requestGlobalData(request); - } finally { - getDataStreamPool.releaseStream(stream); - } - } else { - return server - .getData( - Windmill.GetDataRequest.newBuilder().addGlobalDataFetchRequests(request).build()) - .getGlobalData(0); - } - } catch (Exception e) { - throw new RuntimeException("Failed to get side input: ", e); - } finally { - activeSideInputs.getAndDecrement(); - } - } - - /** Tells windmill processing is ongoing for the given keys. */ - public void refreshActiveWork(Map> heartbeats) { - if (heartbeats.isEmpty()) { - return; - } - activeHeartbeats.set(heartbeats.size()); - try { - if (useStreamingRequests) { - GetDataStream stream = heartbeatStreamPool.getStream(); - try { - stream.refreshActiveWork(heartbeats); - } finally { - heartbeatStreamPool.releaseStream(stream); - } - } else { - // This code path is only used by appliance which sends heartbeats (used to refresh active - // work) as KeyedGetDataRequests. So we must translate the HeartbeatRequest to a - // KeyedGetDataRequest here regardless of the value of sendKeyedGetDataRequests. 
- Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Map.Entry> entry : heartbeats.entrySet()) { - Windmill.ComputationGetDataRequest.Builder perComputationBuilder = - Windmill.ComputationGetDataRequest.newBuilder(); - perComputationBuilder.setComputationId(entry.getKey()); - for (HeartbeatRequest request : entry.getValue()) { - perComputationBuilder.addRequests( - Windmill.KeyedGetDataRequest.newBuilder() - .setShardingKey(request.getShardingKey()) - .setWorkToken(request.getWorkToken()) - .setCacheToken(request.getCacheToken()) - .addAllLatencyAttribution(request.getLatencyAttributionList()) - .build()); - } - builder.addRequests(perComputationBuilder.build()); - } - server.getData(builder.build()); - } - } finally { - activeHeartbeats.set(0); - } - } - - public void printHtml(PrintWriter writer) { - writer.println("Active Fetches:"); - writer.println(" Side Inputs: " + activeSideInputs.get()); - writer.println(" State Reads: " + activeStateReads.get()); - if (!useStreamingRequests) { - synchronized (this) { - writer.println(" Read threads: " + activeReadThreads); - writer.println(" Pending read batches: " + pendingReadBatches.size()); - } - } - writer.println("Heartbeat Keys Active: " + activeHeartbeats.get()); - } - - private static final class ReadBatch { - ArrayList reads = new ArrayList<>(); - SettableFuture startRead = SettableFuture.create(); - } - - private static final class QueueEntry { - - final String computation; - final Windmill.KeyedGetDataRequest request; - final SettableFuture response; - - QueueEntry( - String computation, - Windmill.KeyedGetDataRequest request, - SettableFuture response) { - this.computation = computation; - this.request = request; - this.response = response; - } - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java new file mode 100644 index 000000000000..e9ee8f39cba4 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OperationalLimits.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker; + +import com.google.auto.value.AutoBuilder; + +/** Keep track of any operational limits required by the backend. */ +public class OperationalLimits { + // Maximum size of a commit from a single work item. + public final long maxWorkItemCommitBytes; + // Maximum size of a single output element's serialized key. + public final long maxOutputKeyBytes; + // Maximum size of a single output element's serialized value. 
+ public final long maxOutputValueBytes; + // Whether to throw an exception when processing output that violates any of the given limits. + public final boolean throwExceptionOnLargeOutput; + + OperationalLimits( + long maxWorkItemCommitBytes, + long maxOutputKeyBytes, + long maxOutputValueBytes, + boolean throwExceptionOnLargeOutput) { + this.maxWorkItemCommitBytes = maxWorkItemCommitBytes; + this.maxOutputKeyBytes = maxOutputKeyBytes; + this.maxOutputValueBytes = maxOutputValueBytes; + this.throwExceptionOnLargeOutput = throwExceptionOnLargeOutput; + } + + @AutoBuilder(ofClass = OperationalLimits.class) + public interface Builder { + Builder setMaxWorkItemCommitBytes(long bytes); + + Builder setMaxOutputKeyBytes(long bytes); + + Builder setMaxOutputValueBytes(long bytes); + + Builder setThrowExceptionOnLargeOutput(boolean shouldThrow); + + OperationalLimits build(); + } + + public static Builder builder() { + return new AutoBuilder_OperationalLimits_Builder() + .setMaxWorkItemCommitBytes(Long.MAX_VALUE) + .setMaxOutputKeyBytes(Long.MAX_VALUE) + .setMaxOutputValueBytes(Long.MAX_VALUE) + .setThrowExceptionOnLargeOutput(false); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java new file mode 100644 index 000000000000..9f4b413841c5 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/OutputTooLargeException.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker; + +import org.checkerframework.checker.nullness.qual.Nullable; + +/** Indicates that an output element was too large. */ +public class OutputTooLargeException extends RuntimeException { + public OutputTooLargeException(String reason) { + super(reason); + } + + /** Returns whether an exception was caused by a {@link OutputTooLargeException}. 
*/ + public static boolean isCausedByOutputTooLargeException(@Nullable Throwable t) { + while (t != null) { + if (t instanceof OutputTooLargeException) { + return true; + } + t = t.getCause(); + } + return false; + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 0e46e7e4687e..90f072be997e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -23,19 +23,18 @@ import com.google.api.services.dataflow.model.CounterUpdate; import com.google.api.services.dataflow.model.MapTask; import com.google.auto.value.AutoValue; -import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; @@ -48,30 +47,34 @@ import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.ComputationStateCache; import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; -import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; -import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.WorkHeartbeatResponseProcessor; import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingApplianceComputationConfigFetcher; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingEngineComputationConfigFetcher; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingEnginePipelineConfig; +import org.apache.beam.runners.dataflow.worker.streaming.harness.SingleSourceWorkerHarness; +import org.apache.beam.runners.dataflow.worker.streaming.harness.SingleSourceWorkerHarness.GetWorkSender; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; +import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerHarness; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerStatusPages; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerStatusReporter; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; import 
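
Editor's note: isCausedByOutputTooLargeException above follows the same cause-chain walk as the existing WorkItemCancelledException helper, so wrapped failures can still be classified. A hedged sketch of the call-site shape; executeWorkItem and LOG are assumed names, not code from this patch:

    try {
      executeWorkItem();  // hypothetical processing call that may reject oversized output
    } catch (RuntimeException e) {
      if (OutputTooLargeException.isCausedByOutputTooLargeException(e)) {
        LOG.error("Work item produced an output larger than the configured limit", e);
      }
      throw e;
    }
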
org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.appliance.JniWindmillApplianceServer; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.CompleteCommit; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.StreamingApplianceWorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.StreamingEngineWorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ApplianceGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamPoolGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.ChannelzServlet; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillServer; @@ -87,22 +90,23 @@ import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.StreamingEngineFailureTracker; import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.WorkFailureProcessor; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ActiveWorkRefresher; -import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ActiveWorkRefreshers; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.ApplianceHeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.StreamPoolHeartbeatSender; import org.apache.beam.sdk.fn.IdGenerator; import org.apache.beam.sdk.fn.IdGenerators; import org.apache.beam.sdk.fn.JvmInitializers; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.gcp.bigquery.BigQuerySinkMetrics; -import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.metrics.MetricsEnvironment; import org.apache.beam.sdk.util.construction.CoderTranslation; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.*; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheStats; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Uninterruptibles; import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; @@ -112,14 +116,7 @@ 
@SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) -public class StreamingDataflowWorker { - - // TODO(https://github.com/apache/beam/issues/19632): Update throttling counters to use generic - // throttling-msecs metric. - public static final MetricName BIGQUERY_STREAMING_INSERT_THROTTLE_TIME = - MetricName.named( - "org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl$DatasetServiceImpl", - "throttling-msecs"); +public final class StreamingDataflowWorker { /** * Sinks are marked 'full' in {@link StreamingModeExecutionContext} once the amount of data sinked @@ -129,45 +126,44 @@ public class StreamingDataflowWorker { */ public static final int MAX_SINK_BYTES = 10_000_000; - // Maximum number of threads for processing. Currently, each thread processes one key at a time. - static final int MAX_PROCESSING_THREADS = 300; - static final long THREAD_EXPIRATION_TIME_SEC = 60; - static final int GET_WORK_STREAM_TIMEOUT_MINUTES = 3; - static final Duration COMMIT_STREAM_TIMEOUT = Duration.standardMinutes(1); private static final Logger LOG = LoggerFactory.getLogger(StreamingDataflowWorker.class); + /** + * Maximum number of threads for processing. Currently, each thread processes one key at a time. + */ + private static final int MAX_PROCESSING_THREADS = 300; + /** The idGenerator to generate unique id globally. */ private static final IdGenerator ID_GENERATOR = IdGenerators.decrementingLongs(); - private static final int DEFAULT_STATUS_PORT = 8081; - // Maximum size of the result of a GetWork request. + /** Maximum size of the result of a GetWork request. */ private static final long MAX_GET_WORK_FETCH_BYTES = 64L << 20; // 64m /** Maximum number of failure stacktraces to report in each update sent to backend. 
*/ private static final int MAX_FAILURES_TO_REPORT_IN_UPDATE = 1000; - private static final Random clientIdGenerator = new Random(); + private static final long THREAD_EXPIRATION_TIME_SEC = 60; + private static final Duration COMMIT_STREAM_TIMEOUT = Duration.standardMinutes(1); + private static final Duration GET_DATA_STREAM_TIMEOUT = Duration.standardSeconds(30); + private static final int DEFAULT_STATUS_PORT = 8081; + private static final Random CLIENT_ID_GENERATOR = new Random(); private static final String CHANNELZ_PATH = "/channelz"; - final WindmillStateCache stateCache; + + private final WindmillStateCache stateCache; private final StreamingWorkerStatusPages statusPages; private final ComputationConfig.Fetcher configFetcher; private final ComputationStateCache computationStateCache; private final BoundedQueueExecutor workUnitExecutor; - private final WindmillServerStub windmillServer; - private final Thread dispatchThread; + private final StreamingWorkerHarness streamingWorkerHarness; private final AtomicBoolean running = new AtomicBoolean(); private final DataflowWorkerHarnessOptions options; - private final long clientId; - private final MetricTrackingWindmillServerStub metricTrackingWindmillServer; - private final MemoryMonitor memoryMonitor; - private final Thread memoryMonitorThread; + private final BackgroundMemoryMonitor memoryMonitor; private final ReaderCache readerCache; private final DataflowExecutionStateSampler sampler = DataflowExecutionStateSampler.instance(); private final ActiveWorkRefresher activeWorkRefresher; private final WorkCommitter workCommitter; private final StreamingWorkerStatusReporter workerStatusReporter; private final StreamingCounters streamingCounters; - private final StreamingWorkScheduler streamingWorkScheduler; private StreamingDataflowWorker( WindmillServerStub windmillServer, @@ -185,10 +181,13 @@ private StreamingDataflowWorker( WorkFailureProcessor workFailureProcessor, StreamingCounters streamingCounters, MemoryMonitor memoryMonitor, - AtomicInteger maxWorkItemCommitBytes, + AtomicReference operationalLimits, GrpcWindmillStreamFactory windmillStreamFactory, Function executorSupplier, ConcurrentMap stageInfoMap) { + // Register standard file systems. + FileSystems.setDefaultPipelineOptions(options); + this.configFetcher = configFetcher; this.computationStateCache = computationStateCache; this.stateCache = windmillStateCache; @@ -207,122 +206,137 @@ private StreamingDataflowWorker( this.workCommitter = windmillServiceEnabled - ? StreamingEngineWorkCommitter.create( - WindmillStreamPool.create( - numCommitThreads, COMMIT_STREAM_TIMEOUT, windmillServer::commitWorkStream) - ::getCloseableStream, - numCommitThreads, - this::onCompleteCommit) + ? 
StreamingEngineWorkCommitter.builder() + .setCommitWorkStreamFactory( + WindmillStreamPool.create( + numCommitThreads, + COMMIT_STREAM_TIMEOUT, + windmillServer::commitWorkStream) + ::getCloseableStream) + .setNumCommitSenders(numCommitThreads) + .setOnCommitComplete(this::onCompleteCommit) + .build() : StreamingApplianceWorkCommitter.create( windmillServer::commitWork, this::onCompleteCommit); this.workUnitExecutor = workUnitExecutor; - memoryMonitorThread = new Thread(memoryMonitor); - memoryMonitorThread.setPriority(Thread.MIN_PRIORITY); - memoryMonitorThread.setName("MemoryMonitor"); - - dispatchThread = - new Thread( - () -> { - LOG.info("Dispatch starting"); - if (windmillServiceEnabled) { - streamingDispatchLoop(); - } else { - dispatchLoop(); - } - LOG.info("Dispatch done"); - }); - dispatchThread.setDaemon(true); - dispatchThread.setPriority(Thread.MIN_PRIORITY); - dispatchThread.setName("DispatchThread"); - this.clientId = clientId; - this.windmillServer = windmillServer; - this.metricTrackingWindmillServer = - MetricTrackingWindmillServerStub.builder(windmillServer, memoryMonitor) - .setUseStreamingRequests(windmillServiceEnabled) - .setUseSeparateHeartbeatStreams(options.getUseSeparateWindmillHeartbeatStreams()) - .setNumGetDataStreams(options.getWindmillGetDataStreamCount()) - .build(); + this.workerStatusReporter = workerStatusReporter; + this.streamingCounters = streamingCounters; + this.memoryMonitor = BackgroundMemoryMonitor.create(memoryMonitor); + StreamingWorkScheduler streamingWorkScheduler = + StreamingWorkScheduler.create( + options, + clock, + readerCache, + mapTaskExecutorFactory, + workUnitExecutor, + stateCache::forComputation, + failureTracker, + workFailureProcessor, + streamingCounters, + hotKeyLogger, + sampler, + operationalLimits, + ID_GENERATOR, + stageInfoMap); - // Register standard file systems. - FileSystems.setDefaultPipelineOptions(options); + ThrottlingGetDataMetricTracker getDataMetricTracker = + new ThrottlingGetDataMetricTracker(memoryMonitor); + WorkerStatusPages workerStatusPages = + WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor); + StreamingWorkerStatusPages.Builder statusPagesBuilder = StreamingWorkerStatusPages.builder(); + int stuckCommitDurationMillis; + GetDataClient getDataClient; + HeartbeatSender heartbeatSender; + if (windmillServiceEnabled) { + WindmillStreamPool getDataStreamPool = + WindmillStreamPool.create( + Math.max(1, options.getWindmillGetDataStreamCount()), + GET_DATA_STREAM_TIMEOUT, + windmillServer::getDataStream); + getDataClient = new StreamPoolGetDataClient(getDataMetricTracker, getDataStreamPool); + heartbeatSender = + new StreamPoolHeartbeatSender( + options.getUseSeparateWindmillHeartbeatStreams() + ? WindmillStreamPool.create( + 1, GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream) + : getDataStreamPool); + stuckCommitDurationMillis = + options.getStuckCommitDurationMillis() > 0 ? 
options.getStuckCommitDurationMillis() : 0; + statusPagesBuilder + .setDebugCapture( + new DebugCapture.Manager(options, workerStatusPages.getDebugCapturePages())) + .setChannelzServlet( + new ChannelzServlet( + CHANNELZ_PATH, options, windmillServer::getWindmillServiceEndpoints)) + .setWindmillStreamFactory(windmillStreamFactory); + } else { + getDataClient = new ApplianceGetDataClient(windmillServer, getDataMetricTracker); + heartbeatSender = new ApplianceHeartbeatSender(windmillServer::getData); + stuckCommitDurationMillis = 0; + } - int stuckCommitDurationMillis = - windmillServiceEnabled && options.getStuckCommitDurationMillis() > 0 - ? options.getStuckCommitDurationMillis() - : 0; this.activeWorkRefresher = - ActiveWorkRefreshers.createDispatchedActiveWorkRefresher( + new ActiveWorkRefresher( clock, options.getActiveWorkRefreshPeriodMillis(), stuckCommitDurationMillis, computationStateCache::getAllPresentComputations, sampler, - metricTrackingWindmillServer::refreshActiveWork, - executorSupplier.apply("RefreshWork")); + executorSupplier.apply("RefreshWork"), + getDataMetricTracker::trackHeartbeats); - WorkerStatusPages workerStatusPages = - WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor); - StreamingWorkerStatusPages.Builder statusPagesBuilder = - StreamingWorkerStatusPages.builder() + this.statusPages = + statusPagesBuilder .setClock(clock) .setClientId(clientId) .setIsRunning(running) .setStatusPages(workerStatusPages) .setStateCache(stateCache) - .setComputationStateCache(computationStateCache) + .setComputationStateCache(this.computationStateCache) .setCurrentActiveCommitBytes(workCommitter::currentActiveCommitBytes) - .setGetDataStatusProvider(metricTrackingWindmillServer::printHtml) - .setWorkUnitExecutor(workUnitExecutor); - - this.statusPages = - windmillServiceEnabled - ? statusPagesBuilder - .setDebugCapture( - new DebugCapture.Manager(options, workerStatusPages.getDebugCapturePages())) - .setChannelzServlet(new ChannelzServlet(CHANNELZ_PATH, options, windmillServer)) - .setWindmillStreamFactory(windmillStreamFactory) - .build() - : statusPagesBuilder.build(); + .setGetDataStatusProvider(getDataClient::printHtml) + .setWorkUnitExecutor(workUnitExecutor) + .build(); - this.workerStatusReporter = workerStatusReporter; - this.streamingCounters = streamingCounters; - this.memoryMonitor = memoryMonitor; + Windmill.GetWorkRequest request = + Windmill.GetWorkRequest.newBuilder() + .setClientId(clientId) + .setMaxItems(chooseMaximumBundlesOutstanding()) + .setMaxBytes(MAX_GET_WORK_FETCH_BYTES) + .build(); - this.streamingWorkScheduler = - StreamingWorkScheduler.create( - options, - clock, - readerCache, - mapTaskExecutorFactory, - workUnitExecutor, - stateCache::forComputation, - metricTrackingWindmillServer::getSideInputData, - failureTracker, - workFailureProcessor, - streamingCounters, - hotKeyLogger, - sampler, - maxWorkItemCommitBytes, - ID_GENERATOR, - stageInfoMap); + this.streamingWorkerHarness = + SingleSourceWorkerHarness.builder() + .setStreamingWorkScheduler(streamingWorkScheduler) + .setWorkCommitter(workCommitter) + .setGetDataClient(getDataClient) + .setComputationStateFetcher(this.computationStateCache::get) + .setWaitForResources(() -> memoryMonitor.waitForResources("GetWork")) + .setHeartbeatSender(heartbeatSender) + .setGetWorkSender( + windmillServiceEnabled + ? 
GetWorkSender.forStreamingEngine( + receiver -> windmillServer.getWorkStream(request, receiver)) + : GetWorkSender.forAppliance(() -> windmillServer.getWork(request))) + .build(); LOG.debug("windmillServiceEnabled: {}", windmillServiceEnabled); LOG.debug("WindmillServiceEndpoint: {}", options.getWindmillServiceEndpoint()); LOG.debug("WindmillServicePort: {}", options.getWindmillServicePort()); LOG.debug("LocalWindmillHostport: {}", options.getLocalWindmillHostport()); - LOG.debug("maxWorkItemCommitBytes: {}", maxWorkItemCommitBytes.get()); } public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions options) { - long clientId = clientIdGenerator.nextLong(); + long clientId = CLIENT_ID_GENERATOR.nextLong(); MemoryMonitor memoryMonitor = MemoryMonitor.fromOptions(options); ConcurrentMap stageInfo = new ConcurrentHashMap<>(); StreamingCounters streamingCounters = StreamingCounters.create(); WorkUnitClient dataflowServiceClient = new DataflowWorkUnitClient(options, LOG); BoundedQueueExecutor workExecutor = createWorkUnitExecutor(options); - AtomicInteger maxWorkItemCommitBytes = new AtomicInteger(Integer.MAX_VALUE); + AtomicReference operationalLimits = + new AtomicReference<>(OperationalLimits.builder().build()); WindmillStateCache windmillStateCache = WindmillStateCache.builder() .setSizeMb(options.getWorkerCacheMb()) @@ -340,7 +354,7 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o createConfigFetcherComputationStateCacheAndWindmillClient( options, dataflowServiceClient, - maxWorkItemCommitBytes, + operationalLimits, windmillStreamFactoryBuilder, configFetcher -> ComputationStateCache.create( @@ -398,7 +412,7 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o workFailureProcessor, streamingCounters, memoryMonitor, - maxWorkItemCommitBytes, + operationalLimits, configFetcherComputationStateCacheAndWindmillClient.windmillStreamFactory(), executorSupplier, stageInfo); @@ -414,15 +428,16 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o createConfigFetcherComputationStateCacheAndWindmillClient( DataflowWorkerHarnessOptions options, WorkUnitClient dataflowServiceClient, - AtomicInteger maxWorkItemCommitBytes, + AtomicReference operationalLimits, GrpcWindmillStreamFactory.Builder windmillStreamFactoryBuilder, Function computationStateCacheFactory) { ComputationConfig.Fetcher configFetcher; WindmillServerStub windmillServer; ComputationStateCache computationStateCache; - GrpcDispatcherClient dispatcherClient = GrpcDispatcherClient.create(createStubFactory(options)); GrpcWindmillStreamFactory windmillStreamFactory; if (options.isEnableStreamingEngine()) { + GrpcDispatcherClient dispatcherClient = + GrpcDispatcherClient.create(createStubFactory(options)); configFetcher = StreamingEngineComputationConfigFetcher.create( options.getGlobalConfigRefreshPeriod().getMillis(), @@ -430,8 +445,9 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o config -> onPipelineConfig( config, + options, dispatcherClient::consumeWindmillDispatcherEndpoints, - maxWorkItemCommitBytes)); + operationalLimits::set)); computationStateCache = computationStateCacheFactory.apply(configFetcher); windmillStreamFactory = windmillStreamFactoryBuilder @@ -450,7 +466,10 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o options.getWindmillServiceStreamingRpcHealthCheckPeriodMs()) .build(); windmillServer = - GrpcWindmillServer.create(options, 
windmillStreamFactory, dispatcherClient); + GrpcWindmillServer.create( + options, + windmillStreamFactory, + GrpcDispatcherClient.create(createStubFactory(options))); } else { windmillStreamFactory = windmillStreamFactoryBuilder.build(); windmillServer = new JniWindmillApplianceServer(options.getLocalWindmillHostport()); @@ -477,9 +496,9 @@ static StreamingDataflowWorker forTesting( Supplier clock, Function executorSupplier, int localRetryTimeoutMs, - int maxWorkItemCommitBytesOverrides) { + OperationalLimits limits) { ConcurrentMap stageInfo = new ConcurrentHashMap<>(); - AtomicInteger maxWorkItemCommitBytes = new AtomicInteger(maxWorkItemCommitBytesOverrides); + AtomicReference operationalLimits = new AtomicReference<>(limits); BoundedQueueExecutor workExecutor = createWorkUnitExecutor(options); WindmillStateCache stateCache = WindmillStateCache.builder() @@ -496,8 +515,9 @@ static StreamingDataflowWorker forTesting( config -> onPipelineConfig( config, + options, windmillServer::setWindmillServiceEndpoints, - maxWorkItemCommitBytes)) + operationalLimits::set)) : new StreamingApplianceComputationConfigFetcher(windmillServer::getConfig); ConcurrentMap stateNameMap = new ConcurrentHashMap<>(prePopulatedStateNameMappings); @@ -565,7 +585,7 @@ static StreamingDataflowWorker forTesting( workFailureProcessor, streamingCounters, memoryMonitor, - maxWorkItemCommitBytes, + operationalLimits, options.isEnableStreamingEngine() ? windmillStreamFactory .setHealthCheckIntervalMillis( @@ -578,12 +598,18 @@ static StreamingDataflowWorker forTesting( private static void onPipelineConfig( StreamingEnginePipelineConfig config, + DataflowWorkerHarnessOptions options, Consumer> consumeWindmillServiceEndpoints, - AtomicInteger maxWorkItemCommitBytes) { - if (config.maxWorkItemCommitBytes() != maxWorkItemCommitBytes.get()) { - LOG.info("Setting maxWorkItemCommitBytes to {}", maxWorkItemCommitBytes); - maxWorkItemCommitBytes.set((int) config.maxWorkItemCommitBytes()); - } + Consumer operationalLimits) { + + operationalLimits.accept( + OperationalLimits.builder() + .setMaxWorkItemCommitBytes(config.maxWorkItemCommitBytes()) + .setMaxOutputKeyBytes(config.maxOutputKeyBytes()) + .setMaxOutputValueBytes(config.maxOutputValueBytes()) + .setThrowExceptionOnLargeOutput( + DataflowRunner.hasExperiment(options, "throw_exceptions_on_large_output")) + .build()); if (!config.windmillServiceEndpoints().isEmpty()) { consumeWindmillServiceEndpoints.accept(config.windmillServiceEndpoints()); @@ -678,10 +704,6 @@ private static ChannelCachingStubFactory createStubFactory( return ChannelCachingRemoteStubFactory.create(workerOptions.getGcpCredential(), channelCache); } - private static void sleep(int millis) { - Uninterruptibles.sleepUninterruptibly(millis, TimeUnit.MILLISECONDS); - } - private static int chooseMaxThreads(DataflowWorkerHarnessOptions options) { if (options.getNumberOfWorkerHarnessThreads() != 0) { return options.getNumberOfWorkerHarnessThreads(); @@ -710,7 +732,7 @@ private static void enableBigQueryMetrics() { } @VisibleForTesting - final void reportPeriodicWorkerUpdatesForTest() { + void reportPeriodicWorkerUpdatesForTest() { workerStatusReporter.reportPeriodicWorkerUpdates(); } @@ -739,6 +761,11 @@ int numCommitThreads() { return workCommitter.parallelism(); } + @VisibleForTesting + CacheStats getStateCacheStats() { + return stateCache.getCacheStats(); + } + @VisibleForTesting ComputationStateCache getComputationStateCache() { return computationStateCache; @@ -747,14 +774,10 @@ ComputationStateCache 
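
Editor's note: onPipelineConfig now publishes a whole OperationalLimits snapshot through an AtomicReference instead of a single AtomicInteger for the commit limit. A condensed sketch of the handshake between the config-refresh thread and a processing thread; variable names follow the patch, and the surrounding methods are elided:

    AtomicReference<OperationalLimits> operationalLimits =
        new AtomicReference<>(OperationalLimits.builder().build());

    // Config-refresh thread: atomically replace the snapshot when the backend sends new limits
    // (config here is the StreamingEnginePipelineConfig passed to the callback).
    operationalLimits.set(
        OperationalLimits.builder().setMaxOutputKeyBytes(config.maxOutputKeyBytes()).build());

    // Processing thread: read one consistent snapshot per work item.
    long maxKeyBytes = operationalLimits.get().maxOutputKeyBytes;
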
getComputationStateCache() { @SuppressWarnings("FutureReturnValueIgnored") public void start() { running.set(true); - configFetcher.start(); - - memoryMonitorThread.start(); - dispatchThread.start(); + memoryMonitor.start(); + streamingWorkerHarness.start(); sampler.start(); - - workCommitter.start(); workerStatusReporter.start(); activeWorkRefresher.start(); } @@ -768,120 +791,19 @@ private void startStatusPages() { void stop() { try { configFetcher.stop(); - activeWorkRefresher.stop(); statusPages.stop(); running.set(false); - dispatchThread.interrupt(); - dispatchThread.join(); - - workCommitter.stop(); - memoryMonitor.stop(); - memoryMonitorThread.join(); + streamingWorkerHarness.shutdown(); + memoryMonitor.shutdown(); workUnitExecutor.shutdown(); - computationStateCache.closeAndInvalidateAll(); - workerStatusReporter.stop(); } catch (Exception e) { LOG.warn("Exception while shutting down: ", e); } } - private void dispatchLoop() { - while (running.get()) { - memoryMonitor.waitForResources("GetWork"); - - int backoff = 1; - Windmill.GetWorkResponse workResponse = null; - do { - try { - workResponse = getWork(); - if (workResponse.getWorkCount() > 0) { - break; - } - } catch (WindmillServerStub.RpcException e) { - LOG.warn("GetWork failed, retrying:", e); - } - sleep(backoff); - backoff = Math.min(1000, backoff * 2); - } while (running.get()); - for (final Windmill.ComputationWorkItems computationWork : workResponse.getWorkList()) { - final String computationId = computationWork.getComputationId(); - Optional maybeComputationState = computationStateCache.get(computationId); - if (!maybeComputationState.isPresent()) { - continue; - } - - final ComputationState computationState = maybeComputationState.get(); - final Instant inputDataWatermark = - WindmillTimeUtils.windmillToHarnessWatermark(computationWork.getInputDataWatermark()); - Watermarks.Builder watermarks = - Watermarks.builder() - .setInputDataWatermark(Preconditions.checkNotNull(inputDataWatermark)) - .setSynchronizedProcessingTime( - WindmillTimeUtils.windmillToHarnessWatermark( - computationWork.getDependentRealtimeInputWatermark())); - - for (final Windmill.WorkItem workItem : computationWork.getWorkList()) { - streamingWorkScheduler.scheduleWork( - computationState, - workItem, - watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), - Work.createProcessingContext( - computationId, metricTrackingWindmillServer::getStateData, workCommitter::commit), - /* getWorkStreamLatencies= */ Collections.emptyList()); - } - } - } - } - - void streamingDispatchLoop() { - while (running.get()) { - GetWorkStream stream = - windmillServer.getWorkStream( - Windmill.GetWorkRequest.newBuilder() - .setClientId(clientId) - .setMaxItems(chooseMaximumBundlesOutstanding()) - .setMaxBytes(MAX_GET_WORK_FETCH_BYTES) - .build(), - (String computation, - Instant inputDataWatermark, - Instant synchronizedProcessingTime, - Windmill.WorkItem workItem, - Collection getWorkStreamLatencies) -> - computationStateCache - .get(computation) - .ifPresent( - computationState -> { - memoryMonitor.waitForResources("GetWork"); - streamingWorkScheduler.scheduleWork( - computationState, - workItem, - Watermarks.builder() - .setInputDataWatermark(inputDataWatermark) - .setSynchronizedProcessingTime(synchronizedProcessingTime) - .setOutputDataWatermark(workItem.getOutputDataWatermark()) - .build(), - Work.createProcessingContext( - computationState.getComputationId(), - metricTrackingWindmillServer::getStateData, - workCommitter::commit), - 
getWorkStreamLatencies); - })); - try { - // Reconnect every now and again to enable better load balancing. - // If at any point the server closes the stream, we will reconnect immediately; otherwise - // we half-close the stream after some time and create a new one. - if (!stream.awaitTermination(GET_WORK_STREAM_TIMEOUT_MINUTES, TimeUnit.MINUTES)) { - stream.close(); - } - } catch (InterruptedException e) { - // Continue processing until !running.get() - } - } - } - private void onCompleteCommit(CompleteCommit completeCommit) { if (completeCommit.status() != Windmill.CommitStatus.OK) { readerCache.invalidateReader( @@ -900,15 +822,6 @@ private void onCompleteCommit(CompleteCommit completeCommit) { completeCommit.shardedKey(), completeCommit.workId())); } - private Windmill.GetWorkResponse getWork() { - return windmillServer.getWork( - Windmill.GetWorkRequest.newBuilder() - .setClientId(clientId) - .setMaxItems(chooseMaximumBundlesOutstanding()) - .setMaxBytes(MAX_GET_WORK_FETCH_BYTES) - .build()); - } - @VisibleForTesting public Iterable buildCounters() { return Iterables.concat( @@ -940,4 +853,34 @@ private static ConfigFetcherComputationStateCacheAndWindmillClient create( abstract GrpcWindmillStreamFactory windmillStreamFactory(); } + + /** + * Monitors memory pressure on a background executor. May be used to throttle calls, blocking if + * there is memory pressure. + */ + @AutoValue + abstract static class BackgroundMemoryMonitor { + private static BackgroundMemoryMonitor create(MemoryMonitor memoryMonitor) { + return new AutoValue_StreamingDataflowWorker_BackgroundMemoryMonitor( + memoryMonitor, + Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder() + .setNameFormat("MemoryMonitor") + .setPriority(Thread.MIN_PRIORITY) + .build())); + } + + abstract MemoryMonitor memoryMonitor(); + + abstract ExecutorService executor(); + + private void start() { + executor().execute(memoryMonitor()); + } + + private void shutdown() { + memoryMonitor().stop(); + executor().shutdown(); + } + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java index dd6353060abc..a594dbb1e0f7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java @@ -129,6 +129,10 @@ public class StreamingModeExecutionContext extends DataflowExecutionContext(); @@ -191,8 +187,7 @@ public Histogram getPerWorkerHistogram( public Iterable extractUpdates() { return counterUpdates() .append(distributionUpdates()) - .append(gaugeUpdates()) - .append(populateStringSetUpdates ? 
stringSetUpdates() : Collections.emptyList()); + .append(gaugeUpdates().append(stringSetUpdates())); } private FluentIterable counterUpdates() { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java index 1f26572941a0..78d0c6b4550a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillSink.java @@ -44,6 +44,8 @@ import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.checkerframework.checker.nullness.qual.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @SuppressWarnings({ "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) @@ -54,6 +56,7 @@ class WindmillSink extends Sink> { private final Coder valueCoder; private final Coder> windowsCoder; private StreamingModeExecutionContext context; + private static final Logger LOG = LoggerFactory.getLogger(WindmillSink.class); WindmillSink( String destinationName, @@ -172,6 +175,28 @@ public long add(WindowedValue data) throws IOException { key = context.getSerializedKey(); value = encode(valueCoder, data.getValue()); } + if (key.size() > context.getMaxOutputKeyBytes()) { + if (context.throwExceptionsForLargeOutput()) { + throw new OutputTooLargeException("Key too large: " + key.size()); + } else { + LOG.error( + "Trying to output too large key with size " + + key.size() + + ". Limit is " + + context.getMaxOutputKeyBytes()); + } + } + if (value.size() > context.getMaxOutputValueBytes()) { + if (context.throwExceptionsForLargeOutput()) { + throw new OutputTooLargeException("Value too large: " + value.size()); + } else { + LOG.error( + "Trying to output too large value with size " + + value.size() + + ". Limit is " + + context.getMaxOutputValueBytes()); + } + } Windmill.KeyedMessageBundle.Builder keyedOutput = productionMap.get(key); if (keyedOutput == null) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java index 934977fe0985..ec5122a8732a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkItemCancelledException.java @@ -26,6 +26,10 @@ public WorkItemCancelledException(long sharding_key) { super("Work item cancelled for key " + sharding_key); } + public WorkItemCancelledException(Throwable e) { + super(e); + } + /** Returns whether an exception was caused by a {@link WorkItemCancelledException}. 
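
Editor's note: as wired in onPipelineConfig, the new size checks in WindmillSink only throw when the throw_exceptions_on_large_output experiment is set; otherwise they log the violation and let the element through. A hedged sketch of enabling the experiment from pipeline options at job submission, assuming it is propagated to the worker harness options as DataflowRunner.hasExperiment implies:

    // Assumed setup code, not part of this patch:
    DataflowPipelineOptions options =
        PipelineOptionsFactory.fromArgs("--experiments=throw_exceptions_on_large_output")
            .as(DataflowPipelineOptions.class);
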
*/ public static boolean isWorkItemCancelledException(Throwable t) { while (t != null) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java index 6092d0d64de5..d82432417000 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java @@ -29,11 +29,11 @@ import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; import org.apache.beam.runners.dataflow.worker.util.common.worker.Operation; import org.apache.beam.runners.dataflow.worker.util.common.worker.OutputReceiver; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.extensions.gcp.util.Transport; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; /** Container class for different types of network nodes. All nodes only have reference equality. */ @@ -59,7 +59,7 @@ private static String toStringWithTrimmedLiterals(GenericJson json) { ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); final JsonGenerator baseGenerator = MoreObjects.firstNonNull(json.getFactory(), Transport.getJsonFactory()) - .createJsonGenerator(byteStream, Charsets.UTF_8); + .createJsonGenerator(byteStream, StandardCharsets.UTF_8); JsonGenerator generator = new JsonGenerator() { @Override @@ -164,7 +164,7 @@ public void enablePrettyPrint() throws IOException { generator.enablePrettyPrint(); generator.serialize(json); generator.flush(); - return byteStream.toString(Charsets.UTF_8.name()); + return byteStream.toString(StandardCharsets.UTF_8.name()); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index 3e226514d57e..c80c3a882e52 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.streaming; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap.flatteningToImmutableListMultimap; import java.io.PrintWriter; import java.util.ArrayDeque; @@ -31,13 +32,9 @@ import java.util.Queue; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiConsumer; -import java.util.stream.Stream; import javax.annotation.Nullable; import javax.annotation.concurrent.GuardedBy; import javax.annotation.concurrent.ThreadSafe; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import 
org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; @@ -45,6 +42,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; import org.joda.time.Duration; @@ -106,29 +104,6 @@ private static String elapsedString(Instant start, Instant end) { return activeFor.toString().substring(2); } - private static Stream toHeartbeatRequestStream( - Entry> shardedKeyAndWorkQueue, - Instant refreshDeadline, - DataflowExecutionStateSampler sampler) { - ShardedKey shardedKey = shardedKeyAndWorkQueue.getKey(); - Deque workQueue = shardedKeyAndWorkQueue.getValue(); - - return workQueue.stream() - .map(ExecutableWork::work) - .filter(work -> work.getStartTime().isBefore(refreshDeadline)) - // Don't send heartbeats for queued work we already know is failed. - .filter(work -> !work.isFailed()) - .map( - work -> - Windmill.HeartbeatRequest.newBuilder() - .setShardingKey(shardedKey.shardingKey()) - .setWorkToken(work.getWorkItem().getWorkToken()) - .setCacheToken(work.getWorkItem().getCacheToken()) - .addAllLatencyAttribution( - work.getLatencyAttributions(/* isHeartbeat= */ true, sampler)) - .build()); - } - /** * Activates {@link Work} for the {@link ShardedKey}. Outcome can be 1 of 4 {@link * ActivateWorkResult} @@ -219,6 +194,31 @@ synchronized void failWorkForKey(Multimap failedWork) { } } + /** + * Returns a read only view of current active work. + * + * @implNote Do not return a reference to the underlying workQueue as iterations over it will + * cause a {@link java.util.ConcurrentModificationException} as it is not a thread-safe data + * structure. + */ + synchronized ImmutableListMultimap getReadOnlyActiveWork() { + return activeWork.entrySet().stream() + .collect( + flatteningToImmutableListMultimap( + Entry::getKey, + e -> + e.getValue().stream() + .map(executableWork -> (RefreshableWork) executableWork.work()))); + } + + synchronized ImmutableList getRefreshableWork(Instant refreshDeadline) { + return activeWork.values().stream() + .flatMap(Deque::stream) + .map(ExecutableWork::work) + .filter(work -> !work.isFailed() && work.getStartTime().isBefore(refreshDeadline)) + .collect(toImmutableList()); + } + private void incrementActiveWorkBudget(Work work) { activeGetWorkBudget.updateAndGet( getWorkBudget -> getWorkBudget.apply(1, work.getWorkItem().getSerializedSize())); @@ -324,13 +324,6 @@ private synchronized ImmutableMap getStuckCommitsAt( return stuckCommits.build(); } - synchronized ImmutableList getKeyHeartbeats( - Instant refreshDeadline, DataflowExecutionStateSampler sampler) { - return activeWork.entrySet().stream() - .flatMap(entry -> toHeartbeatRequestStream(entry, refreshDeadline, sampler)) - .collect(toImmutableList()); - } - /** * Returns the current aggregate {@link GetWorkBudget} that is active on the user worker. 
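
Editor's note: getReadOnlyActiveWork above leans on Guava's flatteningToImmutableListMultimap collector, which takes a key function plus a function returning a Stream of values per element. A toy example of the collector in isolation (data made up purely to show the shape; static import as added in this hunk):

    ImmutableListMultimap<String, Integer> grouped =
        ImmutableMap.of("a", ImmutableList.of(1, 2), "b", ImmutableList.of(3)).entrySet().stream()
            .collect(
                flatteningToImmutableListMultimap(
                    Map.Entry::getKey, entry -> entry.getValue().stream()));
    // grouped -> {a=[1, 2], b=[3]}
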
Active * means that the work is received from Windmill, being processed or queued to be processed in diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java index 434e78484799..f3b0ba16fbe2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java @@ -23,14 +23,13 @@ import java.util.Optional; import java.util.concurrent.ConcurrentLinkedQueue; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; import org.joda.time.Instant; @@ -147,10 +146,12 @@ private void forceExecute(ExecutableWork executableWork) { executor.forceExecute(executableWork, executableWork.work().getWorkItem().getSerializedSize()); } - /** Gets HeartbeatRequests for any work started before refreshDeadline. 
*/ - public ImmutableList getKeyHeartbeats( - Instant refreshDeadline, DataflowExecutionStateSampler sampler) { - return activeWorkState.getKeyHeartbeats(refreshDeadline, sampler); + public ImmutableListMultimap currentActiveWorkReadOnly() { + return activeWorkState.getReadOnlyActiveWork(); + } + + public ImmutableList getRefreshableWork(Instant refreshDeadline) { + return activeWorkState.getRefreshableWork(refreshDeadline); } public GetWorkBudget getActiveWorkBudget() { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java index dd34e85bc93c..8a00194887da 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationWorkExecutor.java @@ -24,6 +24,7 @@ import org.apache.beam.runners.core.metrics.ExecutionStateTracker; import org.apache.beam.runners.dataflow.worker.DataflowMapTaskExecutor; import org.apache.beam.runners.dataflow.worker.DataflowWorkExecutor; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; import org.apache.beam.runners.dataflow.worker.StreamingModeExecutionContext; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.util.common.worker.ElementCounter; @@ -45,7 +46,7 @@ * @implNote Once closed, it cannot be reused. */ // TODO(m-trieu): See if this can be combined/cleaned up with StreamingModeExecutionContext as the -// seperation of responsibilities are unclear. +// separation of responsibilities are unclear. 
@AutoValue @Internal @NotThreadSafe @@ -72,9 +73,11 @@ public final void executeWork( Work work, WindmillStateReader stateReader, SideInputStateFetcher sideInputStateFetcher, + OperationalLimits operationalLimits, Windmill.WorkItemCommitRequest.Builder outputBuilder) throws Exception { - context().start(key, work, stateReader, sideInputStateFetcher, outputBuilder); + context() + .start(key, work, stateReader, sideInputStateFetcher, operationalLimits, outputBuilder); workExecutor().execute(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java index bdf8a7814ea3..db279f066630 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ExecutableWork.java @@ -31,7 +31,7 @@ public static ExecutableWork create(Work work, Consumer executeWorkFn) { public abstract Work work(); - abstract Consumer executeWorkFn(); + public abstract Consumer executeWorkFn(); @Override public void run() { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java new file mode 100644 index 000000000000..c51b04f23719 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/RefreshableWork.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming; + +import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; + +/** View of {@link Work} that exposes an interface for work refreshing. 
*/ +@Internal +public interface RefreshableWork { + + WorkId id(); + + ShardedKey getShardedKey(); + + HeartbeatSender heartbeatSender(); + + ImmutableList getHeartbeatLatencyAttributions( + DataflowExecutionStateSampler sampler); + + void setFailed(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/StageInfo.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/StageInfo.java index 8f14ea26a461..a18ca8cfd6dc 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/StageInfo.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/StageInfo.java @@ -17,7 +17,7 @@ */ package org.apache.beam.runners.dataflow.worker.streaming; -import static org.apache.beam.runners.dataflow.worker.DataflowSystemMetrics.THROTTLING_MSECS_METRIC_NAME; +import static org.apache.beam.sdk.metrics.Metrics.THROTTLE_TIME_COUNTER_NAME; import com.google.api.services.dataflow.model.CounterStructuredName; import com.google.api.services.dataflow.model.CounterUpdate; @@ -28,7 +28,6 @@ import java.util.List; import org.apache.beam.runners.dataflow.worker.DataflowSystemMetrics; import org.apache.beam.runners.dataflow.worker.MetricsContainerRegistry; -import org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker; import org.apache.beam.runners.dataflow.worker.StreamingModeExecutionContext.StreamingModeExecutionStateRegistry; import org.apache.beam.runners.dataflow.worker.StreamingStepMetricsContainer; import org.apache.beam.runners.dataflow.worker.counters.Counter; @@ -93,20 +92,13 @@ public List extractCounterUpdates() { } /** - * Checks if the step counter affects any per-stage counters. Currently 'throttled_millis' is the + * Checks if the step counter affects any per-stage counters. Currently 'throttled-msecs' is the * only counter updated. 
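
Editor's note: RefreshableWork, defined above, is the narrow view the active-work refresher needs: identity, sharded key, a HeartbeatSender, and heartbeat latency attributions. One plausible consumer-side sketch, grouping refreshable work per sender before building heartbeat requests; this grouping code is an illustration only, not taken from the refresher in this patch, and computationState/refreshDeadline are assumed variables:

    ImmutableListMultimap<HeartbeatSender, RefreshableWork> heartbeatsBySender =
        computationState.getRefreshableWork(refreshDeadline).stream()
            .collect(
                ImmutableListMultimap.toImmutableListMultimap(
                    RefreshableWork::heartbeatSender, work -> work));
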
*/ private void translateKnownStepCounters(CounterUpdate stepCounterUpdate) { CounterStructuredName structuredName = stepCounterUpdate.getStructuredNameAndMetadata().getName(); - if ((THROTTLING_MSECS_METRIC_NAME.getNamespace().equals(structuredName.getOriginNamespace()) - && THROTTLING_MSECS_METRIC_NAME.getName().equals(structuredName.getName())) - || (StreamingDataflowWorker.BIGQUERY_STREAMING_INSERT_THROTTLE_TIME - .getNamespace() - .equals(structuredName.getOriginNamespace()) - && StreamingDataflowWorker.BIGQUERY_STREAMING_INSERT_THROTTLE_TIME - .getName() - .equals(structuredName.getName()))) { + if (THROTTLE_TIME_COUNTER_NAME.equals(structuredName.getName())) { long msecs = DataflowCounterUpdateExtractor.splitIntToLong(stepCounterUpdate.getInteger()); if (msecs > 0) { throttledMsecs().addValue(msecs); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java index ed3f2671b40c..e77823602eda 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java @@ -27,14 +27,14 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Optional; -import java.util.function.BiFunction; import java.util.function.Consumer; -import java.util.function.Function; import java.util.function.Supplier; import javax.annotation.concurrent.NotThreadSafe; import org.apache.beam.repackaged.core.org.apache.commons.lang3.tuple.Pair; import org.apache.beam.runners.dataflow.worker.ActiveMessageMetadata; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; @@ -45,7 +45,9 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.Commit; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Duration; @@ -58,7 +60,7 @@ */ @NotThreadSafe @Internal -public final class Work { +public final class Work implements RefreshableWork { private final ShardedKey shardedKey; private final WorkItem workItem; private final ProcessingContext processingContext; @@ -105,9 +107,10 @@ public static Work create( public static ProcessingContext createProcessingContext( String computationId, - BiFunction getKeyedDataFn, - Consumer workCommitter) { - return ProcessingContext.create(computationId, getKeyedDataFn, workCommitter); + GetDataClient getDataClient, + Consumer 
workCommitter, + HeartbeatSender heartbeatSender) { + return ProcessingContext.create(computationId, getDataClient, workCommitter, heartbeatSender); } private static LatencyAttribution.Builder createLatencyAttributionWithActiveLatencyBreakdown( @@ -151,12 +154,17 @@ public WorkItem getWorkItem() { return workItem; } + @Override public ShardedKey getShardedKey() { return shardedKey; } public Optional fetchKeyedState(KeyedGetDataRequest keyedGetDataRequest) { - return processingContext.keyedDataFetcher().apply(keyedGetDataRequest); + return processingContext.fetchKeyedState(keyedGetDataRequest); + } + + public GlobalData fetchSideInput(GlobalDataRequest request) { + return processingContext.getDataClient().getSideInputData(request); } public Watermarks watermarks() { @@ -180,6 +188,7 @@ public void setState(State state) { this.currentState = TimedState.create(state, now); } + @Override public void setFailed() { this.isFailed = true; } @@ -196,6 +205,11 @@ public String getLatencyTrackingId() { return latencyTrackingId; } + @Override + public HeartbeatSender heartbeatSender() { + return processingContext.heartbeatSender(); + } + public void queueCommit(WorkItemCommitRequest commitRequest, ComputationState computationState) { setState(State.COMMIT_QUEUED); processingContext.workCommitter().accept(Commit.create(commitRequest, computationState, this)); @@ -205,6 +219,7 @@ public WindmillStateReader createWindmillStateReader() { return WindmillStateReader.forWork(this); } + @Override public WorkId id() { return id; } @@ -216,7 +231,25 @@ private void recordGetWorkStreamLatencies(Collection getWork } } + @Override + public ImmutableList getHeartbeatLatencyAttributions( + DataflowExecutionStateSampler sampler) { + return getLatencyAttributions(/* isHeartbeat= */ true, sampler); + } + public ImmutableList getLatencyAttributions( + DataflowExecutionStateSampler sampler) { + return getLatencyAttributions(/* isHeartbeat= */ false, sampler); + } + + private Duration getTotalDurationAtLatencyAttributionState(LatencyAttribution.State state) { + Duration duration = totalDurationPerState.getOrDefault(state, Duration.ZERO); + return state == this.currentState.state().toLatencyAttributionState() + ? duration.plus(new Duration(this.currentState.startTime(), clock.get())) + : duration; + } + + private ImmutableList getLatencyAttributions( boolean isHeartbeat, DataflowExecutionStateSampler sampler) { return Arrays.stream(LatencyAttribution.State.values()) .map(state -> Pair.of(state, getTotalDurationAtLatencyAttributionState(state))) @@ -233,13 +266,6 @@ public ImmutableList getLatencyAttributions( .collect(toImmutableList()); } - private Duration getTotalDurationAtLatencyAttributionState(LatencyAttribution.State state) { - Duration duration = totalDurationPerState.getOrDefault(state, Duration.ZERO); - return state == this.currentState.state().toLatencyAttributionState() - ? 
duration.plus(new Duration(this.currentState.startTime(), clock.get())) - : duration; - } - private LatencyAttribution createLatencyAttribution( LatencyAttribution.State state, boolean isHeartbeat, @@ -314,25 +340,29 @@ public abstract static class ProcessingContext { private static ProcessingContext create( String computationId, - BiFunction getKeyedDataFn, - Consumer workCommitter) { + GetDataClient getDataClient, + Consumer workCommitter, + HeartbeatSender heartbeatSender) { return new AutoValue_Work_ProcessingContext( - computationId, - request -> Optional.ofNullable(getKeyedDataFn.apply(computationId, request)), - workCommitter); + computationId, getDataClient, heartbeatSender, workCommitter); } /** Computation that the {@link Work} belongs to. */ public abstract String computationId(); /** Handles GetData requests to streaming backend. */ - public abstract Function> - keyedDataFetcher(); + public abstract GetDataClient getDataClient(); + + public abstract HeartbeatSender heartbeatSender(); /** * {@link WorkCommitter} that commits completed work to the backend Windmill worker handling the * {@link WorkItem}. */ public abstract Consumer workCommitter(); + + private Optional fetchKeyedState(KeyedGetDataRequest request) { + return Optional.ofNullable(getDataClient().getStateData(computationId(), request)); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java index f8f8d1901914..d4e7f05d255f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/WorkId.java @@ -41,9 +41,9 @@ public static WorkId of(Windmill.WorkItem workItem) { .build(); } - abstract long cacheToken(); + public abstract long cacheToken(); - abstract long workToken(); + public abstract long workToken(); @AutoValue.Builder public abstract static class Builder { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java index 51d1507af5fe..d230aac54c63 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java @@ -157,7 +157,7 @@ private static Optional fetchConfigWithRetry( } } - private static StreamingEnginePipelineConfig createPipelineConfig(StreamingConfigTask config) { + private StreamingEnginePipelineConfig createPipelineConfig(StreamingConfigTask config) { StreamingEnginePipelineConfig.Builder pipelineConfig = StreamingEnginePipelineConfig.builder(); if (config.getUserStepToStateFamilyNameMap() != null) { pipelineConfig.setUserStepToStateFamilyNameMap(config.getUserStepToStateFamilyNameMap()); @@ -187,6 +187,20 @@ private static StreamingEnginePipelineConfig createPipelineConfig(StreamingConfi pipelineConfig.setMaxWorkItemCommitBytes(config.getMaxWorkItemCommitBytes().intValue()); } + if 
(config.getOperationalLimits() != null) { + if (config.getOperationalLimits().getMaxKeyBytes() != null + && config.getOperationalLimits().getMaxKeyBytes() > 0 + && config.getOperationalLimits().getMaxKeyBytes() <= Integer.MAX_VALUE) { + pipelineConfig.setMaxOutputKeyBytes(config.getOperationalLimits().getMaxKeyBytes()); + } + if (config.getOperationalLimits().getMaxProductionOutputBytes() != null + && config.getOperationalLimits().getMaxProductionOutputBytes() > 0 + && config.getOperationalLimits().getMaxProductionOutputBytes() <= Integer.MAX_VALUE) { + pipelineConfig.setMaxOutputValueBytes( + config.getOperationalLimits().getMaxProductionOutputBytes()); + } + } + return pipelineConfig.build(); } @@ -273,7 +287,7 @@ private synchronized void fetchInitialPipelineGlobalConfig() { private Optional fetchGlobalConfig() { return fetchConfigWithRetry(dataflowServiceClient::getGlobalStreamingConfigWorkItem) - .map(StreamingEngineComputationConfigFetcher::createPipelineConfig); + .map(config -> createPipelineConfig(config)); } @FunctionalInterface diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java index b5b761ada703..8f1ff93f6a49 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEnginePipelineConfig.java @@ -34,12 +34,18 @@ public abstract class StreamingEnginePipelineConfig { public static StreamingEnginePipelineConfig.Builder builder() { return new AutoValue_StreamingEnginePipelineConfig.Builder() .setMaxWorkItemCommitBytes(DEFAULT_MAX_WORK_ITEM_COMMIT_BYTES) + .setMaxOutputKeyBytes(Long.MAX_VALUE) + .setMaxOutputValueBytes(Long.MAX_VALUE) .setUserStepToStateFamilyNameMap(new HashMap<>()) .setWindmillServiceEndpoints(ImmutableSet.of()); } public abstract long maxWorkItemCommitBytes(); + public abstract long maxOutputKeyBytes(); + + public abstract long maxOutputValueBytes(); + public abstract Map userStepToStateFamilyNameMap(); public abstract ImmutableSet windmillServiceEndpoints(); @@ -48,6 +54,10 @@ public static StreamingEnginePipelineConfig.Builder builder() { public abstract static class Builder { public abstract Builder setMaxWorkItemCommitBytes(long value); + public abstract Builder setMaxOutputKeyBytes(long value); + + public abstract Builder setMaxOutputValueBytes(long value); + public abstract Builder setUserStepToStateFamilyNameMap(Map value); public abstract Builder setWindmillServiceEndpoints(ImmutableSet value); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarness.java similarity index 86% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarness.java index 4760062c5754..3556b7ce2919 100644 
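The operational-limits handling above only overrides the Long.MAX_VALUE defaults when the backend supplies a value that is non-null, positive, and no larger than Integer.MAX_VALUE. Below is a minimal standalone sketch of that guard, using a hypothetical helper rather than the worker's own classes:

final class OperationalLimitSketch {
  private static final long UNLIMITED = Long.MAX_VALUE;

  // A backend-supplied limit only replaces the unlimited default when it is usable.
  static long resolveLimit(Long configuredLimit) {
    return configuredLimit != null && configuredLimit > 0 && configuredLimit <= Integer.MAX_VALUE
        ? configuredLimit
        : UNLIMITED;
  }

  public static void main(String[] args) {
    System.out.println(resolveLimit(null));     // Long.MAX_VALUE: no limit configured
    System.out.println(resolveLimit(0L));       // Long.MAX_VALUE: non-positive values are ignored
    System.out.println(resolveLimit(1048576L)); // 1048576: a valid override is applied
  }
}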
--- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarness.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap.toImmutableMap; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet.toImmutableSet; @@ -45,6 +45,10 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; @@ -66,16 +70,19 @@ import org.slf4j.LoggerFactory; /** - * Client for StreamingEngine. Given a {@link GetWorkBudget}, divides the budget and starts the - * {@link WindmillStream.GetWorkStream}(s). + * {@link StreamingWorkerHarness} implementation that manages fan out to multiple backend + * destinations. Given a {@link GetWorkBudget}, divides the budget and starts the {@link + * WindmillStream.GetWorkStream}(s). */ @Internal @CheckReturnValue @ThreadSafe -public final class StreamingEngineClient { - private static final Logger LOG = LoggerFactory.getLogger(StreamingEngineClient.class); +public final class FanOutStreamingEngineWorkerHarness implements StreamingWorkerHarness { + private static final Logger LOG = + LoggerFactory.getLogger(FanOutStreamingEngineWorkerHarness.class); private static final String PUBLISH_NEW_WORKER_METADATA_THREAD = "PublishNewWorkerMetadataThread"; private static final String CONSUME_NEW_WORKER_METADATA_THREAD = "ConsumeNewWorkerMetadataThread"; + private final JobHeader jobHeader; private final GrpcWindmillStreamFactory streamFactory; private final WorkItemScheduler workItemScheduler; @@ -91,6 +98,7 @@ public final class StreamingEngineClient { private final Supplier getWorkerMetadataStream; private final Queue newWindmillEndpoints; private final Function workCommitterFactory; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; /** Writes are guarded by synchronization, reads are lock free. 
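The comment above states the concurrency contract for the connections field: writes go through synchronized methods while reads use the AtomicReference without locking. A simplified, self-contained sketch of that publish-a-snapshot pattern, with hypothetical names and JDK-only types:

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;

final class SnapshotSketch {
  // Readers call get() lock-free; writers swap in a complete, immutable snapshot.
  private final AtomicReference<List<String>> endpoints =
      new AtomicReference<>(Collections.<String>emptyList());

  // Writers are serialized so concurrent updates cannot interleave partial state.
  synchronized void publish(List<String> newEndpoints) {
    endpoints.set(Collections.unmodifiableList(newEndpoints));
  }

  List<String> current() {
    return endpoints.get(); // always a fully published snapshot
  }

  public static void main(String[] args) {
    SnapshotSketch sketch = new SnapshotSketch();
    sketch.publish(Arrays.asList("host-a:443", "host-b:443"));
    System.out.println(sketch.current());
  }
}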
*/ private final AtomicReference connections; @@ -98,7 +106,7 @@ public final class StreamingEngineClient { private volatile boolean started; @SuppressWarnings("FutureReturnValueIgnored") - private StreamingEngineClient( + private FanOutStreamingEngineWorkerHarness( JobHeader jobHeader, GetWorkBudget totalGetWorkBudget, GrpcWindmillStreamFactory streamFactory, @@ -107,8 +115,10 @@ private StreamingEngineClient( GetWorkBudgetDistributor getWorkBudgetDistributor, GrpcDispatcherClient dispatcherClient, long clientId, - Function workCommitterFactory) { + Function workCommitterFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { this.jobHeader = jobHeader; + this.getDataMetricTracker = getDataMetricTracker; this.started = false; this.streamFactory = streamFactory; this.workItemScheduler = workItemScheduler; @@ -147,23 +157,15 @@ private StreamingEngineClient( private static ExecutorService singleThreadedExecutorServiceOf(String threadName) { return Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder() - .setNameFormat(threadName) - .setUncaughtExceptionHandler( - (t, e) -> { - LOG.error( - "{} failed due to uncaught exception during execution. ", t.getName(), e); - throw new StreamingEngineClientException(e); - }) - .build()); + new ThreadFactoryBuilder().setNameFormat(threadName).build()); } /** - * Creates an instance of {@link StreamingEngineClient} in a non-started state. + * Creates an instance of {@link FanOutStreamingEngineWorkerHarness} in a non-started state. * * @implNote Does not block the calling thread. Callers must explicitly call {@link #start()}. */ - public static StreamingEngineClient create( + public static FanOutStreamingEngineWorkerHarness create( JobHeader jobHeader, GetWorkBudget totalGetWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, @@ -171,8 +173,9 @@ public static StreamingEngineClient create( ChannelCachingStubFactory channelCachingStubFactory, GetWorkBudgetDistributor getWorkBudgetDistributor, GrpcDispatcherClient dispatcherClient, - Function workCommitterFactory) { - return new StreamingEngineClient( + Function workCommitterFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + return new FanOutStreamingEngineWorkerHarness( jobHeader, totalGetWorkBudget, streamingEngineStreamFactory, @@ -181,11 +184,12 @@ public static StreamingEngineClient create( getWorkBudgetDistributor, dispatcherClient, /* clientId= */ new Random().nextLong(), - workCommitterFactory); + workCommitterFactory, + getDataMetricTracker); } @VisibleForTesting - static StreamingEngineClient forTesting( + static FanOutStreamingEngineWorkerHarness forTesting( JobHeader jobHeader, GetWorkBudget totalGetWorkBudget, GrpcWindmillStreamFactory streamFactory, @@ -194,9 +198,10 @@ static StreamingEngineClient forTesting( GetWorkBudgetDistributor getWorkBudgetDistributor, GrpcDispatcherClient dispatcherClient, long clientId, - Function workCommitterFactory) { - StreamingEngineClient streamingEngineClient = - new StreamingEngineClient( + Function workCommitterFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + FanOutStreamingEngineWorkerHarness fanOutStreamingEngineWorkProvider = + new FanOutStreamingEngineWorkerHarness( jobHeader, totalGetWorkBudget, streamFactory, @@ -205,12 +210,14 @@ static StreamingEngineClient forTesting( getWorkBudgetDistributor, dispatcherClient, clientId, - workCommitterFactory); - streamingEngineClient.start(); - return streamingEngineClient; + workCommitterFactory, + getDataMetricTracker); + 
fanOutStreamingEngineWorkProvider.start(); + return fanOutStreamingEngineWorkProvider; } @SuppressWarnings("ReturnValueIgnored") + @Override public synchronized void start() { Preconditions.checkState(!started, "StreamingEngineClient cannot start twice."); // Starts the stream, this value is memoized. @@ -240,7 +247,7 @@ public ImmutableSet currentWindmillEndpoints() { * Fetches {@link GetDataStream} mapped to globalDataKey if one exists, or defaults to {@link * GetDataStream} pointing to dispatcher. */ - public GetDataStream getGlobalDataStream(String globalDataKey) { + private GetDataStream getGlobalDataStream(String globalDataKey) { return Optional.ofNullable(connections.get().globalDataStreams().get(globalDataKey)) .map(Supplier::get) .orElseGet( @@ -261,9 +268,10 @@ private void startWorkerMetadataConsumer() { } @VisibleForTesting - public synchronized void finish() { + @Override + public synchronized void shutdown() { Preconditions.checkState(started, "StreamingEngineClient never started."); - getWorkerMetadataStream.get().close(); + getWorkerMetadataStream.get().halfClose(); getWorkBudgetRefresher.stop(); newWorkerMetadataPublisher.shutdownNow(); newWorkerMetadataConsumer.shutdownNow(); @@ -325,10 +333,13 @@ private synchronized ImmutableMap createNewWindmil .collect( toImmutableMap( Function.identity(), - // Reuse existing stubs if they exist. endpoint -> - currentConnections.getOrDefault( - endpoint, WindmillConnection.from(endpoint, this::createWindmillStub)))); + // Reuse existing stubs if they exist. Optional.orElseGet only calls the + // supplier if the value is not present, preventing constructing expensive + // objects. + Optional.ofNullable(currentConnections.get(endpoint)) + .orElseGet( + () -> WindmillConnection.from(endpoint, this::createWindmillStub)))); } private synchronized ImmutableMap @@ -390,7 +401,7 @@ private WindmillStreamSender createAndStartWindmillStreamSenderFor( // GetWorkBudgetDistributor. 
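The endpoint-reuse change above replaces getOrDefault with Optional.ofNullable(...).orElseGet(...) so that the WindmillConnection supplier only runs on a cache miss. A toy, self-contained illustration of that difference, using hypothetical types rather than the worker's:

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

final class ConnectionReuseSketch {
  static final class Connection {
    Connection() {
      System.out.println("building new connection"); // only printed on a cache miss
    }
  }

  public static void main(String[] args) {
    Map<String, Connection> cache = new HashMap<>();
    cache.put("endpoint-a", new Connection()); // prints once while seeding the cache

    // Cache hit: the supplier is never invoked, so nothing extra is printed.
    Connection reused = Optional.ofNullable(cache.get("endpoint-a")).orElseGet(Connection::new);
    System.out.println("reused cached instance: " + (reused == cache.get("endpoint-a")));

    // Cache miss: the supplier runs and "building new connection" is printed again.
    Connection fresh = Optional.ofNullable(cache.get("endpoint-b")).orElseGet(Connection::new);
    System.out.println("built a fresh instance: " + (fresh != null));
  }
}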
WindmillStreamSender windmillStreamSender = WindmillStreamSender.create( - connection.stub(), + connection, GetWorkRequest.newBuilder() .setClientId(clientId) .setJobId(jobHeader.getJobId()) @@ -400,6 +411,9 @@ private WindmillStreamSender createAndStartWindmillStreamSenderFor( GetWorkBudget.noBudget(), streamFactory, workItemScheduler, + getDataStream -> + StreamGetDataClient.create( + getDataStream, this::getGlobalDataStream, getDataMetricTracker), workCommitterFactory); windmillStreamSender.startStreams(); return windmillStreamSender; @@ -411,11 +425,4 @@ private CloudWindmillServiceV1Alpha1Stub createWindmillStub(Endpoint endpoint) { .map(channelCachingStubFactory::createWindmillServiceStub) .orElseGet(dispatcherClient::getWindmillServiceStub); } - - private static class StreamingEngineClientException extends IllegalStateException { - - private StreamingEngineClientException(Throwable exception) { - super(exception); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java new file mode 100644 index 000000000000..bc93e6d89c41 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.streaming.harness; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; + +import com.google.auto.value.AutoBuilder; +import com.google.auto.value.AutoOneOf; +import java.util.Collections; +import java.util.Optional; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; +import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; +import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub.RpcException; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.work.processing.StreamingWorkScheduler; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link StreamingWorkerHarness} implementations that fetch {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem}(s) from a single source. 
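For orientation, a hedged sketch of how a caller might assemble the harness declared below through its @AutoBuilder-generated builder. Every right-hand-side variable (workCommitter, getDataClient, heartbeatSender, scheduler, computationStates, windmillServer, getWorkRequest) is a hypothetical placeholder, not the worker's actual wiring:

SingleSourceWorkerHarness harness =
    SingleSourceWorkerHarness.builder()
        .setWorkCommitter(workCommitter)           // commits finished work items
        .setGetDataClient(getDataClient)           // serves keyed state and side input reads
        .setHeartbeatSender(heartbeatSender)       // reports in-progress keys to the backend
        .setStreamingWorkScheduler(scheduler)
        .setWaitForResources(() -> {})             // no-op throttle for this sketch
        .setComputationStateFetcher(id -> Optional.ofNullable(computationStates.get(id)))
        .setGetWorkSender(
            SingleSourceWorkerHarness.GetWorkSender.forAppliance(
                () -> windmillServer.getWork(getWorkRequest)))
        .build();
harness.start();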
+ */ +@Internal +public final class SingleSourceWorkerHarness implements StreamingWorkerHarness { + private static final Logger LOG = LoggerFactory.getLogger(SingleSourceWorkerHarness.class); + private static final int GET_WORK_STREAM_TIMEOUT_MINUTES = 3; + + private final AtomicBoolean isRunning; + private final WorkCommitter workCommitter; + private final GetDataClient getDataClient; + private final HeartbeatSender heartbeatSender; + private final StreamingWorkScheduler streamingWorkScheduler; + private final Runnable waitForResources; + private final Function> computationStateFetcher; + private final ExecutorService workProviderExecutor; + private final GetWorkSender getWorkSender; + + SingleSourceWorkerHarness( + WorkCommitter workCommitter, + GetDataClient getDataClient, + HeartbeatSender heartbeatSender, + StreamingWorkScheduler streamingWorkScheduler, + Runnable waitForResources, + Function> computationStateFetcher, + GetWorkSender getWorkSender) { + this.workCommitter = workCommitter; + this.getDataClient = getDataClient; + this.heartbeatSender = heartbeatSender; + this.streamingWorkScheduler = streamingWorkScheduler; + this.waitForResources = waitForResources; + this.computationStateFetcher = computationStateFetcher; + this.workProviderExecutor = + Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder() + .setDaemon(true) + .setPriority(Thread.MIN_PRIORITY) + .setNameFormat("DispatchThread") + .build()); + this.isRunning = new AtomicBoolean(false); + this.getWorkSender = getWorkSender; + } + + public static SingleSourceWorkerHarness.Builder builder() { + return new AutoBuilder_SingleSourceWorkerHarness_Builder(); + } + + @Override + public void start() { + Preconditions.checkState( + isRunning.compareAndSet(false, true), + "Multiple calls to {}.start() are not allowed.", + getClass()); + workCommitter.start(); + workProviderExecutor.execute( + () -> { + getDispatchLoop().run(); + LOG.info("Dispatch done"); + }); + } + + private Runnable getDispatchLoop() { + switch (getWorkSender.getKind()) { + case APPLIANCE: + LOG.info("Starting Dispatch in Appliance mode."); + return () -> applianceDispatchLoop(getWorkSender.appliance()); + case STREAMING_ENGINE: + LOG.info("Starting Dispatch in Streaming Engine mode."); + return () -> streamingEngineDispatchLoop(getWorkSender.streamingEngine()); + default: + // Will never happen switch is exhaustive. 
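The appliance dispatch loop further below retries GetWork with a capped exponential backoff, starting at 1 ms, doubling after each empty response, and capping at 1000 ms. A standalone sketch of just that schedule, with an illustrative attempt count:

final class BackoffSketch {
  public static void main(String[] args) {
    int backoffMillis = 1;
    for (int attempt = 1; attempt <= 12; attempt++) {
      System.out.printf("attempt %d: sleep %d ms before retrying GetWork%n", attempt, backoffMillis);
      backoffMillis = Math.min(1000, backoffMillis * 2); // same doubling-with-cap as the loop below
    }
  }
}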
+ throw new IllegalStateException("Invalid GetWorkSender.Kind: " + getWorkSender.getKind()); + } + } + + @Override + public void shutdown() { + Preconditions.checkState( + isRunning.compareAndSet(true, false), + "Multiple calls to {}.shutdown() are not allowed.", + getClass()); + workProviderExecutor.shutdown(); + boolean isTerminated = false; + try { + isTerminated = workProviderExecutor.awaitTermination(10, TimeUnit.SECONDS); + } catch (InterruptedException e) { + LOG.warn("Unable to shutdown {}", getClass()); + } + + if (!isTerminated) { + workProviderExecutor.shutdownNow(); + } + workCommitter.stop(); + } + + private void streamingEngineDispatchLoop( + Function getWorkStreamFactory) { + while (isRunning.get()) { + WindmillStream.GetWorkStream stream = + getWorkStreamFactory.apply( + (computationId, + inputDataWatermark, + synchronizedProcessingTime, + workItem, + getWorkStreamLatencies) -> + computationStateFetcher + .apply(computationId) + .ifPresent( + computationState -> { + waitForResources.run(); + streamingWorkScheduler.scheduleWork( + computationState, + workItem, + Watermarks.builder() + .setInputDataWatermark( + Preconditions.checkNotNull(inputDataWatermark)) + .setSynchronizedProcessingTime(synchronizedProcessingTime) + .setOutputDataWatermark(workItem.getOutputDataWatermark()) + .build(), + Work.createProcessingContext( + computationId, + getDataClient, + workCommitter::commit, + heartbeatSender), + getWorkStreamLatencies); + })); + try { + // Reconnect every now and again to enable better load balancing. + // If at any point the server closes the stream, we will reconnect immediately; otherwise + // we half-close the stream after some time and create a new one. + if (!stream.awaitTermination(GET_WORK_STREAM_TIMEOUT_MINUTES, TimeUnit.MINUTES)) { + stream.halfClose(); + } + } catch (InterruptedException e) { + // Continue processing until !running.get() + } + } + } + + private void applianceDispatchLoop(Supplier getWorkFn) { + while (isRunning.get()) { + waitForResources.run(); + int backoff = 1; + Windmill.GetWorkResponse workResponse = null; + do { + try { + workResponse = getWorkFn.get(); + if (workResponse.getWorkCount() > 0) { + break; + } + } catch (RpcException e) { + LOG.warn("GetWork failed, retrying:", e); + } + sleepUninterruptibly(backoff, TimeUnit.MILLISECONDS); + backoff = Math.min(1000, backoff * 2); + } while (isRunning.get()); + for (Windmill.ComputationWorkItems computationWork : + Preconditions.checkNotNull(workResponse).getWorkList()) { + String computationId = computationWork.getComputationId(); + Optional maybeComputationState = + computationStateFetcher.apply(computationId); + if (!maybeComputationState.isPresent()) { + continue; + } + + ComputationState computationState = maybeComputationState.get(); + Instant inputDataWatermark = + WindmillTimeUtils.windmillToHarnessWatermark(computationWork.getInputDataWatermark()); + Watermarks.Builder watermarks = + Watermarks.builder() + .setInputDataWatermark(Preconditions.checkNotNull(inputDataWatermark)) + .setSynchronizedProcessingTime( + WindmillTimeUtils.windmillToHarnessWatermark( + computationWork.getDependentRealtimeInputWatermark())); + + for (Windmill.WorkItem workItem : computationWork.getWorkList()) { + streamingWorkScheduler.scheduleWork( + computationState, + workItem, + watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), + Work.createProcessingContext( + computationId, getDataClient, workCommitter::commit, heartbeatSender), + /* getWorkStreamLatencies= */ 
Collections.emptyList()); + } + } + } + } + + @AutoBuilder + public interface Builder { + Builder setWorkCommitter(WorkCommitter workCommitter); + + Builder setGetDataClient(GetDataClient getDataClient); + + Builder setHeartbeatSender(HeartbeatSender heartbeatSender); + + Builder setStreamingWorkScheduler(StreamingWorkScheduler streamingWorkScheduler); + + Builder setWaitForResources(Runnable waitForResources); + + Builder setComputationStateFetcher( + Function> computationStateFetcher); + + Builder setGetWorkSender(GetWorkSender getWorkSender); + + SingleSourceWorkerHarness build(); + } + + @AutoOneOf(GetWorkSender.Kind.class) + public abstract static class GetWorkSender { + + public static GetWorkSender forStreamingEngine( + Function getWorkStreamFactory) { + return AutoOneOf_SingleSourceWorkerHarness_GetWorkSender.streamingEngine( + getWorkStreamFactory); + } + + public static GetWorkSender forAppliance(Supplier getWorkFn) { + return AutoOneOf_SingleSourceWorkerHarness_GetWorkSender.appliance(getWorkFn); + } + + abstract Function streamingEngine(); + + abstract Supplier appliance(); + + abstract Kind getKind(); + + enum Kind { + STREAMING_ENGINE, + APPLIANCE + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineConnectionState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingEngineConnectionState.java similarity index 97% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineConnectionState.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingEngineConnectionState.java index 8d784456d655..3c85ee6abe1f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineConnectionState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingEngineConnectionState.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import com.google.auto.value.AutoValue; import java.util.function.Supplier; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java new file mode 100644 index 000000000000..c1b4570e2260 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.harness; + +import org.apache.beam.sdk.annotations.Internal; + +/** Provides an interface to start streaming worker processing. */ +@Internal +public interface StreamingWorkerHarness { + void start(); + + void shutdown(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSender.java similarity index 81% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSender.java index e9f008eb522e..45aa403ee71b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSender.java @@ -15,22 +15,26 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.StreamingEngineThrottleTimers; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetSpender; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.FixedStreamHeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; @@ -55,7 +59,7 @@ */ @Internal @ThreadSafe -public class WindmillStreamSender { +final class WindmillStreamSender implements GetWorkBudgetSpender { private final AtomicBoolean started; private final AtomicReference getWorkBudget; private final Supplier getWorkStream; @@ -65,11 +69,12 @@ public class WindmillStreamSender { private final StreamingEngineThrottleTimers streamingEngineThrottleTimers; private WindmillStreamSender( - CloudWindmillServiceV1Alpha1Stub stub, + WindmillConnection connection, GetWorkRequest getWorkRequest, AtomicReference getWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, WorkItemScheduler workItemScheduler, + Function getDataClientFactory, Function workCommitterFactory) { this.started = new AtomicBoolean(false); this.getWorkBudget = getWorkBudget; @@ -83,39 +88,42 @@ private WindmillStreamSender( Suppliers.memoize( () -> streamingEngineStreamFactory.createGetDataStream( - stub, streamingEngineThrottleTimers.getDataThrottleTimer())); + connection.stub(), streamingEngineThrottleTimers.getDataThrottleTimer())); this.commitWorkStream = Suppliers.memoize( () -> streamingEngineStreamFactory.createCommitWorkStream( - stub, streamingEngineThrottleTimers.commitWorkThrottleTimer())); + connection.stub(), streamingEngineThrottleTimers.commitWorkThrottleTimer())); this.workCommitter = Suppliers.memoize(() -> workCommitterFactory.apply(commitWorkStream.get())); this.getWorkStream = Suppliers.memoize( () -> streamingEngineStreamFactory.createDirectGetWorkStream( - stub, + connection, withRequestBudget(getWorkRequest, getWorkBudget.get()), streamingEngineThrottleTimers.getWorkThrottleTimer(), - getDataStream, + () -> FixedStreamHeartbeatSender.create(getDataStream.get()), + () -> 
getDataClientFactory.apply(getDataStream.get()), workCommitter, workItemScheduler)); } - public static WindmillStreamSender create( - CloudWindmillServiceV1Alpha1Stub stub, + static WindmillStreamSender create( + WindmillConnection connection, GetWorkRequest getWorkRequest, GetWorkBudget getWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, WorkItemScheduler workItemScheduler, + Function getDataClientFactory, Function workCommitterFactory) { return new WindmillStreamSender( - stub, + connection, getWorkRequest, new AtomicReference<>(getWorkBudget), streamingEngineStreamFactory, workItemScheduler, + getDataClientFactory, workCommitterFactory); } @@ -138,13 +146,14 @@ void closeAllStreams() { // streaming RPCs by possibly making calls over the network. Do not close the streams unless // they have already been started. if (started.get()) { - getWorkStream.get().close(); - getDataStream.get().close(); + getWorkStream.get().shutdown(); + getDataStream.get().shutdown(); workCommitter.get().stop(); - commitWorkStream.get().close(); + commitWorkStream.get().shutdown(); } } + @Override public void adjustBudget(long itemsDelta, long bytesDelta) { getWorkBudget.set(getWorkBudget.get().apply(itemsDelta, bytesDelta)); if (started.get()) { @@ -152,19 +161,16 @@ public void adjustBudget(long itemsDelta, long bytesDelta) { } } - public void adjustBudget(GetWorkBudget adjustment) { - adjustBudget(adjustment.items(), adjustment.bytes()); - } - - public GetWorkBudget remainingGetWorkBudget() { + @Override + public GetWorkBudget remainingBudget() { return started.get() ? getWorkStream.get().remainingBudget() : getWorkBudget.get(); } - public long getAndResetThrottleTime() { + long getAndResetThrottleTime() { return streamingEngineThrottleTimers.getAndResetThrottleTime(); } - public long getCurrentActiveCommitBytes() { + long getCurrentActiveCommitBytes() { return started.get() ? 
workCommitter.get().currentActiveCommitBytes() : 0; } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java index 7fd2487575c2..303cdeb94f8c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java @@ -30,11 +30,11 @@ import java.util.function.Function; import javax.annotation.concurrent.NotThreadSafe; import org.apache.beam.runners.core.InMemoryMultimapSideInputView; -import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.IterableCoder; import org.apache.beam.sdk.coders.KvCoder; @@ -46,14 +46,14 @@ import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.WindowingStrategy; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Class responsible for fetching state from the windmill server. */ +/** Class responsible for fetching side input state from the streaming backend. */ @NotThreadSafe +@Internal public class SideInputStateFetcher { private static final Logger LOG = LoggerFactory.getLogger(SideInputStateFetcher.class); @@ -64,13 +64,6 @@ public class SideInputStateFetcher { private final Function fetchGlobalDataFn; private long bytesRead = 0L; - public SideInputStateFetcher( - Function fetchGlobalDataFn, - DataflowStreamingPipelineOptions options) { - this(fetchGlobalDataFn, SideInputCache.create(options)); - } - - @VisibleForTesting SideInputStateFetcher( Function fetchGlobalDataFn, SideInputCache sideInputCache) { this.fetchGlobalDataFn = fetchGlobalDataFn; @@ -103,12 +96,56 @@ private static Coder getCoder(PCollectionView view) { return view.getCoderInternal(); } - /** Returns a view of the underlying cache that keeps track of bytes read separately. */ - public SideInputStateFetcher byteTrackingView() { - return new SideInputStateFetcher(fetchGlobalDataFn, sideInputCache); + private static SideInput createSideInputCacheEntry( + PCollectionView view, GlobalData data) throws IOException { + Iterable rawData = decodeRawData(view, data); + switch (getViewFn(view).getMaterialization().getUrn()) { + case ITERABLE_MATERIALIZATION_URN: + { + @SuppressWarnings({ + "unchecked", // ITERABLE_MATERIALIZATION_URN has ViewFn. 
+ "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) + }) + ViewFn viewFn = (ViewFn) getViewFn(view); + return SideInput.ready(viewFn.apply(() -> rawData), data.getData().size()); + } + case MULTIMAP_MATERIALIZATION_URN: + { + @SuppressWarnings({ + "unchecked", // MULTIMAP_MATERIALIZATION_URN has ViewFn. + "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) + }) + ViewFn viewFn = (ViewFn) getViewFn(view); + Coder keyCoder = ((KvCoder) getCoder(view)).getKeyCoder(); + + @SuppressWarnings({ + "unchecked", // Safe since multimap rawData is of type Iterable> + "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) + }) + T multimapSideInputValue = + viewFn.apply( + InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) rawData)); + return SideInput.ready(multimapSideInputValue, data.getData().size()); + } + default: + { + throw new IllegalStateException( + "Unknown side input materialization format requested: " + + getViewFn(view).getMaterialization().getUrn()); + } + } } - public long getBytesRead() { + private static void validateViewMaterialization(PCollectionView view) { + String materializationUrn = getViewFn(view).getMaterialization().getUrn(); + checkState( + SUPPORTED_MATERIALIZATIONS.contains(materializationUrn), + "Only materialization's of type %s supported, received %s", + SUPPORTED_MATERIALIZATIONS, + materializationUrn); + } + + public final long getBytesRead() { return bytesRead; } @@ -200,53 +237,4 @@ private SideInput loadSideInputFromWindmill( bytesRead += data.getSerializedSize(); return data.getIsReady() ? createSideInputCacheEntry(view, data) : SideInput.notReady(); } - - private void validateViewMaterialization(PCollectionView view) { - String materializationUrn = getViewFn(view).getMaterialization().getUrn(); - checkState( - SUPPORTED_MATERIALIZATIONS.contains(materializationUrn), - "Only materialization's of type %s supported, received %s", - SUPPORTED_MATERIALIZATIONS, - materializationUrn); - } - - private SideInput createSideInputCacheEntry(PCollectionView view, GlobalData data) - throws IOException { - Iterable rawData = decodeRawData(view, data); - switch (getViewFn(view).getMaterialization().getUrn()) { - case ITERABLE_MATERIALIZATION_URN: - { - @SuppressWarnings({ - "unchecked", // ITERABLE_MATERIALIZATION_URN has ViewFn. - "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) - }) - ViewFn viewFn = (ViewFn) getViewFn(view); - return SideInput.ready(viewFn.apply(() -> rawData), data.getData().size()); - } - case MULTIMAP_MATERIALIZATION_URN: - { - @SuppressWarnings({ - "unchecked", // MULTIMAP_MATERIALIZATION_URN has ViewFn. 
- "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) - }) - ViewFn viewFn = (ViewFn) getViewFn(view); - Coder keyCoder = ((KvCoder) getCoder(view)).getKeyCoder(); - - @SuppressWarnings({ - "unchecked", // Safe since multimap rawData is of type Iterable> - "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) - }) - T multimapSideInputValue = - viewFn.apply( - InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) rawData)); - return SideInput.ready(multimapSideInputValue, data.getData().size()); - } - default: - { - throw new IllegalStateException( - "Unknown side input materialization format requested: " - + getViewFn(view).getMaterialization().getUrn()); - } - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java new file mode 100644 index 000000000000..fd42b9ff1801 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; + +import java.util.function.Function; +import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.sdk.annotations.Internal; + +/** + * Factory class for generating {@link SideInputStateFetcher} instances that share a {@link + * SideInputCache}. 
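A hedged usage sketch of the factory defined here: one factory, and therefore one shared SideInputCache, is created from the pipeline options, and a per-work-item fetcher is then built around a GlobalData lookup such as Work#fetchSideInput. The options and work variables are assumed placeholders:

// Sketch only: "options" and "work" are assumed to be in scope.
SideInputStateFetcherFactory fetcherFactory = SideInputStateFetcherFactory.fromOptions(options);
SideInputStateFetcher fetcher = fetcherFactory.createSideInputStateFetcher(work::fetchSideInput);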
+ */ +@Internal +public final class SideInputStateFetcherFactory { + private final SideInputCache globalSideInputCache; + + private SideInputStateFetcherFactory(SideInputCache globalSideInputCache) { + this.globalSideInputCache = globalSideInputCache; + } + + public static SideInputStateFetcherFactory fromOptions(DataflowStreamingPipelineOptions options) { + return new SideInputStateFetcherFactory(SideInputCache.create(options)); + } + + public SideInputStateFetcher createSideInputStateFetcher( + Function fetchGlobalDataFn) { + return new SideInputStateFetcher(fetchGlobalDataFn, globalSideInputCache); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java new file mode 100644 index 000000000000..2cd3748eb31b --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ApplianceWindmillClient.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill; + +import org.apache.beam.sdk.annotations.Internal; + +/** Client for WindmillService via Streaming Appliance. */ +@Internal +public interface ApplianceWindmillClient { + /** Get a batch of work to process. */ + Windmill.GetWorkResponse getWork(Windmill.GetWorkRequest request); + + /** Get additional data such as state needed to process work. */ + Windmill.GetDataResponse getData(Windmill.GetDataRequest request); + + /** Commit the work, issuing any output productions, state modifications etc. */ + Windmill.CommitWorkResponse commitWork(Windmill.CommitWorkRequest request); + + /** Get configuration data from the server. */ + Windmill.GetConfigResponse getConfig(Windmill.GetConfigRequest request); + + /** Report execution information to the server. */ + Windmill.ReportStatsResponse reportStats(Windmill.ReportStatsRequest request); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java new file mode 100644 index 000000000000..e02e6c112358 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamingEngineWindmillClient.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill; + +import java.util.Set; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; + +/** Client for WindmillService via Streaming Engine. */ +@Internal +public interface StreamingEngineWindmillClient { + /** Returns the windmill service endpoints set by setWindmillServiceEndpoints */ + ImmutableSet getWindmillServiceEndpoints(); + + /** + * Sets the new endpoints used to talk to windmill. Upon first call, the stubs are initialized. On + * subsequent calls, if endpoints are different from previous values new stubs are created, + * replacing the previous ones. + */ + void setWindmillServiceEndpoints(Set endpoints); + + /** + * Gets work to process, returned as a stream. + * + *
<p>
Each time a WorkItem is received, it will be passed to the given receiver. The returned + * GetWorkStream object can be used to control the lifetime of the stream. + */ + WindmillStream.GetWorkStream getWorkStream( + Windmill.GetWorkRequest request, WorkItemReceiver receiver); + + /** Get additional data such as state needed to process work, returned as a stream. */ + WindmillStream.GetDataStream getDataStream(); + + /** Returns a stream allowing individual WorkItemCommitRequests to be streamed to Windmill. */ + WindmillStream.CommitWorkStream commitWorkStream(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java index a20c2f02b269..7d199afc0861 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillConnection.java @@ -27,6 +27,8 @@ @AutoValue @Internal public abstract class WindmillConnection { + private static final String NO_BACKEND_WORKER_TOKEN = ""; + public static WindmillConnection from( Endpoint windmillEndpoint, Function endpointToStubFn) { @@ -40,23 +42,24 @@ public static WindmillConnection from( } public static Builder builder() { - return new AutoValue_WindmillConnection.Builder(); + return new AutoValue_WindmillConnection.Builder() + .setBackendWorkerToken(NO_BACKEND_WORKER_TOKEN); } - public abstract Optional backendWorkerToken(); + public abstract String backendWorkerToken(); public abstract Optional directEndpoint(); public abstract CloudWindmillServiceV1Alpha1Stub stub(); @AutoValue.Builder - abstract static class Builder { + public abstract static class Builder { abstract Builder setBackendWorkerToken(String backendWorkerToken); public abstract Builder setDirectEndpoint(WindmillServiceAddress value); - abstract Builder setStub(CloudWindmillServiceV1Alpha1Stub stub); + public abstract Builder setStub(CloudWindmillServiceV1Alpha1Stub stub); - abstract WindmillConnection build(); + public abstract WindmillConnection build(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java index 0785ae96626e..5f7fd6da9d4b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java @@ -59,11 +59,6 @@ public ImmutableSet getWindmillServiceEndpoints() { return ImmutableSet.of(); } - @Override - public boolean isReady() { - return true; - } - @Override public Windmill.GetWorkResponse getWork(Windmill.GetWorkRequest workRequest) { try { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java index 7d0c4f5aba32..cd753cb8ec91 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java @@ -18,65 +18,11 @@ package org.apache.beam.runners.dataflow.worker.windmill; import java.io.PrintWriter; -import java.util.Set; import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; /** Stub for communicating with a Windmill server. */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public abstract class WindmillServerStub implements StatusDataProvider { - - /** - * Sets the new endpoints used to talk to windmill. Upon first call, the stubs are initialized. On - * subsequent calls, if endpoints are different from previous values new stubs are created, - * replacing the previous ones. - */ - public abstract void setWindmillServiceEndpoints(Set endpoints); - - /* - * Returns the windmill service endpoints set by setWindmillServiceEndpoints - */ - public abstract ImmutableSet getWindmillServiceEndpoints(); - - /** Returns true iff this WindmillServerStub is ready for making API calls. */ - public abstract boolean isReady(); - - /** Get a batch of work to process. */ - public abstract Windmill.GetWorkResponse getWork(Windmill.GetWorkRequest request); - - /** Get additional data such as state needed to process work. */ - public abstract Windmill.GetDataResponse getData(Windmill.GetDataRequest request); - - /** Commit the work, issuing any output productions, state modifications etc. */ - public abstract Windmill.CommitWorkResponse commitWork(Windmill.CommitWorkRequest request); - - /** Get configuration data from the server. */ - public abstract Windmill.GetConfigResponse getConfig(Windmill.GetConfigRequest request); - - /** Report execution information to the server. */ - public abstract Windmill.ReportStatsResponse reportStats(Windmill.ReportStatsRequest request); - - /** - * Gets work to process, returned as a stream. - * - *
<p>
Each time a WorkItem is received, it will be passed to the given receiver. The returned - * GetWorkStream object can be used to control the lifetime of the stream. - */ - public abstract GetWorkStream getWorkStream( - Windmill.GetWorkRequest request, WorkItemReceiver receiver); - - /** Get additional data such as state needed to process work, returned as a stream. */ - public abstract GetDataStream getDataStream(); - - /** Returns a stream allowing individual WorkItemCommitRequests to be streamed to Windmill. */ - public abstract CommitWorkStream commitWorkStream(); +public abstract class WindmillServerStub + implements ApplianceWindmillClient, StreamingEngineWindmillClient, StatusDataProvider { /** Returns the amount of time the server has been throttled and resets the time to 0. */ public abstract long getAndResetThrottleTime(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java index 028a5c2e1d4b..58aecfc71e00 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java @@ -69,6 +69,7 @@ public abstract class AbstractWindmillStream implements Win protected static final int RPC_STREAM_CHUNK_SIZE = 2 << 20; private static final Logger LOG = LoggerFactory.getLogger(AbstractWindmillStream.class); protected final AtomicBoolean clientClosed; + private final AtomicBoolean isShutdown; private final AtomicLong lastSendTimeMs; private final Executor executor; private final BackOff backoff; @@ -84,19 +85,23 @@ public abstract class AbstractWindmillStream implements Win private final Supplier> requestObserverSupplier; // Indicates if the current stream in requestObserver is closed by calling close() method private final AtomicBoolean streamClosed; + private final String backendWorkerToken; private @Nullable StreamObserver requestObserver; protected AbstractWindmillStream( + String debugStreamType, Function, StreamObserver> clientFactory, BackOff backoff, StreamObserverFactory streamObserverFactory, Set> streamRegistry, - int logEveryNStreamFailures) { + int logEveryNStreamFailures, + String backendWorkerToken) { + this.backendWorkerToken = backendWorkerToken; this.executor = Executors.newSingleThreadExecutor( new ThreadFactoryBuilder() .setDaemon(true) - .setNameFormat("WindmillStream-thread") + .setNameFormat(createThreadName(debugStreamType, backendWorkerToken)) .build()); this.backoff = backoff; this.streamRegistry = streamRegistry; @@ -111,12 +116,19 @@ protected AbstractWindmillStream( this.lastErrorTime = new AtomicReference<>(); this.sleepUntil = new AtomicLong(); this.finishLatch = new CountDownLatch(1); + this.isShutdown = new AtomicBoolean(false); this.requestObserverSupplier = () -> streamObserverFactory.from( clientFactory, new AbstractWindmillStream.ResponseObserver()); } + private static String createThreadName(String streamType, String backendWorkerToken) { + return !backendWorkerToken.isEmpty() + ? 
String.format("%s-%s-WindmillStream-thread", streamType, backendWorkerToken) + : String.format("%s-WindmillStream-thread", streamType); + } + private static long debugDuration(long nowMs, long startMs) { if (startMs <= 0) { return -1; @@ -140,6 +152,11 @@ private static long debugDuration(long nowMs, long startMs) { */ protected abstract void startThrottleTimer(); + /** Reflects that {@link #shutdown()} was explicitly called. */ + protected boolean isShutdown() { + return isShutdown.get(); + } + private StreamObserver requestObserver() { if (requestObserver == null) { throw new NullPointerException( @@ -175,7 +192,7 @@ protected final void startStream() { requestObserver = requestObserverSupplier.get(); onNewStream(); if (clientClosed.get()) { - close(); + halfClose(); } return; } @@ -238,7 +255,7 @@ public final void appendSummaryHtml(PrintWriter writer) { protected abstract void appendSpecificHtml(PrintWriter writer); @Override - public final synchronized void close() { + public final synchronized void halfClose() { // Synchronization of close and onCompleted necessary for correct retry logic in onNewStream. clientClosed.set(true); requestObserver().onCompleted(); @@ -255,6 +272,30 @@ public final Instant startTime() { return new Instant(startTimeMs.get()); } + @Override + public String backendWorkerToken() { + return backendWorkerToken; + } + + @Override + public void shutdown() { + if (isShutdown.compareAndSet(false, true)) { + requestObserver() + .onError(new WindmillStreamShutdownException("Explicit call to shutdown stream.")); + } + } + + private void setLastError(String error) { + lastError.set(error); + lastErrorTime.set(DateTime.now()); + } + + public static class WindmillStreamShutdownException extends RuntimeException { + public WindmillStreamShutdownException(String message) { + super(message); + } + } + private class ResponseObserver implements StreamObserver { @Override @@ -280,7 +321,7 @@ public void onCompleted() { private void onStreamFinished(@Nullable Throwable t) { synchronized (this) { - if (clientClosed.get() && !hasPendingRequests()) { + if (isShutdown.get() || (clientClosed.get() && !hasPendingRequests())) { streamRegistry.remove(AbstractWindmillStream.this); finishLatch.countDown(); return; @@ -337,9 +378,4 @@ private void onStreamFinished(@Nullable Throwable t) { executor.execute(AbstractWindmillStream.this::startStream); } } - - private void setLastError(String error) { - lastError.set(error); - lastErrorTime.set(DateTime.now()); - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java index d044e9300790..31bd4e146a78 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.client; import java.io.Closeable; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -32,8 +33,12 @@ /** Superclass for streams returned by streaming Windmill methods. */ @ThreadSafe public interface WindmillStream { + + /** An identifier for the backend worker where the stream is sending/receiving RPCs. 
*/ + String backendWorkerToken(); + /** Indicates that no more requests will be sent. */ - void close(); + void halfClose(); /** Waits for the server to close its end of the connection, with timeout. */ boolean awaitTermination(int time, TimeUnit unit) throws InterruptedException; @@ -41,6 +46,12 @@ public interface WindmillStream { /** Returns when the stream was opened. */ Instant startTime(); + /** + * Shutdown the stream. There should be no further interactions with the stream once this has been + * called. + */ + void shutdown(); + /** Handle representing a stream of GetWork responses. */ @ThreadSafe interface GetWorkStream extends WindmillStream { @@ -62,7 +73,7 @@ Windmill.KeyedGetDataResponse requestKeyedData( Windmill.GlobalData requestGlobalData(Windmill.GlobalDataRequest request); /** Tells windmill processing is ongoing for the given keys. */ - void refreshActiveWork(Map> heartbeats); + void refreshActiveWork(Map> heartbeats); void onHeartbeatResponse(List responses); } @@ -70,6 +81,12 @@ Windmill.KeyedGetDataResponse requestKeyedData( /** Interface for streaming CommitWorkRequests to Windmill. */ @ThreadSafe interface CommitWorkStream extends WindmillStream { + /** + * Returns a builder that can be used for sending requests. Each builder is not thread-safe but + * different builders for the same stream may be used simultaneously. + */ + CommitWorkStream.RequestBatcher batcher(); + @NotThreadSafe interface RequestBatcher extends Closeable { /** @@ -92,12 +109,6 @@ default void close() { flush(); } } - - /** - * Returns a builder that can be used for sending requests. Each builder is not thread-safe but - * different builders for the same stream may be used simultaneously. - */ - RequestBatcher batcher(); } /** Interface for streaming GetWorkerMetadata requests to Windmill. 
*/ diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java index 0e4e085c066c..f14fc40fdfdf 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java @@ -128,7 +128,7 @@ public StreamT getStream() { return resultStream; } finally { if (closeThisStream != null) { - closeThisStream.close(); + closeThisStream.halfClose(); } } } @@ -166,7 +166,7 @@ public void releaseStream(StreamT stream) { } if (closeStream) { - stream.close(); + stream.halfClose(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java index ed4dcfa212f1..bf1007bc4bfb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitter.java @@ -17,9 +17,11 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.commits; +import com.google.auto.value.AutoBuilder; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; import java.util.function.Supplier; @@ -45,6 +47,7 @@ public final class StreamingEngineWorkCommitter implements WorkCommitter { private static final Logger LOG = LoggerFactory.getLogger(StreamingEngineWorkCommitter.class); private static final int TARGET_COMMIT_BATCH_KEYS = 5; private static final int MAX_COMMIT_QUEUE_BYTES = 500 << 20; // 500MB + private static final String NO_BACKEND_WORKER_TOKEN = ""; private final Supplier> commitWorkStreamFactory; private final WeightedBoundedQueue commitQueue; @@ -52,11 +55,13 @@ public final class StreamingEngineWorkCommitter implements WorkCommitter { private final AtomicLong activeCommitBytes; private final Consumer onCommitComplete; private final int numCommitSenders; + private final AtomicBoolean isRunning; - private StreamingEngineWorkCommitter( + StreamingEngineWorkCommitter( Supplier> commitWorkStreamFactory, int numCommitSenders, - Consumer onCommitComplete) { + Consumer onCommitComplete, + String backendWorkerToken) { this.commitWorkStreamFactory = commitWorkStreamFactory; this.commitQueue = WeightedBoundedQueue.create( @@ -67,34 +72,48 @@ private StreamingEngineWorkCommitter( new ThreadFactoryBuilder() .setDaemon(true) .setPriority(Thread.MAX_PRIORITY) - .setNameFormat("CommitThread-%d") + .setNameFormat( + backendWorkerToken.isEmpty() + ? 
"CommitThread-%d" + : "CommitThread-" + backendWorkerToken + "-%d") .build()); this.activeCommitBytes = new AtomicLong(); this.onCommitComplete = onCommitComplete; this.numCommitSenders = numCommitSenders; + this.isRunning = new AtomicBoolean(false); } - public static StreamingEngineWorkCommitter create( - Supplier> commitWorkStreamFactory, - int numCommitSenders, - Consumer onCommitComplete) { - return new StreamingEngineWorkCommitter( - commitWorkStreamFactory, numCommitSenders, onCommitComplete); + public static Builder builder() { + return new AutoBuilder_StreamingEngineWorkCommitter_Builder() + .setBackendWorkerToken(NO_BACKEND_WORKER_TOKEN) + .setNumCommitSenders(1); } @Override @SuppressWarnings("FutureReturnValueIgnored") public void start() { - if (!commitSenders.isShutdown()) { - for (int i = 0; i < numCommitSenders; i++) { - commitSenders.submit(this::streamingCommitLoop); - } + Preconditions.checkState( + isRunning.compareAndSet(false, true), "Multiple calls to WorkCommitter.start()."); + for (int i = 0; i < numCommitSenders; i++) { + commitSenders.submit(this::streamingCommitLoop); } } @Override public void commit(Commit commit) { - commitQueue.put(commit); + boolean isShutdown = !this.isRunning.get(); + if (commit.work().isFailed() || isShutdown) { + if (isShutdown) { + LOG.debug( + "Trying to queue commit on shutdown, failing commit=[computationId={}, shardingKey={}, workId={} ].", + commit.computationId(), + commit.work().getShardedKey(), + commit.work().id()); + } + failCommit(commit); + } else { + commitQueue.put(commit); + } } @Override @@ -104,15 +123,14 @@ public long currentActiveCommitBytes() { @Override public void stop() { - if (!commitSenders.isTerminated()) { - commitSenders.shutdownNow(); - try { - commitSenders.awaitTermination(10, TimeUnit.SECONDS); - } catch (InterruptedException e) { - LOG.warn( - "Commit senders didn't complete shutdown within 10 seconds, continuing to drain queue", - e); - } + Preconditions.checkState(isRunning.compareAndSet(true, false)); + commitSenders.shutdownNow(); + try { + commitSenders.awaitTermination(10, TimeUnit.SECONDS); + } catch (InterruptedException e) { + LOG.warn( + "Commit senders didn't complete shutdown within 10 seconds, continuing to drain queue.", + e); } drainCommitQueue(); } @@ -138,12 +156,13 @@ public int parallelism() { private void streamingCommitLoop() { @Nullable Commit initialCommit = null; try { - while (true) { + while (isRunning.get()) { if (initialCommit == null) { try { // Block until we have a commit or are shutting down. initialCommit = commitQueue.take(); } catch (InterruptedException e) { + Thread.currentThread().interrupt(); return; } } @@ -156,17 +175,14 @@ private void streamingCommitLoop() { } try (CloseableStream closeableCommitStream = - commitWorkStreamFactory.get()) { - CommitWorkStream commitStream = closeableCommitStream.stream(); - try (CommitWorkStream.RequestBatcher batcher = commitStream.batcher()) { - if (!tryAddToCommitBatch(initialCommit, batcher)) { - throw new AssertionError( - "Initial commit on flushed stream should always be accepted."); - } - // Batch additional commits to the stream and possibly make an un-batched commit the - // next initial commit. 
- initialCommit = expandBatch(batcher); + commitWorkStreamFactory.get(); + CommitWorkStream.RequestBatcher batcher = closeableCommitStream.stream().batcher()) { + if (!tryAddToCommitBatch(initialCommit, batcher)) { + throw new AssertionError("Initial commit on flushed stream should always be accepted."); } + // Batch additional commits to the stream and possibly make an un-batched commit the + // next initial commit. + initialCommit = expandBatch(batcher); } catch (Exception e) { LOG.error("Error occurred sending commits.", e); } @@ -187,7 +203,7 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch batcher.commitWorkItem( commit.computationId(), commit.request(), - (commitStatus) -> { + commitStatus -> { onCommitComplete.accept(CompleteCommit.create(commit, commitStatus)); activeCommitBytes.addAndGet(-commit.getSize()); }); @@ -201,9 +217,11 @@ private boolean tryAddToCommitBatch(Commit commit, CommitWorkStream.RequestBatch return isCommitAccepted; } - // Helper to batch additional commits into the commit batch as long as they fit. - // Returns a commit that was removed from the queue but not consumed or null. - private Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { + /** + * Helper to batch additional commits into the commit batch as long as they fit. Returns a commit + * that was removed from the queue but not consumed or null. + */ + private @Nullable Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { int commits = 1; while (true) { Commit commit; @@ -214,6 +232,7 @@ private Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { commit = commitQueue.poll(); } } catch (InterruptedException e) { + Thread.currentThread().interrupt(); return null; } @@ -233,4 +252,22 @@ private Commit expandBatch(CommitWorkStream.RequestBatcher batcher) { commits++; } } + + @AutoBuilder + public interface Builder { + Builder setCommitWorkStreamFactory( + Supplier> commitWorkStreamFactory); + + Builder setNumCommitSenders(int numCommitSenders); + + Builder setOnCommitComplete(Consumer onCommitComplete); + + Builder setBackendWorkerToken(String backendWorkerToken); + + StreamingEngineWorkCommitter autoBuild(); + + default WorkCommitter build() { + return autoBuild(); + } + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java new file mode 100644 index 000000000000..e0500dde0c53 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ApplianceGetDataClient.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
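// Illustrative usage sketch for the StreamingEngineWorkCommitter changes above (not part of the
// patch): the removed create(...) factory is replaced by the AutoBuilder-backed builder(), which
// defaults numCommitSenders to 1 and the backend worker token to "". The names
// commitWorkStreamPool, numCommitThreads, onCompleteCommit and completedWorkCommit are assumed
// caller-side variables.
WorkCommitter committer =
    StreamingEngineWorkCommitter.builder()
        // WindmillStreamPool#getCloseableStream supplies a CloseableStream of CommitWorkStream.
        .setCommitWorkStreamFactory(commitWorkStreamPool::getCloseableStream)
        .setNumCommitSenders(numCommitThreads)
        .setOnCommitComplete(onCompleteCommit)
        .build();
committer.start(); // start() may now only be called once; a second call fails a Preconditions check.
committer.commit(completedWorkCommit); // queued, or failed immediately if the committer is stopped.
committer.stop(); // shuts down the sender threads and drains the commit queue.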
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import javax.annotation.concurrent.GuardedBy; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.WindmillComputationKey; +import org.apache.beam.runners.dataflow.worker.windmill.ApplianceWindmillClient; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** Appliance implementation of {@link GetDataClient}. */ +@Internal +@ThreadSafe +public final class ApplianceGetDataClient implements GetDataClient { + private static final int MAX_READS_PER_BATCH = 60; + private static final int MAX_ACTIVE_READS = 10; + + private final ApplianceWindmillClient windmillClient; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + + @GuardedBy("this") + private final List pendingReadBatches; + + @GuardedBy("this") + private int activeReadThreads; + + public ApplianceGetDataClient( + ApplianceWindmillClient windmillClient, ThrottlingGetDataMetricTracker getDataMetricTracker) { + this.windmillClient = windmillClient; + this.getDataMetricTracker = getDataMetricTracker; + this.pendingReadBatches = new ArrayList<>(); + this.activeReadThreads = 0; + } + + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computationId, Windmill.KeyedGetDataRequest request) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { + SettableFuture response = SettableFuture.create(); + ReadBatch batch = addToReadBatch(new QueueEntry(computationId, request, response)); + if (batch != null) { + issueReadBatch(batch); + } + return response.get(); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching state for computation=" + + computationId + + ", key=" + + request.getShardingKey(), + e); + } + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) { + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { + return windmillClient + .getData(Windmill.GetDataRequest.newBuilder().addGlobalDataFetchRequests(request).build()) + .getGlobalData(0); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId(), e); + } + } + + @Override + public synchronized void printHtml(PrintWriter writer) { + getDataMetricTracker.printHtml(writer); + writer.println(" Read threads: " + activeReadThreads); + writer.println(" Pending read batches: " + pendingReadBatches.size()); + } + + private void issueReadBatch(ReadBatch batch) { + try { + // Possibly block until the batch is allowed to start. + batch.startRead.get(); + } catch (InterruptedException e) { + // We don't expect this thread to be interrupted. To simplify handling, we just fall through + // to issuing the call. 
+ assert (false); + Thread.currentThread().interrupt(); + } catch (ExecutionException e) { + // startRead is a SettableFuture so this should never occur. + throw new AssertionError("Should not have exception on startRead", e); + } + Map> pendingResponses = + new HashMap<>(batch.reads.size()); + Map computationBuilders = new HashMap<>(); + for (QueueEntry entry : batch.reads) { + ComputationGetDataRequest.Builder computationBuilder = + computationBuilders.computeIfAbsent( + entry.computation, k -> ComputationGetDataRequest.newBuilder().setComputationId(k)); + + computationBuilder.addRequests(entry.request); + pendingResponses.put( + WindmillComputationKey.create( + entry.computation, entry.request.getKey(), entry.request.getShardingKey()), + entry.response); + } + + // Build the full GetDataRequest from the KeyedGetDataRequests pulled from the queue. + Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); + for (ComputationGetDataRequest.Builder computationBuilder : computationBuilders.values()) { + builder.addRequests(computationBuilder); + } + + try { + Windmill.GetDataResponse response = windmillClient.getData(builder.build()); + // Dispatch the per-key responses back to the waiting threads. + for (Windmill.ComputationGetDataResponse computationResponse : response.getDataList()) { + for (Windmill.KeyedGetDataResponse keyResponse : computationResponse.getDataList()) { + pendingResponses + .get( + WindmillComputationKey.create( + computationResponse.getComputationId(), + keyResponse.getKey(), + keyResponse.getShardingKey())) + .set(keyResponse); + } + } + } catch (RuntimeException e) { + // Fan the exception out to the reads. + for (QueueEntry entry : batch.reads) { + entry.response.setException(e); + } + } finally { + synchronized (this) { + Preconditions.checkState(activeReadThreads >= 1); + if (pendingReadBatches.isEmpty()) { + activeReadThreads--; + } else { + // Notify the thread responsible for issuing the next batch read. + ReadBatch startBatch = pendingReadBatches.remove(0); + startBatch.startRead.set(null); + } + } + } + } + + /** + * Adds the entry to a read batch for sending to the windmill server. If a non-null batch is + * returned, this thread will be responsible for sending the batch and should wait for the batch + * startRead to be notified. If null is returned, the entry was added to a read batch that will be + * issued by another thread. + */ + private @Nullable ReadBatch addToReadBatch(QueueEntry entry) { + synchronized (this) { + ReadBatch batch; + if (activeReadThreads < MAX_ACTIVE_READS) { + assert (pendingReadBatches.isEmpty()); + activeReadThreads += 1; + // fall through to below synchronized block + } else if (pendingReadBatches.isEmpty() + || pendingReadBatches.get(pendingReadBatches.size() - 1).reads.size() + >= MAX_READS_PER_BATCH) { + // This is the first read of a batch, it will be responsible for sending the batch. + batch = new ReadBatch(); + pendingReadBatches.add(batch); + batch.reads.add(entry); + return batch; + } else { + // This fits within an existing batch, it will be sent by the first blocking thread in the + // batch. 
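// Illustrative usage sketch (not part of the patch): callers only see the blocking GetDataClient
// surface; the MAX_READS_PER_BATCH / MAX_ACTIVE_READS batching above happens inside
// ApplianceGetDataClient. windmillServer (an ApplianceWindmillClient), memoryMonitor,
// computationId and keyedGetDataRequest are assumed caller-side names.
GetDataClient applianceGetDataClient =
    new ApplianceGetDataClient(windmillServer, new ThrottlingGetDataMetricTracker(memoryMonitor));
// Blocks until the batched GetData RPC carrying this request has returned, or throws
// GetDataException if the fetch failed.
Windmill.KeyedGetDataResponse state =
    applianceGetDataClient.getStateData(computationId, keyedGetDataRequest);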
+ pendingReadBatches.get(pendingReadBatches.size() - 1).reads.add(entry); + return null; + } + } + ReadBatch batch = new ReadBatch(); + batch.reads.add(entry); + batch.startRead.set(null); + return batch; + } + + private static final class ReadBatch { + ArrayList reads = new ArrayList<>(); + SettableFuture startRead = SettableFuture.create(); + } + + private static final class QueueEntry { + final String computation; + final Windmill.KeyedGetDataRequest request; + final SettableFuture response; + + QueueEntry( + String computation, + Windmill.KeyedGetDataRequest request, + SettableFuture response) { + this.computation = computation; + this.request = request; + this.response = response; + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java new file mode 100644 index 000000000000..c732591bf12d --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/GetDataClient.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.io.PrintWriter; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataResponse; +import org.apache.beam.sdk.annotations.Internal; + +/** Client for streaming backend GetData API. */ +@Internal +public interface GetDataClient { + /** + * Issues a blocking call to fetch state data for a specific computation and {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem}. + * + * @throws GetDataException when there was an unexpected error during the attempted fetch. + */ + KeyedGetDataResponse getStateData(String computationId, KeyedGetDataRequest request) + throws GetDataException; + + /** + * Issues a blocking call to fetch side input data. + * + * @throws GetDataException when there was an unexpected error during the attempted fetch. 
+ */ + GlobalData getSideInputData(GlobalDataRequest request) throws GetDataException; + + void printHtml(PrintWriter writer); + + final class GetDataException extends RuntimeException { + GetDataException(String message, Throwable cause) { + super(message, cause); + } + + GetDataException(String message) { + super(message); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java new file mode 100644 index 000000000000..c8e058e7e230 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamGetDataClient.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.io.PrintWriter; +import java.util.function.Function; +import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.sdk.annotations.Internal; + +/** {@link GetDataClient} that fetches data directly from a specific {@link GetDataStream}. */ +@Internal +public final class StreamGetDataClient implements GetDataClient { + + private final GetDataStream getDataStream; + private final Function sideInputGetDataStreamFactory; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + + private StreamGetDataClient( + GetDataStream getDataStream, + Function sideInputGetDataStreamFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + this.getDataStream = getDataStream; + this.sideInputGetDataStreamFactory = sideInputGetDataStreamFactory; + this.getDataMetricTracker = getDataMetricTracker; + } + + public static GetDataClient create( + GetDataStream getDataStream, + Function sideInputGetDataStreamFactory, + ThrottlingGetDataMetricTracker getDataMetricTracker) { + return new StreamGetDataClient( + getDataStream, sideInputGetDataStreamFactory, getDataMetricTracker); + } + + /** + * @throws WorkItemCancelledException when the fetch fails due to the stream being shutdown, + * indicating that the {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem} that triggered the + * fetch has been cancelled. 
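// Illustrative wiring sketch (not part of the patch): on the direct path each work item gets a
// GetDataClient bound to the GetDataStream it arrived on, while side-input fetches are routed to
// a global-data stream looked up by tag. workItemGetDataStream, globalDataStreamFor and
// getDataMetricTracker are assumed caller-side names.
GetDataClient directPathGetDataClient =
    StreamGetDataClient.create(
        workItemGetDataStream,
        sideInputTag -> globalDataStreamFor(sideInputTag),
        getDataMetricTracker);
// Per the Javadoc above, a stream shut down mid-fetch surfaces as WorkItemCancelledException
// rather than GetDataException, signalling that the triggering WorkItem was cancelled.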
+ */ + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computationId, Windmill.KeyedGetDataRequest request) throws GetDataException { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { + return getDataStream.requestKeyedData(computationId, request); + } catch (AbstractWindmillStream.WindmillStreamShutdownException e) { + throw new WorkItemCancelledException(request.getShardingKey()); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching state for computation=" + + computationId + + ", key=" + + request.getShardingKey(), + e); + } + } + + /** + * @throws WorkItemCancelledException when the fetch fails due to the stream being shutdown, + * indicating that the {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem} that triggered the + * fetch has been cancelled. + */ + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) + throws GetDataException { + GetDataStream sideInputGetDataStream = + sideInputGetDataStreamFactory.apply(request.getDataId().getTag()); + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { + return sideInputGetDataStream.requestGlobalData(request); + } catch (AbstractWindmillStream.WindmillStreamShutdownException e) { + throw new WorkItemCancelledException(e); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId(), e); + } + } + + @Override + public void printHtml(PrintWriter writer) { + getDataMetricTracker.printHtml(writer); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java new file mode 100644 index 000000000000..49fe3e4bdc15 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/StreamPoolGetDataClient.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.io.PrintWriter; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.sdk.annotations.Internal; + +/** + * StreamingEngine implementation of {@link GetDataClient}. + * + * @implNote Uses {@link WindmillStreamPool} to send requests. + */ +@Internal +@ThreadSafe +public final class StreamPoolGetDataClient implements GetDataClient { + + private final WindmillStreamPool getDataStreamPool; + private final ThrottlingGetDataMetricTracker getDataMetricTracker; + + public StreamPoolGetDataClient( + ThrottlingGetDataMetricTracker getDataMetricTracker, + WindmillStreamPool getDataStreamPool) { + this.getDataMetricTracker = getDataMetricTracker; + this.getDataStreamPool = getDataStreamPool; + } + + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computationId, KeyedGetDataRequest request) { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling(); + CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { + return closeableStream.stream().requestKeyedData(computationId, request); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching state for computation=" + + computationId + + ", key=" + + request.getShardingKey(), + e); + } + } + + @Override + public Windmill.GlobalData getSideInputData(GlobalDataRequest request) { + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling(); + CloseableStream closeableStream = getDataStreamPool.getCloseableStream()) { + return closeableStream.stream().requestGlobalData(request); + } catch (Exception e) { + throw new GetDataException( + "Error occurred fetching side input for tag=" + request.getDataId(), e); + } + } + + @Override + public void printHtml(PrintWriter writer) { + getDataMetricTracker.printHtml(writer); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java new file mode 100644 index 000000000000..6bb00292e29a --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTracker.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import com.google.auto.value.AutoValue; +import java.io.PrintWriter; +import java.util.concurrent.atomic.AtomicInteger; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; + +/** + * Wraps GetData calls to track metrics for the number of in-flight requests and throttles requests + * when memory pressure is high. + */ +@Internal +@ThreadSafe +public final class ThrottlingGetDataMetricTracker { + private static final String GET_STATE_DATA_RESOURCE_CONTEXT = "GetStateData"; + private static final String GET_SIDE_INPUT_RESOURCE_CONTEXT = "GetSideInputData"; + + private final MemoryMonitor gcThrashingMonitor; + private final AtomicInteger activeStateReads; + private final AtomicInteger activeSideInputs; + private final AtomicInteger activeHeartbeats; + + public ThrottlingGetDataMetricTracker(MemoryMonitor gcThrashingMonitor) { + this.gcThrashingMonitor = gcThrashingMonitor; + this.activeStateReads = new AtomicInteger(); + this.activeSideInputs = new AtomicInteger(); + this.activeHeartbeats = new AtomicInteger(); + } + + /** + * Tracks a state data fetch. If there is memory pressure, may throttle requests. Returns an + * {@link AutoCloseable} that will decrement the metric after the call is finished. + */ + AutoCloseable trackStateDataFetchWithThrottling() { + gcThrashingMonitor.waitForResources(GET_STATE_DATA_RESOURCE_CONTEXT); + activeStateReads.getAndIncrement(); + return activeStateReads::getAndDecrement; + } + + /** + * Tracks a side input fetch. If there is memory pressure, may throttle requests. Returns an + * {@link AutoCloseable} that will decrement the metric after the call is finished. + */ + AutoCloseable trackSideInputFetchWithThrottling() { + gcThrashingMonitor.waitForResources(GET_SIDE_INPUT_RESOURCE_CONTEXT); + activeSideInputs.getAndIncrement(); + return activeSideInputs::getAndDecrement; + } + + /** + * Tracks heartbeat request metrics. Returns an {@link AutoCloseable} that will decrement the + * metric after the call is finished. 
+ */ + public AutoCloseable trackHeartbeats(int numHeartbeats) { + activeHeartbeats.getAndAdd(numHeartbeats); + return () -> activeHeartbeats.getAndAdd(-numHeartbeats); + } + + public void printHtml(PrintWriter writer) { + writer.println("Active Fetches:"); + writer.println(" Side Inputs: " + activeSideInputs.get()); + writer.println(" State Reads: " + activeStateReads.get()); + writer.println("Heartbeat Keys Active: " + activeHeartbeats.get()); + } + + @VisibleForTesting + ReadOnlySnapshot getMetricsSnapshot() { + return ReadOnlySnapshot.create( + activeSideInputs.get(), activeStateReads.get(), activeHeartbeats.get()); + } + + @VisibleForTesting + @AutoValue + abstract static class ReadOnlySnapshot { + + private static ReadOnlySnapshot create( + int activeSideInputs, int activeStateReads, int activeHeartbeats) { + return new AutoValue_ThrottlingGetDataMetricTracker_ReadOnlySnapshot( + activeSideInputs, activeStateReads, activeHeartbeats); + } + + abstract int activeSideInputs(); + + abstract int activeStateReads(); + + abstract int activeHeartbeats(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java index e0f823d79ade..adfb380d2164 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java @@ -23,6 +23,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.function.Supplier; import java.util.stream.Collectors; import javax.annotation.Nullable; import javax.servlet.ServletException; @@ -31,7 +32,6 @@ import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; import org.apache.beam.runners.dataflow.worker.status.BaseStatusServlet; import org.apache.beam.runners.dataflow.worker.status.DebugCapture; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.channelz.v1.*; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.protobuf.services.ChannelzService; @@ -47,16 +47,16 @@ public class ChannelzServlet extends BaseStatusServlet implements DebugCapture.C private static final int MAX_TOP_CHANNELS_TO_RETURN = 500; private final ChannelzService channelzService; - private final WindmillServerStub windmillServerStub; + private final Supplier> currentWindmillEndpoints; private final boolean showOnlyWindmillServiceChannels; public ChannelzServlet( String path, DataflowStreamingPipelineOptions options, - WindmillServerStub windmillServerStub) { + Supplier> currentWindmillEndpoints) { super(path); channelzService = ChannelzService.newInstance(MAX_TOP_CHANNELS_TO_RETURN); - this.windmillServerStub = windmillServerStub; + this.currentWindmillEndpoints = currentWindmillEndpoints; showOnlyWindmillServiceChannels = options.getChannelzShowOnlyWindmillServiceChannels(); } @@ -81,14 +81,6 @@ public void captureData(PrintWriter writer) { writer.println(""); } - // channelz proto says there won't be cycles in the ref graph. - // we track visited ids to be defensive and prevent any accidental cycles. 
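// Illustrative usage sketch for ThrottlingGetDataMetricTracker above (not part of the patch):
// heartbeat senders bracket a batch of heartbeats with trackHeartbeats(...) so the
// "Heartbeat Keys Active" gauge printed by printHtml stays accurate; try-with-resources
// guarantees the decrement even if the send throws. getDataMetricTracker, getDataStream and
// heartbeats (the map passed to GetDataStream#refreshActiveWork) are assumed caller-side names.
int heartbeatCount = heartbeats.values().stream().mapToInt(java.util.Collection::size).sum();
try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(heartbeatCount)) {
  getDataStream.refreshActiveWork(heartbeats);
} catch (Exception e) {
  // AutoCloseable#close is declared to throw Exception, so it must be handled here.
  throw new RuntimeException(e);
}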
- private static class VisitedSets { - - Set channels = new HashSet<>(); - Set subchannels = new HashSet<>(); - } - private void appendTopChannels(PrintWriter writer) { SettableFuture future = SettableFuture.create(); // IDEA: If there are more than MAX_TOP_CHANNELS_TO_RETURN top channels @@ -127,8 +119,7 @@ private void appendTopChannels(PrintWriter writer) { } private List filterWindmillChannels(List channels) { - ImmutableSet windmillServiceEndpoints = - windmillServerStub.getWindmillServiceEndpoints(); + ImmutableSet windmillServiceEndpoints = currentWindmillEndpoints.get(); Set windmillServiceHosts = windmillServiceEndpoints.stream().map(HostAndPort::getHost).collect(Collectors.toSet()); List windmillChannels = new ArrayList<>(); @@ -291,4 +282,12 @@ public void onCompleted() { } }; } + + // channelz proto says there won't be cycles in the ref graph. + // we track visited ids to be defensive and prevent any accidental cycles. + private static class VisitedSets { + + Set channels = new HashSet<>(); + Set subchannels = new HashSet<>(); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java new file mode 100644 index 000000000000..9f30f75919f9 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; + +import com.google.auto.value.AutoValue; +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link ByteString} buffer of {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk}(s). + * + *
<p>
Once all serialized chunks of an {@link WorkItem} have been received flushes (deserializes) + * the chunk of bytes and metadata into an {@link AssembledWorkItem}. + * + * @implNote This class is not thread safe, and provides no synchronization underneath. + */ +@NotThreadSafe +final class GetWorkResponseChunkAssembler { + private static final Logger LOG = LoggerFactory.getLogger(GetWorkResponseChunkAssembler.class); + + private final GetWorkTimingInfosTracker workTimingInfosTracker; + private @Nullable ComputationMetadata metadata; + private ByteString data; + private long bufferedSize; + + GetWorkResponseChunkAssembler() { + workTimingInfosTracker = new GetWorkTimingInfosTracker(System::currentTimeMillis); + data = ByteString.EMPTY; + bufferedSize = 0; + metadata = null; + } + + /** + * Appends the response chunk bytes to the {@link #data }byte buffer. Return the assembled + * WorkItem if all response chunks for a WorkItem have been received. + */ + Optional append(Windmill.StreamingGetWorkResponseChunk chunk) { + if (chunk.hasComputationMetadata()) { + metadata = ComputationMetadata.fromProto(chunk.getComputationMetadata()); + } + + data = data.concat(chunk.getSerializedWorkItem()); + bufferedSize += chunk.getSerializedWorkItem().size(); + workTimingInfosTracker.addTimingInfo(chunk.getPerWorkItemTimingInfosList()); + + // If the entire WorkItem has been received, assemble the WorkItem. + return chunk.getRemainingBytesForWorkItem() == 0 ? flushToWorkItem() : Optional.empty(); + } + + /** + * Attempt to flush the {@link #data} bytes into a {@link WorkItem} w/ it's metadata. Resets the + * data byte string and tracking metadata afterwards, whether the {@link WorkItem} deserialization + * was successful or not. + */ + private Optional flushToWorkItem() { + try { + return Optional.of( + AssembledWorkItem.create( + WorkItem.parseFrom(data.newInput()), + Preconditions.checkNotNull(metadata), + workTimingInfosTracker.getLatencyAttributions(), + bufferedSize)); + } catch (IOException e) { + LOG.error("Failed to parse work item from stream: ", e); + } finally { + workTimingInfosTracker.reset(); + data = ByteString.EMPTY; + bufferedSize = 0; + } + + return Optional.empty(); + } + + @AutoValue + abstract static class ComputationMetadata { + private static ComputationMetadata fromProto( + Windmill.ComputationWorkItemMetadata metadataProto) { + return new AutoValue_GetWorkResponseChunkAssembler_ComputationMetadata( + metadataProto.getComputationId(), + WindmillTimeUtils.windmillToHarnessWatermark(metadataProto.getInputDataWatermark()), + WindmillTimeUtils.windmillToHarnessWatermark( + metadataProto.getDependentRealtimeInputWatermark())); + } + + abstract String computationId(); + + abstract Instant inputDataWatermark(); + + abstract Instant synchronizedProcessingTime(); + } + + @AutoValue + abstract static class AssembledWorkItem { + + private static AssembledWorkItem create( + WorkItem workItem, + ComputationMetadata computationMetadata, + List latencyAttributions, + long size) { + return new AutoValue_GetWorkResponseChunkAssembler_AssembledWorkItem( + workItem, computationMetadata, latencyAttributions, size); + } + + abstract WorkItem workItem(); + + abstract ComputationMetadata computationMetadata(); + + abstract List latencyAttributions(); + + abstract long bufferedSize(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java index f9f579119d61..053843a8af25 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java @@ -57,6 +57,7 @@ public final class GrpcCommitWorkStream private final int streamingRpcBatchLimit; private GrpcCommitWorkStream( + String backendWorkerToken, Function, StreamObserver> startCommitWorkRpcFn, BackOff backoff, @@ -68,11 +69,13 @@ private GrpcCommitWorkStream( AtomicLong idGenerator, int streamingRpcBatchLimit) { super( + "CommitWorkStream", startCommitWorkRpcFn, backoff, streamObserverFactory, streamRegistry, - logEveryNStreamFailures); + logEveryNStreamFailures, + backendWorkerToken); pending = new ConcurrentHashMap<>(); this.idGenerator = idGenerator; this.jobHeader = jobHeader; @@ -81,6 +84,7 @@ private GrpcCommitWorkStream( } public static GrpcCommitWorkStream create( + String backendWorkerToken, Function, StreamObserver> startCommitWorkRpcFn, BackOff backoff, @@ -93,6 +97,7 @@ public static GrpcCommitWorkStream create( int streamingRpcBatchLimit) { GrpcCommitWorkStream commitWorkStream = new GrpcCommitWorkStream( + backendWorkerToken, startCommitWorkRpcFn, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java index 6f4b5b7b33fb..45d010d7cfac 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java @@ -17,8 +17,6 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; -import com.google.auto.value.AutoValue; -import java.io.IOException; import java.io.PrintWriter; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -26,12 +24,9 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Supplier; -import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationWorkItemMetadata; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk; @@ -39,19 +34,18 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import 
org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GetWorkResponseChunkAssembler.AssembledWorkItem; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; -import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Implementation of {@link GetWorkStream} that passes along a specific {@link @@ -64,7 +58,6 @@ public final class GrpcDirectGetWorkStream extends AbstractWindmillStream implements GetWorkStream { - private static final Logger LOG = LoggerFactory.getLogger(GrpcDirectGetWorkStream.class); private static final StreamingGetWorkRequest HEALTH_CHECK_REQUEST = StreamingGetWorkRequest.newBuilder() .setRequestExtension( @@ -80,17 +73,21 @@ public final class GrpcDirectGetWorkStream private final GetWorkRequest request; private final WorkItemScheduler workItemScheduler; private final ThrottleTimer getWorkThrottleTimer; - private final Supplier getDataStream; + private final Supplier heartbeatSender; private final Supplier workCommitter; + private final Supplier getDataClient; /** * Map of stream IDs to their buffers. Used to aggregate streaming gRPC response chunks as they * come in. Once all chunks for a response has been received, the chunk is processed and the * buffer is cleared. + * + * @implNote Buffers are not persisted across stream restarts. */ - private final ConcurrentMap workItemBuffers; + private final ConcurrentMap workItemAssemblers; private GrpcDirectGetWorkStream( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -101,25 +98,32 @@ private GrpcDirectGetWorkStream( Set> streamRegistry, int logEveryNStreamFailures, ThrottleTimer getWorkThrottleTimer, - Supplier getDataStream, + Supplier heartbeatSender, + Supplier getDataClient, Supplier workCommitter, WorkItemScheduler workItemScheduler) { super( - startGetWorkRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); + "GetWorkStream", + startGetWorkRpcFn, + backoff, + streamObserverFactory, + streamRegistry, + logEveryNStreamFailures, + backendWorkerToken); this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.workItemScheduler = workItemScheduler; - this.workItemBuffers = new ConcurrentHashMap<>(); - // Use the same GetDataStream and CommitWorkStream instances to process all the work in this - // stream. 
- this.getDataStream = Suppliers.memoize(getDataStream::get); + this.workItemAssemblers = new ConcurrentHashMap<>(); + this.heartbeatSender = Suppliers.memoize(heartbeatSender::get); this.workCommitter = Suppliers.memoize(workCommitter::get); + this.getDataClient = Suppliers.memoize(getDataClient::get); this.inFlightBudget = new AtomicReference<>(GetWorkBudget.noBudget()); this.nextBudgetAdjustment = new AtomicReference<>(GetWorkBudget.noBudget()); this.pendingResponseBudget = new AtomicReference<>(GetWorkBudget.noBudget()); } public static GrpcDirectGetWorkStream create( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -130,11 +134,13 @@ public static GrpcDirectGetWorkStream create( Set> streamRegistry, int logEveryNStreamFailures, ThrottleTimer getWorkThrottleTimer, - Supplier getDataStream, + Supplier heartbeatSender, + Supplier getDataClient, Supplier workCommitter, WorkItemScheduler workItemScheduler) { GrpcDirectGetWorkStream getWorkStream = new GrpcDirectGetWorkStream( + backendWorkerToken, startGetWorkRpcFn, request, backoff, @@ -142,14 +148,16 @@ public static GrpcDirectGetWorkStream create( streamRegistry, logEveryNStreamFailures, getWorkThrottleTimer, - getDataStream, + heartbeatSender, + getDataClient, workCommitter, workItemScheduler); getWorkStream.startStream(); return getWorkStream; } - private static Watermarks createWatermarks(WorkItem workItem, ComputationMetadata metadata) { + private static Watermarks createWatermarks( + WorkItem workItem, GetWorkResponseChunkAssembler.ComputationMetadata metadata) { return Watermarks.builder() .setInputDataWatermark(metadata.inputDataWatermark()) .setOutputDataWatermark(workItem.getOutputDataWatermark()) @@ -157,14 +165,8 @@ private static Watermarks createWatermarks(WorkItem workItem, ComputationMetadat .build(); } - private synchronized GetWorkBudget getThenResetBudgetAdjustment() { - return nextBudgetAdjustment.getAndUpdate(unused -> GetWorkBudget.noBudget()); - } - - private void sendRequestExtension() { - // Just sent the request extension, reset the nextBudgetAdjustment. This will be set when - // adjustBudget is called. - GetWorkBudget adjustment = getThenResetBudgetAdjustment(); + private void sendRequestExtension(GetWorkBudget adjustment) { + inFlightBudget.getAndUpdate(budget -> budget.apply(adjustment)); StreamingGetWorkRequest extension = StreamingGetWorkRequest.newBuilder() .setRequestExtension( @@ -186,7 +188,7 @@ private void sendRequestExtension() { @Override protected synchronized void onNewStream() { - workItemBuffers.clear(); + workItemAssemblers.clear(); // Add the current in-flight budget to the next adjustment. Only positive values are allowed // here // with negatives defaulting to 0, since GetWorkBudgets cannot be created with negative values. @@ -215,7 +217,7 @@ public void appendSpecificHtml(PrintWriter writer) { // Number of buffers is same as distinct workers that sent work on this stream. 
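// Illustrative wiring sketch (not part of the patch): the direct GetWork stream is now created
// with per-connection suppliers for the heartbeat sender, GetDataClient and committer instead of
// a raw GetDataStream; each supplier is memoized so one instance serves every work item on the
// stream. All lower-case names below are assumed caller-side variables.
GetWorkStream directGetWorkStream =
    GrpcDirectGetWorkStream.create(
        backendWorkerToken,
        startGetWorkRpcFn,
        getWorkRequest,
        grpcBackoff,
        streamObserverFactory,
        streamRegistry,
        logEveryNStreamFailures,
        getWorkThrottleTimer,
        () -> heartbeatSender,
        () -> getDataClient,
        () -> workCommitter,
        workItemScheduler);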
writer.format( "GetWorkStream: %d buffers, %s inflight budget allowed.", - workItemBuffers.size(), inFlightBudget.get()); + workItemAssemblers.size(), inFlightBudget.get()); } @Override @@ -226,27 +228,49 @@ public void sendHealthCheck() { @Override protected void onResponse(StreamingGetWorkResponseChunk chunk) { getWorkThrottleTimer.stop(); - WorkItemBuffer workItemBuffer = - workItemBuffers.computeIfAbsent(chunk.getStreamId(), unused -> new WorkItemBuffer()); - workItemBuffer.append(chunk); + workItemAssemblers + .computeIfAbsent(chunk.getStreamId(), unused -> new GetWorkResponseChunkAssembler()) + .append(chunk) + .ifPresent(this::consumeAssembledWorkItem); + } - // The entire WorkItem has been received, it is ready to be processed. - if (chunk.getRemainingBytesForWorkItem() == 0) { - workItemBuffer.runAndReset(); - // Record the fact that there are now fewer outstanding messages and bytes on the stream. - inFlightBudget.updateAndGet(budget -> budget.subtract(1, workItemBuffer.bufferedSize())); + private void consumeAssembledWorkItem(AssembledWorkItem assembledWorkItem) { + // Record the fact that there are now fewer outstanding messages and bytes on the stream. + inFlightBudget.updateAndGet(budget -> budget.subtract(1, assembledWorkItem.bufferedSize())); + WorkItem workItem = assembledWorkItem.workItem(); + GetWorkResponseChunkAssembler.ComputationMetadata metadata = + assembledWorkItem.computationMetadata(); + pendingResponseBudget.getAndUpdate(budget -> budget.apply(1, workItem.getSerializedSize())); + try { + workItemScheduler.scheduleWork( + workItem, + createWatermarks(workItem, Preconditions.checkNotNull(metadata)), + createProcessingContext(Preconditions.checkNotNull(metadata.computationId())), + assembledWorkItem.latencyAttributions()); + } finally { + pendingResponseBudget.getAndUpdate(budget -> budget.apply(-1, -workItem.getSerializedSize())); } } + private Work.ProcessingContext createProcessingContext(String computationId) { + return Work.createProcessingContext( + computationId, getDataClient.get(), workCommitter.get()::commit, heartbeatSender.get()); + } + @Override protected void startThrottleTimer() { getWorkThrottleTimer.start(); } @Override - public synchronized void adjustBudget(long itemsDelta, long bytesDelta) { - nextBudgetAdjustment.set(nextBudgetAdjustment.get().apply(itemsDelta, bytesDelta)); - sendRequestExtension(); + public void adjustBudget(long itemsDelta, long bytesDelta) { + GetWorkBudget adjustment = + nextBudgetAdjustment + // Get the current value, and reset the nextBudgetAdjustment. This will be set again + // when adjustBudget is called. 
+ .getAndUpdate(unused -> GetWorkBudget.noBudget()) + .apply(itemsDelta, bytesDelta); + sendRequestExtension(adjustment); } @Override @@ -260,74 +284,4 @@ public GetWorkBudget remainingBudget() { .apply(currentNextBudgetAdjustment) .apply(currentInflightBudget); } - - private synchronized void updatePendingResponseBudget(long itemsDelta, long bytesDelta) { - pendingResponseBudget.set(pendingResponseBudget.get().apply(itemsDelta, bytesDelta)); - } - - @AutoValue - abstract static class ComputationMetadata { - private static ComputationMetadata fromProto(ComputationWorkItemMetadata metadataProto) { - return new AutoValue_GrpcDirectGetWorkStream_ComputationMetadata( - metadataProto.getComputationId(), - WindmillTimeUtils.windmillToHarnessWatermark(metadataProto.getInputDataWatermark()), - WindmillTimeUtils.windmillToHarnessWatermark( - metadataProto.getDependentRealtimeInputWatermark())); - } - - abstract String computationId(); - - abstract Instant inputDataWatermark(); - - abstract Instant synchronizedProcessingTime(); - } - - private class WorkItemBuffer { - private final GetWorkTimingInfosTracker workTimingInfosTracker; - private ByteString data; - private @Nullable ComputationMetadata metadata; - - private WorkItemBuffer() { - workTimingInfosTracker = new GetWorkTimingInfosTracker(System::currentTimeMillis); - data = ByteString.EMPTY; - this.metadata = null; - } - - private void append(StreamingGetWorkResponseChunk chunk) { - if (chunk.hasComputationMetadata()) { - this.metadata = ComputationMetadata.fromProto(chunk.getComputationMetadata()); - } - - this.data = data.concat(chunk.getSerializedWorkItem()); - workTimingInfosTracker.addTimingInfo(chunk.getPerWorkItemTimingInfosList()); - } - - private long bufferedSize() { - return data.size(); - } - - private void runAndReset() { - try { - WorkItem workItem = WorkItem.parseFrom(data.newInput()); - updatePendingResponseBudget(1, workItem.getSerializedSize()); - workItemScheduler.scheduleWork( - workItem, - createWatermarks(workItem, Preconditions.checkNotNull(metadata)), - createProcessingContext(Preconditions.checkNotNull(metadata.computationId())), - // After the work item is successfully queued or dropped by ActiveWorkState, remove it - // from the pendingResponseBudget. 
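
[Illustration, not part of this patch] The adjustBudget change above replaces a synchronized read-then-reset with AtomicReference.getAndUpdate, which returns the previous value while atomically installing a zero budget. A minimal sketch of that idiom, with a hypothetical Budget class standing in for GetWorkBudget:

import java.util.concurrent.atomic.AtomicReference;

final class BudgetAdjusterSketch {
  // Immutable (items, bytes) pair standing in for GetWorkBudget in this sketch.
  static final class Budget {
    static final Budget ZERO = new Budget(0, 0);
    final long items;
    final long bytes;

    Budget(long items, long bytes) {
      this.items = items;
      this.bytes = bytes;
    }

    Budget apply(long itemsDelta, long bytesDelta) {
      return new Budget(items + itemsDelta, bytes + bytesDelta);
    }
  }

  private final AtomicReference<Budget> nextAdjustment = new AtomicReference<>(Budget.ZERO);

  // Atomically takes the pending adjustment (resetting it to zero) and folds in the new deltas,
  // so no lock is needed around the read-and-reset.
  Budget takeAdjustment(long itemsDelta, long bytesDelta) {
    return nextAdjustment.getAndUpdate(unused -> Budget.ZERO).apply(itemsDelta, bytesDelta);
  }
}
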
- queuedWorkItem -> updatePendingResponseBudget(-1, -workItem.getSerializedSize()), - workTimingInfosTracker.getLatencyAttributions()); - } catch (IOException e) { - LOG.error("Failed to parse work item from stream: ", e); - } - workTimingInfosTracker.reset(); - data = ByteString.EMPTY; - } - - private Work.ProcessingContext createProcessingContext(String computationId) { - return Work.createProcessingContext( - computationId, getDataStream.get()::requestKeyedData, workCommitter.get()::commit); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java index 033990017b24..cf2e7260592d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java @@ -75,7 +75,7 @@ public static GrpcDispatcherClient create(WindmillStubFactory windmillStubFactor } @VisibleForTesting - static GrpcDispatcherClient forTesting( + public static GrpcDispatcherClient forTesting( WindmillStubFactory windmillGrpcStubFactory, List windmillServiceStubs, List windmillMetadataServiceStubs, @@ -106,7 +106,7 @@ ImmutableSet getDispatcherEndpoints() { } /** Will block the calling thread until the initial endpoints are present. */ - CloudWindmillMetadataServiceV1Alpha1Stub getWindmillMetadataServiceStubBlocking() { + public CloudWindmillMetadataServiceV1Alpha1Stub getWindmillMetadataServiceStubBlocking() { boolean initialized = false; long secondsWaited = 0; while (!initialized) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java index feb15c2ac83c..0e9a0c6316ee 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; +import java.util.Collection; import java.util.Deque; import java.util.List; import java.util.Map; @@ -75,6 +76,7 @@ public final class GrpcGetDataStream private final Consumer> processHeartbeatResponses; private GrpcGetDataStream( + String backendWorkerToken, Function, StreamObserver> startGetDataRpcFn, BackOff backoff, @@ -88,7 +90,13 @@ private GrpcGetDataStream( boolean sendKeyedGetDataRequests, Consumer> processHeartbeatResponses) { super( - startGetDataRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); + "GetDataStream", + startGetDataRpcFn, + backoff, + streamObserverFactory, + streamRegistry, + logEveryNStreamFailures, + backendWorkerToken); this.idGenerator = idGenerator; this.getDataThrottleTimer = getDataThrottleTimer; this.jobHeader = jobHeader; @@ -100,6 +108,7 @@ private GrpcGetDataStream( } public static GrpcGetDataStream create( + String backendWorkerToken, Function, StreamObserver> startGetDataRpcFn, 
BackOff backoff, @@ -114,6 +123,7 @@ public static GrpcGetDataStream create( Consumer> processHeartbeatResponses) { GrpcGetDataStream getDataStream = new GrpcGetDataStream( + backendWorkerToken, startGetDataRpcFn, backoff, streamObserverFactory, @@ -189,11 +199,15 @@ public GlobalData requestGlobalData(GlobalDataRequest request) { } @Override - public void refreshActiveWork(Map> heartbeats) { + public void refreshActiveWork(Map> heartbeats) { + if (isShutdown()) { + throw new WindmillStreamShutdownException("Unable to refresh work for shutdown stream."); + } + StreamingGetDataRequest.Builder builder = StreamingGetDataRequest.newBuilder(); if (sendKeyedGetDataRequests) { long builderBytes = 0; - for (Map.Entry> entry : heartbeats.entrySet()) { + for (Map.Entry> entry : heartbeats.entrySet()) { for (HeartbeatRequest request : entry.getValue()) { // Calculate the bytes with some overhead for proto encoding. long bytes = (long) entry.getKey().length() + request.getSerializedSize() + 10; @@ -224,7 +238,7 @@ public void refreshActiveWork(Map> heartbeats) { } else { // No translation necessary, but we must still respect `RPC_STREAM_CHUNK_SIZE`. long builderBytes = 0; - for (Map.Entry> entry : heartbeats.entrySet()) { + for (Map.Entry> entry : heartbeats.entrySet()) { ComputationHeartbeatRequest.Builder computationHeartbeatBuilder = ComputationHeartbeatRequest.newBuilder().setComputationId(entry.getKey()); for (HeartbeatRequest request : entry.getValue()) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java index 867180fb0d31..09ecbf3f3051 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java @@ -17,49 +17,39 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; -import java.io.IOException; import java.io.PrintWriter; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; -import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequestExtension; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk; import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GetWorkResponseChunkAssembler.AssembledWorkItem; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import 
org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -public final class GrpcGetWorkStream +final class GrpcGetWorkStream extends AbstractWindmillStream implements GetWorkStream { - private static final Logger LOG = LoggerFactory.getLogger(GrpcGetWorkStream.class); - private final GetWorkRequest request; private final WorkItemReceiver receiver; private final ThrottleTimer getWorkThrottleTimer; - private final Map buffers; + private final Map workItemAssemblers; private final AtomicLong inflightMessages; private final AtomicLong inflightBytes; private GrpcGetWorkStream( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -72,16 +62,23 @@ private GrpcGetWorkStream( ThrottleTimer getWorkThrottleTimer, WorkItemReceiver receiver) { super( - startGetWorkRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); + "GetWorkStream", + startGetWorkRpcFn, + backoff, + streamObserverFactory, + streamRegistry, + logEveryNStreamFailures, + backendWorkerToken); this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.receiver = receiver; - this.buffers = new ConcurrentHashMap<>(); + this.workItemAssemblers = new ConcurrentHashMap<>(); this.inflightMessages = new AtomicLong(); this.inflightBytes = new AtomicLong(); } public static GrpcGetWorkStream create( + String backendWorkerToken, Function< StreamObserver, StreamObserver> @@ -95,6 +92,7 @@ public static GrpcGetWorkStream create( WorkItemReceiver receiver) { GrpcGetWorkStream getWorkStream = new GrpcGetWorkStream( + backendWorkerToken, startGetWorkRpcFn, request, backoff, @@ -129,7 +127,7 @@ private void sendRequestExtension(long moreItems, long moreBytes) { @Override protected synchronized void onNewStream() { - buffers.clear(); + workItemAssemblers.clear(); inflightMessages.set(request.getMaxItems()); inflightBytes.set(request.getMaxBytes()); send(StreamingGetWorkRequest.newBuilder().setRequest(request).build()); @@ -145,7 +143,7 @@ public void appendSpecificHtml(PrintWriter writer) { // Number of buffers is same as distinct workers that sent work on this stream. 
writer.format( "GetWorkStream: %d buffers, %d inflight messages allowed, %d inflight bytes allowed", - buffers.size(), inflightMessages.intValue(), inflightBytes.intValue()); + workItemAssemblers.size(), inflightMessages.intValue(), inflightBytes.intValue()); } @Override @@ -160,30 +158,33 @@ public void sendHealthCheck() { @Override protected void onResponse(StreamingGetWorkResponseChunk chunk) { getWorkThrottleTimer.stop(); + workItemAssemblers + .computeIfAbsent(chunk.getStreamId(), unused -> new GetWorkResponseChunkAssembler()) + .append(chunk) + .ifPresent(this::consumeAssembledWorkItem); + } - GrpcGetWorkStream.WorkItemBuffer buffer = - buffers.computeIfAbsent( - chunk.getStreamId(), unused -> new GrpcGetWorkStream.WorkItemBuffer()); - buffer.append(chunk); - - if (chunk.getRemainingBytesForWorkItem() == 0) { - long size = buffer.bufferedSize(); - buffer.runAndReset(); - - // Record the fact that there are now fewer outstanding messages and bytes on the stream. - long numInflight = inflightMessages.decrementAndGet(); - long bytesInflight = inflightBytes.addAndGet(-size); - - // If the outstanding items or bytes limit has gotten too low, top both off with a - // GetWorkExtension. The goal is to keep the limits relatively close to their maximum - // values without sending too many extension requests. - if (numInflight < request.getMaxItems() / 2 || bytesInflight < request.getMaxBytes() / 2) { - long moreItems = request.getMaxItems() - numInflight; - long moreBytes = request.getMaxBytes() - bytesInflight; - inflightMessages.getAndAdd(moreItems); - inflightBytes.getAndAdd(moreBytes); - sendRequestExtension(moreItems, moreBytes); - } + private void consumeAssembledWorkItem(AssembledWorkItem assembledWorkItem) { + receiver.receiveWork( + assembledWorkItem.computationMetadata().computationId(), + assembledWorkItem.computationMetadata().inputDataWatermark(), + assembledWorkItem.computationMetadata().synchronizedProcessingTime(), + assembledWorkItem.workItem(), + assembledWorkItem.latencyAttributions()); + + // Record the fact that there are now fewer outstanding messages and bytes on the stream. + long numInflight = inflightMessages.decrementAndGet(); + long bytesInflight = inflightBytes.addAndGet(-assembledWorkItem.bufferedSize()); + + // If the outstanding items or bytes limit has gotten too low, top both off with a + // GetWorkExtension. The goal is to keep the limits relatively close to their maximum + // values without sending too many extension requests. 
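
[Illustration, not part of this patch] For readers skimming the flow-control logic above, a self-contained sketch of the same rule with hypothetical names and no Windmill types: once outstanding items or bytes drop below half of their maximums, a single extension request tops both back up to the maximums, keeping extension traffic coarse-grained.

import java.util.concurrent.atomic.AtomicLong;

final class ExtensionPolicySketch {
  interface ExtensionSender {
    void sendRequestExtension(long moreItems, long moreBytes);
  }

  private final long maxItems;
  private final long maxBytes;
  private final AtomicLong inflightItems;
  private final AtomicLong inflightBytes;

  ExtensionPolicySketch(long maxItems, long maxBytes) {
    this.maxItems = maxItems;
    this.maxBytes = maxBytes;
    // Counters start at the maximums granted by the initial request.
    this.inflightItems = new AtomicLong(maxItems);
    this.inflightBytes = new AtomicLong(maxBytes);
  }

  // Called after one assembled work item of itemBytes bytes has been consumed.
  void onWorkItemConsumed(long itemBytes, ExtensionSender sender) {
    long items = inflightItems.decrementAndGet();
    long bytes = inflightBytes.addAndGet(-itemBytes);
    if (items < maxItems / 2 || bytes < maxBytes / 2) {
      long moreItems = maxItems - items;
      long moreBytes = maxBytes - bytes;
      inflightItems.getAndAdd(moreItems);
      inflightBytes.getAndAdd(moreBytes);
      sender.sendRequestExtension(moreItems, moreBytes);
    }
  }
}
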
+ if (numInflight < request.getMaxItems() / 2 || bytesInflight < request.getMaxBytes() / 2) { + long moreItems = request.getMaxItems() - numInflight; + long moreBytes = request.getMaxBytes() - bytesInflight; + inflightMessages.getAndAdd(moreItems); + inflightBytes.getAndAdd(moreBytes); + sendRequestExtension(moreItems, moreBytes); } } @@ -204,63 +205,4 @@ public GetWorkBudget remainingBudget() { .setItems(request.getMaxItems() - inflightMessages.get()) .build(); } - - private class WorkItemBuffer { - private final GetWorkTimingInfosTracker workTimingInfosTracker; - private String computation; - @Nullable private Instant inputDataWatermark; - @Nullable private Instant synchronizedProcessingTime; - private ByteString data; - private long bufferedSize; - - @SuppressWarnings("initialization.fields.uninitialized") - WorkItemBuffer() { - workTimingInfosTracker = new GetWorkTimingInfosTracker(System::currentTimeMillis); - data = ByteString.EMPTY; - bufferedSize = 0; - } - - @SuppressWarnings("NullableProblems") - private void setMetadata(Windmill.ComputationWorkItemMetadata metadata) { - this.computation = metadata.getComputationId(); - this.inputDataWatermark = - WindmillTimeUtils.windmillToHarnessWatermark(metadata.getInputDataWatermark()); - this.synchronizedProcessingTime = - WindmillTimeUtils.windmillToHarnessWatermark( - metadata.getDependentRealtimeInputWatermark()); - } - - private void append(StreamingGetWorkResponseChunk chunk) { - if (chunk.hasComputationMetadata()) { - setMetadata(chunk.getComputationMetadata()); - } - - this.data = data.concat(chunk.getSerializedWorkItem()); - this.bufferedSize += chunk.getSerializedWorkItem().size(); - workTimingInfosTracker.addTimingInfo(chunk.getPerWorkItemTimingInfosList()); - } - - private long bufferedSize() { - return bufferedSize; - } - - private void runAndReset() { - try { - Windmill.WorkItem workItem = Windmill.WorkItem.parseFrom(data.newInput()); - List getWorkStreamLatencies = - workTimingInfosTracker.getLatencyAttributions(); - receiver.receiveWork( - computation, - inputDataWatermark, - synchronizedProcessingTime, - workItem, - getWorkStreamLatencies); - } catch (IOException e) { - LOG.error("Failed to parse work item from stream: ", e); - } - workTimingInfosTracker.reset(); - data = ByteString.EMPTY; - bufferedSize = 0; - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java index 3672f02c813f..44e21a9b18ed 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java @@ -65,11 +65,13 @@ private GrpcGetWorkerMetadataStream( ThrottleTimer getWorkerMetadataThrottleTimer, Consumer serverMappingConsumer) { super( + "GetWorkerMetadataStream", startGetWorkerMetadataRpcFn, backoff, streamObserverFactory, streamRegistry, - logEveryNStreamFailures); + logEveryNStreamFailures, + ""); this.workerMetadataRequest = WorkerMetadataRequest.newBuilder().setHeader(jobHeader).build(); this.metadataVersion = metadataVersion; this.getWorkerMetadataThrottleTimer = getWorkerMetadataThrottleTimer; diff --git 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java index 0ab03a803180..1fce4d238b2e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java @@ -254,11 +254,6 @@ public void setWindmillServiceEndpoints(Set endpoints) { dispatcherClient.consumeWindmillDispatcherEndpoints(ImmutableSet.copyOf(endpoints)); } - @Override - public boolean isReady() { - return dispatcherClient.hasInitializedEndpoints(); - } - private synchronized void initializeLocalHost(int port) { this.maxBackoff = Duration.millis(500); if (options.isEnableStreamingEngine()) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java index 14866f3f586b..92f031db9972 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java @@ -37,6 +37,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationHeartbeatResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.WindmillEndpoints; import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; @@ -44,10 +45,12 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.sdk.util.FluentBackoff; @@ -69,6 +72,7 @@ public class GrpcWindmillStreamFactory implements StatusDataProvider { private static final int DEFAULT_STREAMING_RPC_BATCH_LIMIT = Integer.MAX_VALUE; private static final int DEFAULT_WINDMILL_MESSAGES_BETWEEN_IS_READY_CHECKS = 1; private static final int NO_HEALTH_CHECKS = -1; + 
private static final String NO_BACKEND_WORKER_TOKEN = ""; private final JobHeader jobHeader; private final int logEveryNStreamFailures; @@ -179,6 +183,7 @@ public GetWorkStream createGetWorkStream( ThrottleTimer getWorkThrottleTimer, WorkItemReceiver processWorkItem) { return GrpcGetWorkStream.create( + NO_BACKEND_WORKER_TOKEN, responseObserver -> withDefaultDeadline(stub).getWorkStream(responseObserver), request, grpcBackOff.get(), @@ -190,21 +195,24 @@ public GetWorkStream createGetWorkStream( } public GetWorkStream createDirectGetWorkStream( - CloudWindmillServiceV1Alpha1Stub stub, + WindmillConnection connection, GetWorkRequest request, ThrottleTimer getWorkThrottleTimer, - Supplier getDataStream, + Supplier heartbeatSender, + Supplier getDataClient, Supplier workCommitter, WorkItemScheduler workItemScheduler) { return GrpcDirectGetWorkStream.create( - responseObserver -> withDefaultDeadline(stub).getWorkStream(responseObserver), + connection.backendWorkerToken(), + responseObserver -> withDefaultDeadline(connection.stub()).getWorkStream(responseObserver), request, grpcBackOff.get(), newStreamObserverFactory(), streamRegistry, logEveryNStreamFailures, getWorkThrottleTimer, - getDataStream, + heartbeatSender, + getDataClient, workCommitter, workItemScheduler); } @@ -212,6 +220,7 @@ public GetWorkStream createDirectGetWorkStream( public GetDataStream createGetDataStream( CloudWindmillServiceV1Alpha1Stub stub, ThrottleTimer getDataThrottleTimer) { return GrpcGetDataStream.create( + NO_BACKEND_WORKER_TOKEN, responseObserver -> withDefaultDeadline(stub).getDataStream(responseObserver), grpcBackOff.get(), newStreamObserverFactory(), @@ -228,6 +237,7 @@ public GetDataStream createGetDataStream( public CommitWorkStream createCommitWorkStream( CloudWindmillServiceV1Alpha1Stub stub, ThrottleTimer commitWorkThrottleTimer) { return GrpcCommitWorkStream.create( + NO_BACKEND_WORKER_TOKEN, responseObserver -> withDefaultDeadline(stub).commitWorkStream(responseObserver), grpcBackOff.get(), newStreamObserverFactory(), diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java new file mode 100644 index 000000000000..4ea209f31b1d --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverCancelledException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers; + +import org.apache.beam.sdk.annotations.Internal; + +@Internal +public final class StreamObserverCancelledException extends RuntimeException { + public StreamObserverCancelledException(Throwable cause) { + super(cause); + } + + public StreamObserverCancelledException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java index 17c9f7d80d5d..00784493fe3d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java @@ -18,7 +18,6 @@ package org.apache.beam.runners.dataflow.worker.windmill.work; import java.util.Collection; -import java.util.function.Consumer; import javax.annotation.CheckReturnValue; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; @@ -36,8 +35,6 @@ public interface WorkItemScheduler { * @param workItem {@link WorkItem} to be processed. * @param watermarks processing watermarks for the workItem. * @param processingContext for processing the workItem. - * @param ackWorkItemQueued Called after an attempt to queue the work item for processing. Used to - * free up pending budget. * @param getWorkStreamLatencies Latencies per processing stage for the WorkItem for reporting * back to Streaming Engine backend. */ @@ -45,6 +42,5 @@ void scheduleWork( WorkItem workItem, Watermarks watermarks, Work.ProcessingContext processingContext, - Consumer ackWorkItemQueued, Collection getWorkStreamLatencies); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java index 3a17222d3e6b..403bb99efb4c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java @@ -26,14 +26,13 @@ import java.util.Map.Entry; import java.util.function.Function; import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableCollection; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Evenly distributes the provided budget across the available {@link WindmillStreamSender}(s). */ +/** Evenly distributes the provided budget across the available {@link GetWorkBudgetSpender}(s). 
*/ @Internal final class EvenGetWorkBudgetDistributor implements GetWorkBudgetDistributor { private static final Logger LOG = LoggerFactory.getLogger(EvenGetWorkBudgetDistributor.class); @@ -50,10 +49,10 @@ private static boolean isBelowFiftyPercentOfTarget( } @Override - public void distributeBudget( - ImmutableCollection streams, GetWorkBudget getWorkBudget) { - if (streams.isEmpty()) { - LOG.debug("Cannot distribute budget to no streams."); + public void distributeBudget( + ImmutableCollection budgetOwners, GetWorkBudget getWorkBudget) { + if (budgetOwners.isEmpty()) { + LOG.debug("Cannot distribute budget to no owners."); return; } @@ -62,23 +61,21 @@ public void distributeBudget( return; } - Map desiredBudgets = - computeDesiredBudgets(streams, getWorkBudget); + Map desiredBudgets = computeDesiredBudgets(budgetOwners, getWorkBudget); - for (Entry streamAndDesiredBudget : - desiredBudgets.entrySet()) { - WindmillStreamSender stream = streamAndDesiredBudget.getKey(); + for (Entry streamAndDesiredBudget : desiredBudgets.entrySet()) { + GetWorkBudgetSpender getWorkBudgetSpender = streamAndDesiredBudget.getKey(); GetWorkBudget desired = streamAndDesiredBudget.getValue(); - GetWorkBudget remaining = stream.remainingGetWorkBudget(); + GetWorkBudget remaining = getWorkBudgetSpender.remainingBudget(); if (isBelowFiftyPercentOfTarget(remaining, desired)) { GetWorkBudget adjustment = desired.subtract(remaining); - stream.adjustBudget(adjustment); + getWorkBudgetSpender.adjustBudget(adjustment); } } } - private ImmutableMap computeDesiredBudgets( - ImmutableCollection streams, GetWorkBudget totalGetWorkBudget) { + private ImmutableMap computeDesiredBudgets( + ImmutableCollection streams, GetWorkBudget totalGetWorkBudget) { GetWorkBudget activeWorkBudget = activeWorkBudgetSupplier.get(); LOG.info("Current active work budget: {}", activeWorkBudget); // TODO: Fix possibly non-deterministic handing out of budgets. 
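
[Illustration, not part of this patch] A standalone sketch of the redistribution rule implemented above, with a hypothetical Spender interface standing in for GetWorkBudgetSpender and items-only budgets for brevity: each spender is assigned an even share of the total, but an adjustment is only sent when its remaining budget has fallen below half of that share.

import java.util.List;

final class EvenBudgetSketch {
  interface Spender {
    long remainingItems();

    void adjustItems(long itemsDelta);
  }

  static void distribute(List<Spender> spenders, long totalItems) {
    if (spenders.isEmpty() || totalItems <= 0) {
      return;
    }
    long desiredPerSpender = Math.max(1, totalItems / spenders.size());
    for (Spender spender : spenders) {
      long remaining = spender.remainingItems();
      // Only top up a spender once it has burned through more than half of its target share.
      if (remaining < desiredPerSpender / 2) {
        spender.adjustItems(desiredPerSpender - remaining);
      }
    }
  }
}
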
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java index 3ec9718e041e..d21de17e522c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java @@ -17,7 +17,6 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.budget; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableCollection; @@ -28,6 +27,6 @@ */ @Internal public interface GetWorkBudgetDistributor { - void distributeBudget( - ImmutableCollection streams, GetWorkBudget getWorkBudget); + void distributeBudget( + ImmutableCollection streams, GetWorkBudget getWorkBudget); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java new file mode 100644 index 000000000000..254b2589062e --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.budget; + +/** + * Represents something that spends {@link + * org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget} + */ +public interface GetWorkBudgetSpender { + void adjustBudget(long itemsDelta, long bytesDelta); + + default void adjustBudget(GetWorkBudget adjustment) { + adjustBudget(adjustment.items(), adjustment.bytes()); + } + + GetWorkBudget remainingBudget(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java index 334ab8efeae2..b0b6377dd8b1 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java @@ -23,7 +23,7 @@ import java.util.Optional; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Supplier; import javax.annotation.concurrent.ThreadSafe; @@ -31,6 +31,7 @@ import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.DataflowMapTaskExecutorFactory; import org.apache.beam.runners.dataflow.worker.HotKeyLogger; +import org.apache.beam.runners.dataflow.worker.OperationalLimits; import org.apache.beam.runners.dataflow.worker.ReaderCache; import org.apache.beam.runners.dataflow.worker.WorkItemCancelledException; import org.apache.beam.runners.dataflow.worker.logging.DataflowWorkerLoggingMDC; @@ -43,6 +44,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcherFactory; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.Commit; @@ -73,7 +75,7 @@ public final class StreamingWorkScheduler { private final DataflowWorkerHarnessOptions options; private final Supplier clock; private final ComputationWorkExecutorFactory computationWorkExecutorFactory; - private final SideInputStateFetcher sideInputStateFetcher; + private final SideInputStateFetcherFactory sideInputStateFetcherFactory; private final FailureTracker failureTracker; private final WorkFailureProcessor workFailureProcessor; private final StreamingCommitFinalizer commitFinalizer; @@ -81,13 +83,13 @@ public final class StreamingWorkScheduler { private final HotKeyLogger hotKeyLogger; private final ConcurrentMap stageInfoMap; private final DataflowExecutionStateSampler sampler; - private final AtomicInteger maxWorkItemCommitBytes; + private final AtomicReference operationalLimits; public StreamingWorkScheduler( DataflowWorkerHarnessOptions options, Supplier clock, ComputationWorkExecutorFactory computationWorkExecutorFactory, - 
SideInputStateFetcher sideInputStateFetcher, + SideInputStateFetcherFactory sideInputStateFetcherFactory, FailureTracker failureTracker, WorkFailureProcessor workFailureProcessor, StreamingCommitFinalizer commitFinalizer, @@ -95,11 +97,11 @@ public StreamingWorkScheduler( HotKeyLogger hotKeyLogger, ConcurrentMap stageInfoMap, DataflowExecutionStateSampler sampler, - AtomicInteger maxWorkItemCommitBytes) { + AtomicReference operationalLimits) { this.options = options; this.clock = clock; this.computationWorkExecutorFactory = computationWorkExecutorFactory; - this.sideInputStateFetcher = sideInputStateFetcher; + this.sideInputStateFetcherFactory = sideInputStateFetcherFactory; this.failureTracker = failureTracker; this.workFailureProcessor = workFailureProcessor; this.commitFinalizer = commitFinalizer; @@ -107,7 +109,7 @@ public StreamingWorkScheduler( this.hotKeyLogger = hotKeyLogger; this.stageInfoMap = stageInfoMap; this.sampler = sampler; - this.maxWorkItemCommitBytes = maxWorkItemCommitBytes; + this.operationalLimits = operationalLimits; } public static StreamingWorkScheduler create( @@ -117,13 +119,12 @@ public static StreamingWorkScheduler create( DataflowMapTaskExecutorFactory mapTaskExecutorFactory, BoundedQueueExecutor workExecutor, Function stateCacheFactory, - Function fetchGlobalDataFn, FailureTracker failureTracker, WorkFailureProcessor workFailureProcessor, StreamingCounters streamingCounters, HotKeyLogger hotKeyLogger, DataflowExecutionStateSampler sampler, - AtomicInteger maxWorkItemCommitBytes, + AtomicReference operationalLimits, IdGenerator idGenerator, ConcurrentMap stageInfoMap) { ComputationWorkExecutorFactory computationWorkExecutorFactory = @@ -140,7 +141,7 @@ public static StreamingWorkScheduler create( options, clock, computationWorkExecutorFactory, - new SideInputStateFetcher(fetchGlobalDataFn, options), + SideInputStateFetcherFactory.fromOptions(options), failureTracker, workFailureProcessor, StreamingCommitFinalizer.create(workExecutor), @@ -148,7 +149,7 @@ public static StreamingWorkScheduler create( hotKeyLogger, stageInfoMap, sampler, - maxWorkItemCommitBytes); + operationalLimits); } private static long computeShuffleBytesRead(Windmill.WorkItem workItem) { @@ -292,7 +293,7 @@ private Windmill.WorkItemCommitRequest validateCommitRequestSize( Windmill.WorkItemCommitRequest commitRequest, String computationId, Windmill.WorkItem workItem) { - int byteLimit = maxWorkItemCommitBytes.get(); + long byteLimit = operationalLimits.get().maxWorkItemCommitBytes; int commitSize = commitRequest.getSerializedSize(); int estimatedCommitSize = commitSize < 0 ? Integer.MAX_VALUE : commitSize; @@ -347,7 +348,8 @@ private ExecuteWorkResult executeWork( try { WindmillStateReader stateReader = work.createWindmillStateReader(); - SideInputStateFetcher localSideInputStateFetcher = sideInputStateFetcher.byteTrackingView(); + SideInputStateFetcher localSideInputStateFetcher = + sideInputStateFetcherFactory.createSideInputStateFetcher(work::fetchSideInput); // If the read output KVs, then we can decode Windmill's byte key into userland // key object and provide it to the execution context for use with per-key state. @@ -375,7 +377,12 @@ private ExecuteWorkResult executeWork( // Blocks while executing work. 
computationWorkExecutor.executeWork( - executionKey, work, stateReader, localSideInputStateFetcher, outputBuilder); + executionKey, + work, + stateReader, + localSideInputStateFetcher, + operationalLimits.get(), + outputBuilder); if (work.isFailed()) { throw new WorkItemCancelledException(workItem.getShardingKey()); @@ -397,8 +404,7 @@ private ExecuteWorkResult executeWork( computationState.releaseComputationWorkExecutor(computationWorkExecutor); work.setState(Work.State.COMMIT_QUEUED); - outputBuilder.addAllPerWorkItemLatencyAttributions( - work.getLatencyAttributions(false, sampler)); + outputBuilder.addAllPerWorkItemLatencyAttributions(work.getLatencyAttributions(sampler)); return ExecuteWorkResult.create( outputBuilder, stateReader.getBytesRead() + localSideInputStateFetcher.getBytesRead()); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index 96a6feec1da0..781285def020 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -17,13 +17,26 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap.toImmutableMap; + +import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; +import javax.annotation.Nullable; import javax.annotation.concurrent.ThreadSafe; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; @@ -37,29 +50,39 @@ * threshold is determined by {@link #activeWorkRefreshPeriodMillis} */ @ThreadSafe -public abstract class ActiveWorkRefresher { +@Internal +public final class ActiveWorkRefresher { private static final Logger LOG = LoggerFactory.getLogger(ActiveWorkRefresher.class); + private static final String FAN_OUT_REFRESH_WORK_EXECUTOR_NAME = + "FanOutActiveWorkRefreshExecutor-%d"; - protected final Supplier clock; - protected final int activeWorkRefreshPeriodMillis; - protected final Supplier> computations; - protected final DataflowExecutionStateSampler sampler; + private final Supplier clock; + private final int activeWorkRefreshPeriodMillis; + private final Supplier> computations; + private final DataflowExecutionStateSampler sampler; private final int stuckCommitDurationMillis; + private final HeartbeatTracker heartbeatTracker; private final ScheduledExecutorService activeWorkRefreshExecutor; + private final ExecutorService 
fanOutActiveWorkRefreshExecutor; - protected ActiveWorkRefresher( + public ActiveWorkRefresher( Supplier clock, int activeWorkRefreshPeriodMillis, int stuckCommitDurationMillis, Supplier> computations, DataflowExecutionStateSampler sampler, - ScheduledExecutorService activeWorkRefreshExecutor) { + ScheduledExecutorService activeWorkRefreshExecutor, + HeartbeatTracker heartbeatTracker) { this.clock = clock; this.activeWorkRefreshPeriodMillis = activeWorkRefreshPeriodMillis; this.stuckCommitDurationMillis = stuckCommitDurationMillis; this.computations = computations; this.sampler = sampler; this.activeWorkRefreshExecutor = activeWorkRefreshExecutor; + this.heartbeatTracker = heartbeatTracker; + this.fanOutActiveWorkRefreshExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder().setNameFormat(FAN_OUT_REFRESH_WORK_EXECUTOR_NAME).build()); } @SuppressWarnings("FutureReturnValueIgnored") @@ -103,5 +126,70 @@ private void invalidateStuckCommits() { } } - protected abstract void refreshActiveWork(); + private void refreshActiveWork() { + Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); + Map heartbeatsBySender = + aggregateHeartbeatsBySender(refreshDeadline); + if (heartbeatsBySender.isEmpty()) { + return; + } + + List> fanOutRefreshActiveWork = new ArrayList<>(); + + // Send the first heartbeat on the calling thread, and fan out the rest via the + // fanOutActiveWorkRefreshExecutor. + @Nullable Map.Entry firstHeartbeat = null; + for (Map.Entry heartbeat : heartbeatsBySender.entrySet()) { + if (firstHeartbeat == null) { + firstHeartbeat = heartbeat; + } else { + fanOutRefreshActiveWork.add( + CompletableFuture.runAsync( + () -> sendHeartbeatSafely(heartbeat), fanOutActiveWorkRefreshExecutor)); + } + } + + sendHeartbeatSafely(firstHeartbeat); + fanOutRefreshActiveWork.forEach(CompletableFuture::join); + } + + /** Aggregate the heartbeats across computations by HeartbeatSender for correct fan out. */ + private Map aggregateHeartbeatsBySender(Instant refreshDeadline) { + Map heartbeatsBySender = new HashMap<>(); + + // Aggregate the heartbeats across computations by HeartbeatSender for correct fan out. + for (ComputationState computationState : computations.get()) { + for (RefreshableWork work : computationState.getRefreshableWork(refreshDeadline)) { + heartbeatsBySender + .computeIfAbsent(work.heartbeatSender(), ignored -> Heartbeats.builder()) + .add(computationState.getComputationId(), work, sampler); + } + } + + return heartbeatsBySender.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, e -> e.getValue().build())); + } + + /** + * Send the {@link Heartbeats} using the {@link HeartbeatSender}. Safe since exceptions are caught + * and logged. 
+ */ + private void sendHeartbeatSafely(Map.Entry heartbeat) { + try (AutoCloseable ignored = heartbeatTracker.trackHeartbeats(heartbeat.getValue().size())) { + HeartbeatSender sender = heartbeat.getKey(); + Heartbeats heartbeats = heartbeat.getValue(); + sender.sendHeartbeats(heartbeats); + } catch (Exception e) { + LOG.error( + "Unable to send {} heartbeats to {}.", + heartbeat.getValue().size(), + heartbeat.getKey(), + e); + } + } + + @FunctionalInterface + public interface HeartbeatTracker { + AutoCloseable trackHeartbeats(int numHeartbeats); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java deleted file mode 100644 index 5a59a7f1ae01..000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefreshers.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ScheduledExecutorService; -import java.util.function.Consumer; -import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; -import org.joda.time.Instant; - -/** Utility class for {@link ActiveWorkRefresher}. 
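
[Illustration, not part of this patch] To make the refresh path above easier to follow, here is a generic sketch of the grouping-and-fan-out shape, with plain generics standing in for HeartbeatSender and Heartbeats: heartbeats are grouped by sender, the first group is sent on the calling thread, the remaining groups run on an executor, and the caller joins on all of them before returning.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.function.BiConsumer;

final class FanOutSketch {
  static <S, H> void sendGrouped(
      Map<S, H> heartbeatsBySender, BiConsumer<S, H> send, ExecutorService executor) {
    Map.Entry<S, H> first = null;
    List<CompletableFuture<Void>> pending = new ArrayList<>();
    for (Map.Entry<S, H> entry : heartbeatsBySender.entrySet()) {
      if (first == null) {
        first = entry; // Sent below on the calling thread.
      } else {
        pending.add(
            CompletableFuture.runAsync(
                () -> send.accept(entry.getKey(), entry.getValue()), executor));
      }
    }
    if (first != null) {
      send.accept(first.getKey(), first.getValue());
    }
    pending.forEach(CompletableFuture::join);
  }
}
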
*/ -public final class ActiveWorkRefreshers { - public static ActiveWorkRefresher createDispatchedActiveWorkRefresher( - Supplier clock, - int activeWorkRefreshPeriodMillis, - int stuckCommitDurationMillis, - Supplier> computations, - DataflowExecutionStateSampler sampler, - Consumer>> activeWorkRefresherFn, - ScheduledExecutorService scheduledExecutorService) { - return new DispatchedActiveWorkRefresher( - clock, - activeWorkRefreshPeriodMillis, - stuckCommitDurationMillis, - computations, - sampler, - activeWorkRefresherFn, - scheduledExecutorService); - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java new file mode 100644 index 000000000000..b0f714433805 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ApplianceHeartbeatSender.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import java.util.Collection; +import java.util.Map; +import java.util.function.Consumer; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.annotations.Internal; + +/** Streaming appliance implementation of {@link HeartbeatSender}. */ +@Internal +public final class ApplianceHeartbeatSender implements HeartbeatSender { + private final Consumer sendHeartbeatFn; + + public ApplianceHeartbeatSender(Consumer sendHeartbeatFn) { + this.sendHeartbeatFn = sendHeartbeatFn; + } + + /** + * Appliance which sends heartbeats (used to refresh active work) as KeyedGetDataRequests. So we + * must translate the HeartbeatRequest to a KeyedGetDataRequest here. 
+ */ + @Override + public void sendHeartbeats(Heartbeats heartbeats) { + Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); + + for (Map.Entry> entry : + heartbeats.heartbeatRequests().asMap().entrySet()) { + Windmill.ComputationGetDataRequest.Builder perComputationBuilder = + Windmill.ComputationGetDataRequest.newBuilder(); + perComputationBuilder.setComputationId(entry.getKey()); + for (Windmill.HeartbeatRequest request : entry.getValue()) { + perComputationBuilder.addRequests( + Windmill.KeyedGetDataRequest.newBuilder() + .setShardingKey(request.getShardingKey()) + .setWorkToken(request.getWorkToken()) + .setCacheToken(request.getCacheToken()) + .addAllLatencyAttribution(request.getLatencyAttributionList()) + .build()); + } + builder.addRequests(perComputationBuilder.build()); + } + + sendHeartbeatFn.accept(builder.build()); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java deleted file mode 100644 index f81233498fe3..000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresher.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; - -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ScheduledExecutorService; -import java.util.function.Consumer; -import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; -import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.joda.time.Duration; -import org.joda.time.Instant; - -final class DispatchedActiveWorkRefresher extends ActiveWorkRefresher { - - private final Consumer>> activeWorkRefresherFn; - - DispatchedActiveWorkRefresher( - Supplier clock, - int activeWorkRefreshPeriodMillis, - int stuckCommitDurationMillis, - Supplier> computations, - DataflowExecutionStateSampler sampler, - Consumer>> activeWorkRefresherFn, - ScheduledExecutorService scheduledExecutorService) { - super( - clock, - activeWorkRefreshPeriodMillis, - stuckCommitDurationMillis, - computations, - sampler, - scheduledExecutorService); - this.activeWorkRefresherFn = activeWorkRefresherFn; - } - - @Override - protected void refreshActiveWork() { - Map> heartbeats = new HashMap<>(); - Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); - - for (ComputationState computationState : computations.get()) { - heartbeats.put( - computationState.getComputationId(), - computationState.getKeyHeartbeats(refreshDeadline, sampler)); - } - - activeWorkRefresherFn.accept(heartbeats); - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java new file mode 100644 index 000000000000..33a55d1927f8 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/FixedStreamHeartbeatSender.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import java.util.Objects; +import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.sdk.annotations.Internal; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link HeartbeatSender} implementation that sends heartbeats directly on the underlying stream if + * the stream is not closed. + * + * @implNote + *

{@link #equals(Object)} and {@link #hashCode()} implementations delegate to internal + * {@link GetDataStream} implementations so that requests can be grouped and sent on the same + * stream instance. + *

This class is a stateless decorator over the underlying stream.
+ */
+@Internal
+public final class FixedStreamHeartbeatSender implements HeartbeatSender {
+  private static final Logger LOG = LoggerFactory.getLogger(FixedStreamHeartbeatSender.class);
+  private final GetDataStream getDataStream;
+
+  private FixedStreamHeartbeatSender(GetDataStream getDataStream) {
+    this.getDataStream = getDataStream;
+  }
+
+  public static FixedStreamHeartbeatSender create(GetDataStream getDataStream) {
+    return new FixedStreamHeartbeatSender(getDataStream);
+  }
+
+  @Override
+  public void sendHeartbeats(Heartbeats heartbeats) {
+    @Nullable String originalThreadName = null;
+    try {
+      String backendWorkerToken = getDataStream.backendWorkerToken();
+      if (!backendWorkerToken.isEmpty()) {
+        // Decorate the thread name w/ the backendWorkerToken for debugging. Resets the thread's
+        // name after sending the heartbeats succeeds or fails.
+        originalThreadName = Thread.currentThread().getName();
+        Thread.currentThread().setName(originalThreadName + "-" + backendWorkerToken);
+      }
+      getDataStream.refreshActiveWork(heartbeats.heartbeatRequests().asMap());
+    } catch (AbstractWindmillStream.WindmillStreamShutdownException e) {
+      LOG.warn(
+          "Trying to refresh work w/ {} heartbeats on stream={} after work has moved off of worker.",
+          heartbeats.heartbeatRequests().size(),
+          getDataStream.backendWorkerToken());
+      heartbeats.work().forEach(RefreshableWork::setFailed);
+    } finally {
+      if (originalThreadName != null) {
+        Thread.currentThread().setName(originalThreadName);
+      }
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(FixedStreamHeartbeatSender.class, getDataStream);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    return obj instanceof FixedStreamHeartbeatSender
+        && getDataStream.equals(((FixedStreamHeartbeatSender) obj).getDataStream);
+  }
+
+  @Override
+  public String toString() {
+    return "HeartbeatSender-" + getDataStream.backendWorkerToken();
+  }
+}
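Illustrative sketch (not part of the change itself): the equals()/hashCode() contract above is what lets an active-work refresher group heartbeats by sender, per the batching note on HeartbeatSender below. Assumptions in this sketch: a GetDataStream named stream is already in hand, RefreshableWork exposes a heartbeatSender() accessor, and refreshableWork/computationId are free variables; Heartbeats and its builder are the value type added later in this change.

    // Senders wrapping the same GetDataStream compare equal, so their requests share one batch.
    HeartbeatSender a = FixedStreamHeartbeatSender.create(stream);
    HeartbeatSender b = FixedStreamHeartbeatSender.create(stream);
    // a.equals(b) && a.hashCode() == b.hashCode()

    // Group refreshable work by sender, then send one Heartbeats batch per distinct sender.
    Map<HeartbeatSender, Heartbeats.Builder> batches = new HashMap<>();
    for (RefreshableWork work : refreshableWork) {
      batches
          .computeIfAbsent(work.heartbeatSender(), unused -> Heartbeats.builder())
          .add(computationId, work, DataflowExecutionStateSampler.instance());
    }
    batches.forEach((sender, batch) -> sender.sendHeartbeats(batch.build()));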
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java
new file mode 100644
index 000000000000..06559344332c
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/HeartbeatSender.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.work.refresh;
+
+/**
+ * Interface for sending heartbeats.
+ *
+ * @implNote Batching/grouping of heartbeats is performed by HeartbeatSender equality.
+ */
+@FunctionalInterface
+public interface HeartbeatSender {
+  /**
+   * Sends heartbeats. Each heartbeat represents a WorkItem that is actively being processed and
+   * belongs to the computation.
+   */
+  void sendHeartbeats(Heartbeats heartbeats);
+}
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java
new file mode 100644
index 000000000000..071bf7fa3d43
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/Heartbeats.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.work.refresh;
+
+import com.google.auto.value.AutoValue;
+import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler;
+import org.apache.beam.runners.dataflow.worker.streaming.RefreshableWork;
+import org.apache.beam.runners.dataflow.worker.windmill.Windmill;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableListMultimap;
+
+/** Heartbeat requests and the work that was used to generate the heartbeat requests.
*/ +@AutoValue +abstract class Heartbeats { + + static Heartbeats.Builder builder() { + return new AutoValue_Heartbeats.Builder(); + } + + abstract ImmutableList work(); + + abstract ImmutableListMultimap heartbeatRequests(); + + final int size() { + return heartbeatRequests().asMap().size(); + } + + @AutoValue.Builder + abstract static class Builder { + + abstract ImmutableList.Builder workBuilder(); + + abstract ImmutableListMultimap.Builder + heartbeatRequestsBuilder(); + + final Builder add( + String computationId, RefreshableWork work, DataflowExecutionStateSampler sampler) { + workBuilder().add(work); + heartbeatRequestsBuilder().put(computationId, createHeartbeatRequest(work, sampler)); + return this; + } + + private Windmill.HeartbeatRequest createHeartbeatRequest( + RefreshableWork work, DataflowExecutionStateSampler sampler) { + return Windmill.HeartbeatRequest.newBuilder() + .setShardingKey(work.getShardedKey().shardingKey()) + .setWorkToken(work.id().workToken()) + .setCacheToken(work.id().cacheToken()) + .addAllLatencyAttribution(work.getHeartbeatLatencyAttributions(sampler)) + .build(); + } + + abstract Heartbeats build(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java new file mode 100644 index 000000000000..e571f89f142c --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/StreamPoolHeartbeatSender.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; + +import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.sdk.annotations.Internal; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** StreamingEngine stream pool based implementation of {@link HeartbeatSender}. 
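Illustrative sketch (not part of the change itself): a pool-backed sender is built from a WindmillStreamPool of GetDataStream. The WindmillStreamPool.create(numStreams, streamTimeout, streamSupplier) factory and the windmillServer::getDataStream supplier below are assumptions for the sketch:

    WindmillStreamPool<GetDataStream> heartbeatPool =
        WindmillStreamPool.create(1, Duration.standardMinutes(2), windmillServer::getDataStream);
    HeartbeatSender sender = new StreamPoolHeartbeatSender(heartbeatPool);

Each sendHeartbeats call borrows a stream via getCloseableStream(), refreshes the active work on it, and returns the stream to the pool when the closeable wrapper is closed, as the class body below shows.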
*/ +@Internal +public final class StreamPoolHeartbeatSender implements HeartbeatSender { + private static final Logger LOG = LoggerFactory.getLogger(StreamPoolHeartbeatSender.class); + + private final WindmillStreamPool heartbeatStreamPool; + + public StreamPoolHeartbeatSender( + WindmillStreamPool heartbeatStreamPool) { + this.heartbeatStreamPool = heartbeatStreamPool; + } + + @Override + public void sendHeartbeats(Heartbeats heartbeats) { + try (CloseableStream closeableStream = + heartbeatStreamPool.getCloseableStream()) { + closeableStream.stream().refreshActiveWork(heartbeats.heartbeatRequests().asMap()); + } catch (Exception e) { + LOG.warn("Error occurred sending heartbeats=[{}].", heartbeats, e); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContextTest.java index 18bd814b4df7..4062fbf6ebed 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/BatchModeExecutionContextTest.java @@ -43,6 +43,7 @@ import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.metrics.StringSet; import org.apache.beam.sdk.options.PipelineOptionsFactory; @@ -266,7 +267,7 @@ public void extractThrottleTimeCounters() { .getCounter( MetricName.named( BatchModeExecutionContext.DATASTORE_THROTTLE_TIME_NAMESPACE, - BatchModeExecutionContext.THROTTLE_TIME_COUNTER_NAME)); + Metrics.THROTTLE_TIME_COUNTER_NAME)); counter.inc(12000); counter.inc(17000); counter.inc(1000); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java index 127d46b7caf6..b3f7467cdbd3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java @@ -28,6 +28,7 @@ import static org.junit.Assert.assertFalse; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -89,11 +90,10 @@ public final class FakeWindmillServer extends WindmillServerStub { private final AtomicInteger expectedExceptionCount; private final ErrorCollector errorCollector; private final ConcurrentHashMap> droppedStreamingCommits; - private int commitsRequested = 0; private final List getDataRequests = new ArrayList<>(); - private boolean isReady = true; - private boolean dropStreamingCommits = false; private final Consumer> processHeartbeatResponses; + private int commitsRequested = 0; + private boolean dropStreamingCommits = false; @GuardedBy("this") private ImmutableSet dispatcherEndpoints; @@ -232,7 +232,15 @@ public GetWorkStream getWorkStream(Windmill.GetWorkRequest request, WorkItemRece final CountDownLatch done = new CountDownLatch(1); return new GetWorkStream() { @Override - public void 
close() { + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() {} + + @Override + public void halfClose() { done.countDown(); } @@ -257,7 +265,7 @@ public boolean awaitTermination(int time, TimeUnit unit) throws InterruptedExcep try { sleepMillis(500); } catch (InterruptedException e) { - close(); + halfClose(); Thread.currentThread().interrupt(); } continue; @@ -294,6 +302,14 @@ public Instant startTime() { public GetDataStream getDataStream() { Instant startTime = Instant.now(); return new GetDataStream() { + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() {} + @Override public Windmill.KeyedGetDataResponse requestKeyedData( String computation, KeyedGetDataRequest request) { @@ -330,9 +346,9 @@ public Windmill.GlobalData requestGlobalData(Windmill.GlobalDataRequest request) } @Override - public void refreshActiveWork(Map> heartbeats) { + public void refreshActiveWork(Map> heartbeats) { Windmill.GetDataRequest.Builder builder = Windmill.GetDataRequest.newBuilder(); - for (Map.Entry> entry : heartbeats.entrySet()) { + for (Map.Entry> entry : heartbeats.entrySet()) { builder.addComputationHeartbeatRequest( ComputationHeartbeatRequest.newBuilder() .setComputationId(entry.getKey()) @@ -348,7 +364,7 @@ public void onHeartbeatResponse(List responses) { } @Override - public void close() {} + public void halfClose() {} @Override public boolean awaitTermination(int time, TimeUnit unit) { @@ -368,18 +384,16 @@ public CommitWorkStream commitWorkStream() { return new CommitWorkStream() { @Override - public RequestBatcher batcher() { - return new RequestBatcher() { - class RequestAndDone { - final Consumer onDone; - final WorkItemCommitRequest request; + public String backendWorkerToken() { + return ""; + } - RequestAndDone(WorkItemCommitRequest request, Consumer onDone) { - this.request = request; - this.onDone = onDone; - } - } + @Override + public void shutdown() {} + @Override + public RequestBatcher batcher() { + return new RequestBatcher() { final List requests = new ArrayList<>(); @Override @@ -427,11 +441,21 @@ public void flush() { } requests.clear(); } + + class RequestAndDone { + final Consumer onDone; + final WorkItemCommitRequest request; + + RequestAndDone(WorkItemCommitRequest request, Consumer onDone) { + this.request = request; + this.onDone = onDone; + } + } }; } @Override - public void close() {} + public void halfClose() {} @Override public boolean awaitTermination(int time, TimeUnit unit) { @@ -523,27 +547,13 @@ public ArrayList getStatsReceived() { } @Override - public void setWindmillServiceEndpoints(Set endpoints) { - synchronized (this) { - this.dispatcherEndpoints = ImmutableSet.copyOf(endpoints); - isReady = true; - } + public synchronized ImmutableSet getWindmillServiceEndpoints() { + return dispatcherEndpoints; } @Override - public ImmutableSet getWindmillServiceEndpoints() { - synchronized (this) { - return dispatcherEndpoints; - } - } - - @Override - public boolean isReady() { - return isReady; - } - - public void setIsReady(boolean ready) { - this.isReady = ready; + public synchronized void setWindmillServiceEndpoints(Set endpoints) { + this.dispatcherEndpoints = ImmutableSet.copyOf(endpoints); } public static class ResponseQueue { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 52bc61e59919..d16ed2942fd9 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -126,6 +126,8 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.Timer.Type; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WatermarkHold; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.Coder.Context; import org.apache.beam.sdk.coders.CollectionCoder; @@ -330,9 +332,7 @@ private static ExecutableWork createMockWork( .build(), Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( - computationId, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + computationId, new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()), processWorkFn); @@ -549,7 +549,6 @@ private Windmill.GetWorkResponse buildSessionInput( List inputs, List timers) throws Exception { - // Windmill.GetWorkResponse.Builder builder = Windmill.GetWorkResponse.newBuilder(); Windmill.WorkItem.Builder builder = Windmill.WorkItem.newBuilder(); builder.setKey(DEFAULT_KEY_BYTES); builder.setShardingKey(DEFAULT_SHARDING_KEY); @@ -849,7 +848,7 @@ private StreamingDataflowWorker makeWorker( streamingDataflowWorkerTestParams.clock(), streamingDataflowWorkerTestParams.executorSupplier(), streamingDataflowWorkerTestParams.localRetryTimeoutMs(), - streamingDataflowWorkerTestParams.maxWorkItemCommitBytes()); + streamingDataflowWorkerTestParams.operationalLimits()); this.computationStateCache = worker.getComputationStateCache(); return worker; } @@ -888,7 +887,6 @@ private void runTestBasic(int numCommitThreads) throws Exception { makeSourceInstruction(StringUtf8Coder.of()), makeSinkInstruction(StringUtf8Coder.of(), 0)); - server.setIsReady(false); StreamingConfigTask streamingConfig = new StreamingConfigTask(); streamingConfig.setStreamingComputationConfigs( ImmutableList.of(makeDefaultStreamingComputationConfig(instructions))); @@ -936,8 +934,6 @@ public void testHotKeyLogging() throws Exception { makeSourceInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())), makeSinkInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), 0)); - server.setIsReady(false); - StreamingConfigTask streamingConfig = new StreamingConfigTask(); streamingConfig.setStreamingComputationConfigs( ImmutableList.of(makeDefaultStreamingComputationConfig(instructions))); @@ -975,8 +971,6 @@ public void testHotKeyLoggingNotEnabled() throws Exception { makeSourceInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())), makeSinkInstruction(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), 0)); - server.setIsReady(false); - StreamingConfigTask streamingConfig = new StreamingConfigTask(); streamingConfig.setStreamingComputationConfigs( ImmutableList.of(makeDefaultStreamingComputationConfig(instructions))); @@ -1216,7 +1210,8 @@ public void 
testKeyCommitTooLargeException() throws Exception { makeWorker( defaultWorkerParams() .setInstructions(instructions) - .setMaxWorkItemCommitBytes(1000) + .setOperationalLimits( + OperationalLimits.builder().setMaxWorkItemCommitBytes(1000).build()) .publishCounters() .build()); worker.start(); @@ -1271,6 +1266,80 @@ public void testKeyCommitTooLargeException() throws Exception { assertTrue(foundErrors); } + @Test + public void testOutputKeyTooLargeException() throws Exception { + KvCoder kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()); + + List instructions = + Arrays.asList( + makeSourceInstruction(kvCoder), + makeDoFnInstruction(new ExceptionCatchingFn(), 0, kvCoder), + makeSinkInstruction(kvCoder, 1)); + + server.setExpectedExceptionCount(1); + + StreamingDataflowWorker worker = + makeWorker( + defaultWorkerParams() + .setInstructions(instructions) + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputKeyBytes(15) + .setThrowExceptionOnLargeOutput(true) + .build()) + .build()); + worker.start(); + + // This large key will cause the ExceptionCatchingFn to throw an exception, which will then + // cause it to output a smaller key. + String bigKey = "some_much_too_large_output_key"; + server.whenGetWorkCalled().thenReturn(makeInput(1, 0, bigKey, DEFAULT_SHARDING_KEY)); + server.waitForEmptyWorkQueue(); + + Map result = server.waitForAndGetCommits(1); + assertEquals(1, result.size()); + assertEquals( + makeExpectedOutput(1, 0, bigKey, DEFAULT_SHARDING_KEY, "smaller_key").build(), + removeDynamicFields(result.get(1L))); + } + + @Test + public void testOutputValueTooLargeException() throws Exception { + KvCoder kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()); + + List instructions = + Arrays.asList( + makeSourceInstruction(kvCoder), + makeDoFnInstruction(new ExceptionCatchingFn(), 0, kvCoder), + makeSinkInstruction(kvCoder, 1)); + + server.setExpectedExceptionCount(1); + + StreamingDataflowWorker worker = + makeWorker( + defaultWorkerParams() + .setInstructions(instructions) + .setOperationalLimits( + OperationalLimits.builder() + .setMaxOutputValueBytes(15) + .setThrowExceptionOnLargeOutput(true) + .build()) + .build()); + worker.start(); + + // The first time processing will have value "data1_a_bunch_more_data_output", which is above + // the limit. After throwing the exception, the output should be just "data1", which is small + // enough. 
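Aside (illustrative only): these output-limit tests and the earlier commit-size test all flow through the new OperationalLimits builder; a combined configuration would look roughly like this, with the numeric values chosen arbitrarily:

    OperationalLimits limits =
        OperationalLimits.builder()
            .setMaxWorkItemCommitBytes(1000)
            .setMaxOutputKeyBytes(15)
            .setMaxOutputValueBytes(15)
            .setThrowExceptionOnLargeOutput(true)
            .build();

The value is handed to the worker through defaultWorkerParams().setOperationalLimits(...), replacing the removed setMaxWorkItemCommitBytes(int) test parameter.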
+ server.whenGetWorkCalled().thenReturn(makeInput(1, 0, "key", DEFAULT_SHARDING_KEY)); + server.waitForEmptyWorkQueue(); + + Map result = server.waitForAndGetCommits(1); + assertEquals(1, result.size()); + assertEquals( + makeExpectedOutput(1, 0, "key", DEFAULT_SHARDING_KEY, "smaller_key").build(), + removeDynamicFields(result.get(1L))); + } + @Test public void testKeyChange() throws Exception { KvCoder kvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()); @@ -2113,7 +2182,7 @@ public void testMergeWindowsCaching() throws Exception { // No input messages assertEquals(0L, splitIntToLong(getCounter(counters, "WindmillShuffleBytesRead").getInteger())); - CacheStats stats = worker.stateCache.getCacheStats(); + CacheStats stats = worker.getStateCacheStats(); LOG.info("cache stats {}", stats); assertEquals(1, stats.hitCount()); assertEquals(4, stats.missCount()); @@ -3410,8 +3479,9 @@ public void testLatencyAttributionProtobufsPopulated() { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + new FakeGetDataClient(), + ignored -> {}, + mock(HeartbeatSender.class)), clock, Collections.emptyList()); @@ -3428,7 +3498,7 @@ public void testLatencyAttributionProtobufsPopulated() { clock.sleep(Duration.millis(60)); Iterator it = - work.getLatencyAttributions(false, DataflowExecutionStateSampler.instance()).iterator(); + work.getLatencyAttributions(DataflowExecutionStateSampler.instance()).iterator(); assertTrue(it.hasNext()); LatencyAttribution lat = it.next(); assertSame(State.QUEUED, lat.getState()); @@ -3713,7 +3783,7 @@ public void testDoFnActiveMessageMetadataReportedOnHeartbeat() throws Exception Map result = server.waitForAndGetCommits(1); assertThat(server.numGetDataRequests(), greaterThan(0)); - Windmill.GetDataRequest heartbeat = server.getGetDataRequests().get(2); + Windmill.GetDataRequest heartbeat = server.getGetDataRequests().get(1); for (LatencyAttribution la : heartbeat @@ -4021,6 +4091,18 @@ public void processElement(ProcessContext c) { } } + static class ExceptionCatchingFn extends DoFn, KV> { + + @ProcessElement + public void processElement(ProcessContext c) { + try { + c.output(KV.of(c.element().getKey(), c.element().getValue() + "_a_bunch_more_data_output")); + } catch (Exception e) { + c.output(KV.of("smaller_key", c.element().getValue())); + } + } + } + static class ChangeKeysFn extends DoFn, KV> { @ProcessElement @@ -4433,7 +4515,7 @@ private static StreamingDataflowWorkerTestParams.Builder builder() { .setLocalRetryTimeoutMs(-1) .setPublishCounters(false) .setClock(Instant::now) - .setMaxWorkItemCommitBytes(Integer.MAX_VALUE); + .setOperationalLimits(OperationalLimits.builder().build()); } abstract ImmutableMap stateNameMappings(); @@ -4450,7 +4532,7 @@ private static StreamingDataflowWorkerTestParams.Builder builder() { abstract int localRetryTimeoutMs(); - abstract int maxWorkItemCommitBytes(); + abstract OperationalLimits operationalLimits(); @AutoValue.Builder abstract static class Builder { @@ -4484,7 +4566,7 @@ final Builder publishCounters() { abstract Builder setLocalRetryTimeoutMs(int value); - abstract Builder setMaxWorkItemCommitBytes(int maxWorkItemCommitBytes); + abstract Builder setOperationalLimits(OperationalLimits operationalLimits); abstract StreamingDataflowWorkerTestParams build(); } diff --git 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java index 6c46bda5acfe..8445e8ede852 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java @@ -24,6 +24,7 @@ import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; import com.google.api.services.dataflow.model.CounterMetadata; import com.google.api.services.dataflow.model.CounterStructuredName; @@ -60,8 +61,10 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.options.PipelineOptionsFactory; @@ -82,7 +85,6 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.mockito.Mock; -import org.mockito.Mockito; import org.mockito.MockitoAnnotations; /** Tests for {@link StreamingModeExecutionContext}. 
*/ @@ -133,9 +135,7 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm workItem, watermarks, Work.createProcessingContext( - COMPUTATION_ID, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + COMPUTATION_ID, new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -157,6 +157,7 @@ public void testTimerInternalsSetTimer() { Watermarks.builder().setInputDataWatermark(new Instant(1000)).build()), stateReader, sideInputStateFetcher, + OperationalLimits.builder().build(), outputBuilder); TimerInternals timerInternals = stepContext.timerInternals(); @@ -206,6 +207,7 @@ public void testTimerInternalsProcessingTimeSkew() { Watermarks.builder().setInputDataWatermark(new Instant(1000)).build()), stateReader, sideInputStateFetcher, + OperationalLimits.builder().build(), outputBuilder); TimerInternals timerInternals = stepContext.timerInternals(); assertTrue(timerTimestamp.isBefore(timerInternals.currentProcessingTime())); @@ -241,8 +243,8 @@ public void testSideInputReaderReconstituted() { @Test public void extractMsecCounters() { - MetricsContainer metricsContainer = Mockito.mock(MetricsContainer.class); - ProfileScope profileScope = Mockito.mock(ProfileScope.class); + MetricsContainer metricsContainer = mock(MetricsContainer.class); + ProfileScope profileScope = mock(ProfileScope.class); ExecutionState start1 = executionContext.executionStateRegistry.getState( NameContext.create("stage", "original-1", "system-1", "user-1"), diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java index d128255cd237..2d5a8d8266ae 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java @@ -292,7 +292,6 @@ public void testStringSetUpdateExtraction() { .setCumulative(false) .setStringList(new StringList().setElements(Arrays.asList("ab", "cd", "ef", "gh"))); - ((StreamingStepMetricsContainer) c1).populateStringSetUpdates = true; Iterable updates = StreamingStepMetricsContainer.extractMetricUpdates(registry); assertThat(updates, containsInAnyOrder(name1Update)); @@ -315,7 +314,6 @@ public void testStringSetUpdateExtraction() { .setCumulative(false) .setStringList(new StringList().setElements(Arrays.asList("ij", "kl", "mn"))); - ((StreamingStepMetricsContainer) c2).populateStringSetUpdates = true; updates = StreamingStepMetricsContainer.extractMetricUpdates(registry); assertThat(updates, containsInAnyOrder(name1Update, name2Update)); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java index 5d8ebd53400c..5c149a65f4ce 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSourcesTest.java @@ -95,8 +95,10 @@ import 
org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader; import org.apache.beam.runners.dataflow.worker.util.common.worker.NativeReader.NativeReaderIterator; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.BigEndianIntegerCoder; import org.apache.beam.sdk.coders.Coder; @@ -197,9 +199,7 @@ private static Work createMockWork(Windmill.WorkItem workItem, Watermarks waterm workItem, watermarks, Work.createProcessingContext( - COMPUTATION_ID, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + COMPUTATION_ID, new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -634,6 +634,7 @@ public void testReadUnboundedReader() throws Exception { Watermarks.builder().setInputDataWatermark(new Instant(0)).build()), mock(WindmillStateReader.class), mock(SideInputStateFetcher.class), + OperationalLimits.builder().build(), Windmill.WorkItemCommitRequest.newBuilder()); @SuppressWarnings({"unchecked", "rawtypes"}) @@ -999,8 +1000,9 @@ public void testFailedWorkItemsAbort() throws Exception { Watermarks.builder().setInputDataWatermark(new Instant(0)).build(), Work.createProcessingContext( COMPUTATION_ID, - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - gnored -> {}), + new FakeGetDataClient(), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); context.start( @@ -1008,6 +1010,7 @@ public void testFailedWorkItemsAbort() throws Exception { dummyWork, mock(WindmillStateReader.class), mock(SideInputStateFetcher.class), + OperationalLimits.builder().build(), Windmill.WorkItemCommitRequest.newBuilder()); @SuppressWarnings({"unchecked", "rawtypes"}) diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java index 3a3e0a34c217..a373dffd1dc4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java @@ -18,7 +18,6 @@ package org.apache.beam.runners.dataflow.worker.streaming; import static com.google.common.truth.Truth.assertThat; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList.toImmutableList; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; @@ -26,20 +25,18 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; -import com.google.auto.value.AutoValue; import java.util.Collections; import java.util.Deque; import java.util.HashMap; import java.util.Map; import java.util.Optional; -import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ActiveWorkState.ActivateWorkResult; 
import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Instant; import org.junit.Before; import org.junit.Rule; @@ -85,9 +82,7 @@ private static ExecutableWork expiredWork(Windmill.WorkItem workItem) { private static Work.ProcessingContext createWorkProcessingContext() { return Work.createProcessingContext( - "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}); + "computationId", new FakeGetDataClient(), ignored -> {}, mock(HeartbeatSender.class)); } private static WorkId workId(long workToken, long cacheToken) { @@ -447,70 +442,4 @@ public void testActivateWorkForKey_matchingCacheTokens_newWorkTokenLesser_STALE( assertFalse(readOnlyActiveWork.get(shardedKey).contains(newWork)); assertEquals(queuedWork, readOnlyActiveWork.get(shardedKey).peek()); } - - @Test - public void testGetKeyHeartbeats() { - Instant refreshDeadline = Instant.now(); - ShardedKey shardedKey1 = shardedKey("someKey", 1L); - ShardedKey shardedKey2 = shardedKey("anotherKey", 2L); - - ExecutableWork freshWork = createWork(createWorkItem(3L, 3L, shardedKey1)); - ExecutableWork refreshableWork1 = expiredWork(createWorkItem(1L, 1L, shardedKey1)); - refreshableWork1.work().setState(Work.State.COMMITTING); - ExecutableWork refreshableWork2 = expiredWork(createWorkItem(2L, 2L, shardedKey2)); - refreshableWork2.work().setState(Work.State.COMMITTING); - - activeWorkState.activateWorkForKey(refreshableWork1); - activeWorkState.activateWorkForKey(freshWork); - activeWorkState.activateWorkForKey(refreshableWork2); - - ImmutableList requests = - activeWorkState.getKeyHeartbeats(refreshDeadline, DataflowExecutionStateSampler.instance()); - - ImmutableList expected = - ImmutableList.of( - HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from( - shardedKey1, refreshableWork1.work()), - HeartbeatRequestShardingKeyWorkTokenAndCacheToken.from( - shardedKey2, refreshableWork2.work())); - - ImmutableList actual = - requests.stream() - .map(HeartbeatRequestShardingKeyWorkTokenAndCacheToken::from) - .collect(toImmutableList()); - - assertThat(actual).containsExactlyElementsIn(expected); - } - - @AutoValue - abstract static class HeartbeatRequestShardingKeyWorkTokenAndCacheToken { - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken create( - long shardingKey, long workToken, long cacheToken) { - return new AutoValue_ActiveWorkStateTest_HeartbeatRequestShardingKeyWorkTokenAndCacheToken( - shardingKey, workToken, cacheToken); - } - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( - HeartbeatRequest heartbeatRequest) { - return create( - heartbeatRequest.getShardingKey(), - heartbeatRequest.getWorkToken(), - heartbeatRequest.getCacheToken()); - } - - private static HeartbeatRequestShardingKeyWorkTokenAndCacheToken from( - ShardedKey shardedKey, Work work) { - return create( - shardedKey.shardingKey(), - work.getWorkItem().getWorkToken(), - 
work.getWorkItem().getCacheToken()); - } - - abstract long shardingKey(); - - abstract long workToken(); - - abstract long cacheToken(); - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java index 3c1683ecf436..1f70c2476325 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationStateCacheTest.java @@ -36,8 +36,10 @@ import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.fn.IdGenerators; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -68,8 +70,9 @@ private static ExecutableWork createWork(ShardedKey shardedKey, long workToken, Watermarks.builder().setInputDataWatermark(Instant.now()).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + new FakeGetDataClient(), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()), ignored -> {}); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java similarity index 90% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java index bc3afaff1b38..aaa71b6598ea 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -33,13 +33,13 @@ import java.util.Comparator; import java.util.HashSet; import java.util.List; -import java.util.Optional; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillMetadataServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; @@ -48,12 +48,16 @@ import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory; import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetDistributor; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetSpender; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessSocketAddress; @@ -75,7 +79,7 @@ import org.junit.runners.JUnit4; @RunWith(JUnit4.class) -public class StreamingEngineClientTest { +public class FanOutStreamingEngineWorkerHarnessTest { private static final WindmillServiceAddress DEFAULT_WINDMILL_SERVICE_ADDRESS = WindmillServiceAddress.create(HostAndPort.fromParts(WindmillChannelFactory.LOCALHOST, 443)); private static final ImmutableMap DEFAULT = @@ -97,7 +101,6 @@ public class StreamingEngineClientTest { .build(); @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private final MutableHandlerRegistry serviceRegistry = new MutableHandlerRegistry(); private final GrpcWindmillStreamFactory streamFactory = spy(GrpcWindmillStreamFactory.of(JOB_HEADER).build()); @@ -109,18 +112,14 @@ public class StreamingEngineClientTest { private final GrpcDispatcherClient dispatcherClient = GrpcDispatcherClient.forTesting( stubFactory, new ArrayList<>(), new ArrayList<>(), new HashSet<>()); - + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private Server fakeStreamingEngineServer; private CountDownLatch getWorkerMetadataReady; private GetWorkerMetadataTestStub fakeGetWorkerMetadataStub; - 
private StreamingEngineClient streamingEngineClient; + private FanOutStreamingEngineWorkerHarness fanOutStreamingEngineWorkProvider; private static WorkItemScheduler noOpProcessWorkItemFn() { - return (workItem, - watermarks, - processingContext, - ackWorkItemQueued, - getWorkStreamLatencies) -> {}; + return (workItem, watermarks, processingContext, getWorkStreamLatencies) -> {}; } private static GetWorkRequest getWorkRequest(long items, long bytes) { @@ -163,16 +162,16 @@ public void setUp() throws IOException { @After public void cleanUp() { - Preconditions.checkNotNull(streamingEngineClient).finish(); + Preconditions.checkNotNull(fanOutStreamingEngineWorkProvider).shutdown(); fakeStreamingEngineServer.shutdownNow(); stubFactory.shutdown(); } - private StreamingEngineClient newStreamingEngineClient( + private FanOutStreamingEngineWorkerHarness newStreamingEngineClient( GetWorkBudget getWorkBudget, GetWorkBudgetDistributor getWorkBudgetDistributor, WorkItemScheduler workItemScheduler) { - return StreamingEngineClient.forTesting( + return FanOutStreamingEngineWorkerHarness.forTesting( JOB_HEADER, getWorkBudget, streamFactory, @@ -181,7 +180,8 @@ private StreamingEngineClient newStreamingEngineClient( getWorkBudgetDistributor, dispatcherClient, CLIENT_ID, - ignored -> mock(WorkCommitter.class)); + ignored -> mock(WorkCommitter.class), + new ThrottlingGetDataMetricTracker(mock(MemoryMonitor.class))); } @Test @@ -193,7 +193,7 @@ public void testStreamsStartCorrectly() throws InterruptedException { TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(numBudgetDistributionsExpected)); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(items).setBytes(bytes).build(), getWorkBudgetDistributor, @@ -215,15 +215,13 @@ public void testStreamsStartCorrectly() throws InterruptedException { waitForWorkerMetadataToBeConsumed(getWorkBudgetDistributor); StreamingEngineConnectionState currentConnections = - streamingEngineClient.getCurrentConnections(); + fanOutStreamingEngineWorkProvider.getCurrentConnections(); assertEquals(2, currentConnections.windmillConnections().size()); assertEquals(2, currentConnections.windmillStreams().size()); Set workerTokens = currentConnections.windmillConnections().values().stream() .map(WindmillConnection::backendWorkerToken) - .filter(Optional::isPresent) - .map(Optional::get) .collect(Collectors.toSet()); assertTrue(workerTokens.contains(workerToken)); @@ -235,7 +233,13 @@ public void testStreamsStartCorrectly() throws InterruptedException { verify(streamFactory, times(2)) .createDirectGetWorkStream( - any(), eq(getWorkRequest(0, 0)), any(), any(), any(), eq(noOpProcessWorkItemFn())); + any(), + eq(getWorkRequest(0, 0)), + any(), + any(), + any(), + any(), + eq(noOpProcessWorkItemFn())); verify(streamFactory, times(2)).createGetDataStream(any(), any()); verify(streamFactory, times(2)).createCommitWorkStream(any(), any()); @@ -245,7 +249,7 @@ public void testStreamsStartCorrectly() throws InterruptedException { public void testScheduledBudgetRefresh() throws InterruptedException { TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(2)); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(1L).setBytes(1L).build(), getWorkBudgetDistributor, @@ -268,7 +272,7 @@ public void testOnNewWorkerMetadata_correctlyRemovesStaleWindmillServers() int metadataCount = 
2; TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(metadataCount)); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(1).setBytes(1).build(), getWorkBudgetDistributor, @@ -306,14 +310,13 @@ public void testOnNewWorkerMetadata_correctlyRemovesStaleWindmillServers() fakeGetWorkerMetadataStub.injectWorkerMetadata(secondWorkerMetadata); waitForWorkerMetadataToBeConsumed(getWorkBudgetDistributor); StreamingEngineConnectionState currentConnections = - streamingEngineClient.getCurrentConnections(); + fanOutStreamingEngineWorkProvider.getCurrentConnections(); assertEquals(1, currentConnections.windmillConnections().size()); assertEquals(1, currentConnections.windmillStreams().size()); Set workerTokens = - streamingEngineClient.getCurrentConnections().windmillConnections().values().stream() + fanOutStreamingEngineWorkProvider.getCurrentConnections().windmillConnections().values() + .stream() .map(WindmillConnection::backendWorkerToken) - .filter(Optional::isPresent) - .map(Optional::get) .collect(Collectors.toSet()); assertFalse(workerTokens.contains(workerToken)); @@ -359,7 +362,7 @@ public void testOnNewWorkerMetadata_redistributesBudget() throws InterruptedExce TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(workerMetadataResponses.size())); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(1).setBytes(1).build(), getWorkBudgetDistributor, @@ -436,8 +439,8 @@ private void waitForBudgetDistribution() throws InterruptedException { } @Override - public void distributeBudget( - ImmutableCollection streams, GetWorkBudget getWorkBudget) { + public void distributeBudget( + ImmutableCollection streams, GetWorkBudget getWorkBudget) { streams.forEach(stream -> stream.adjustBudget(getWorkBudget.items(), getWorkBudget.bytes())); getWorkBudgetDistributorTriggered.countDown(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSenderTest.java similarity index 81% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSenderTest.java index 162c69509ae1..dc6cc5641055 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSenderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -27,13 +27,15 @@ import static org.mockito.Mockito.when; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; @@ -63,10 +65,10 @@ public class WindmillStreamSenderTest { .build()) .build()); private final WorkItemScheduler workItemScheduler = - (workItem, watermarks, processingContext, ackWorkItemQueued, getWorkStreamLatencies) -> {}; + (workItem, watermarks, processingContext, getWorkStreamLatencies) -> {}; @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private ManagedChannel inProcessChannel; - private CloudWindmillServiceV1Alpha1Stub stub; + private WindmillConnection connection; @Before public void setUp() { @@ -74,7 +76,10 @@ public void setUp() { grpcCleanup.register( InProcessChannelBuilder.forName("WindmillStreamSenderTest").directExecutor().build()); grpcCleanup.register(inProcessChannel); - stub = CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel); + connection = + WindmillConnection.builder() + .setStub(CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel)) + .build(); } @After @@ -95,7 +100,7 @@ public void testStartStream_startsAllStreams() { verify(streamFactory) .createDirectGetWorkStream( - eq(stub), + eq(connection), eq( GET_WORK_REQUEST .toBuilder() @@ -105,10 +110,11 @@ public void testStartStream_startsAllStreams() { any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler)); - verify(streamFactory).createGetDataStream(eq(stub), any(ThrottleTimer.class)); - verify(streamFactory).createCommitWorkStream(eq(stub), any(ThrottleTimer.class)); + verify(streamFactory).createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); + verify(streamFactory).createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class)); } @Test @@ -126,7 +132,7 @@ public void testStartStream_onlyStartsStreamsOnce() { verify(streamFactory, times(1)) .createDirectGetWorkStream( - eq(stub), + eq(connection), eq( GET_WORK_REQUEST .toBuilder() @@ -136,10 +142,13 @@ public void testStartStream_onlyStartsStreamsOnce() { any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler)); - verify(streamFactory, 
times(1)).createGetDataStream(eq(stub), any(ThrottleTimer.class)); - verify(streamFactory, times(1)).createCommitWorkStream(eq(stub), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class)); } @Test @@ -160,7 +169,7 @@ public void testStartStream_onlyStartsStreamsOnceConcurrent() throws Interrupted verify(streamFactory, times(1)) .createDirectGetWorkStream( - eq(stub), + eq(connection), eq( GET_WORK_REQUEST .toBuilder() @@ -170,10 +179,13 @@ public void testStartStream_onlyStartsStreamsOnceConcurrent() throws Interrupted any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler)); - verify(streamFactory, times(1)).createGetDataStream(eq(stub), any(ThrottleTimer.class)); - verify(streamFactory, times(1)).createCommitWorkStream(eq(stub), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class)); + verify(streamFactory, times(1)) + .createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class)); } @Test @@ -198,17 +210,18 @@ public void testCloseAllStreams_closesAllStreams() { CommitWorkStream mockCommitWorkStream = mock(CommitWorkStream.class); when(mockStreamFactory.createDirectGetWorkStream( - eq(stub), + eq(connection), eq(getWorkRequestWithBudget), any(ThrottleTimer.class), any(), any(), + any(), eq(workItemScheduler))) .thenReturn(mockGetWorkStream); - when(mockStreamFactory.createGetDataStream(eq(stub), any(ThrottleTimer.class))) + when(mockStreamFactory.createGetDataStream(eq(connection.stub()), any(ThrottleTimer.class))) .thenReturn(mockGetDataStream); - when(mockStreamFactory.createCommitWorkStream(eq(stub), any(ThrottleTimer.class))) + when(mockStreamFactory.createCommitWorkStream(eq(connection.stub()), any(ThrottleTimer.class))) .thenReturn(mockCommitWorkStream); WindmillStreamSender windmillStreamSender = @@ -219,9 +232,9 @@ public void testCloseAllStreams_closesAllStreams() { windmillStreamSender.startStreams(); windmillStreamSender.closeAllStreams(); - verify(mockGetWorkStream).close(); - verify(mockGetDataStream).close(); - verify(mockCommitWorkStream).close(); + verify(mockGetWorkStream).shutdown(); + verify(mockGetDataStream).shutdown(); + verify(mockCommitWorkStream).shutdown(); } private WindmillStreamSender newWindmillStreamSender(GetWorkBudget budget) { @@ -231,11 +244,12 @@ private WindmillStreamSender newWindmillStreamSender(GetWorkBudget budget) { private WindmillStreamSender newWindmillStreamSender( GetWorkBudget budget, GrpcWindmillStreamFactory streamFactory) { return WindmillStreamSender.create( - stub, + connection, GET_WORK_REQUEST, budget, streamFactory, workItemScheduler, + ignored -> mock(GetDataClient.class), ignored -> mock(WorkCommitter.class)); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java index ad2ac6baeabb..24a93f58b12a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java @@ -33,8 +33,8 @@ import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; -import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.ListCoder; import org.apache.beam.sdk.coders.StringUtf8Coder; @@ -67,13 +67,46 @@ @SuppressWarnings("deprecation") @RunWith(JUnit4.class) public class SideInputStateFetcherTest { - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private static final String STATE_FAMILY = "state"; - - @Mock private MetricTrackingWindmillServerStub server; + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); + @Mock private GetDataClient server; @Mock private Supplier readStateSupplier; + private static Windmill.GlobalData buildGlobalDataResponse( + String tag, boolean isReady, ByteString data) { + Windmill.GlobalData.Builder builder = + Windmill.GlobalData.newBuilder() + .setDataId( + Windmill.GlobalDataId.newBuilder() + .setTag(tag) + .setVersion(ByteString.EMPTY) + .build()); + + if (isReady) { + builder.setIsReady(true).setData(data); + } else { + builder.setIsReady(false); + } + return builder.build(); + } + + private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString version) { + Windmill.GlobalDataId id = + Windmill.GlobalDataId.newBuilder().setTag(tag).setVersion(version).build(); + + return Windmill.GlobalDataRequest.newBuilder() + .setDataId(id) + .setStateFamily(STATE_FAMILY) + .setExistenceWatermarkDeadline( + TimeUnit.MILLISECONDS.toMicros(GlobalWindow.INSTANCE.maxTimestamp().getMillis())) + .build(); + } + + private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag) { + return buildGlobalDataRequest(tag, ByteString.EMPTY); + } + @Before public void setUp() { MockitoAnnotations.initMocks(this); @@ -81,10 +114,10 @@ public void setUp() { @Test public void testFetchGlobalDataBasic() throws Exception { - SideInputStateFetcher fetcher = - new SideInputStateFetcher( - server::getSideInputData, + SideInputStateFetcherFactory factory = + SideInputStateFetcherFactory.fromOptions( PipelineOptionsFactory.as(DataflowStreamingPipelineOptions.class)); + SideInputStateFetcher fetcher = factory.createSideInputStateFetcher(server::getSideInputData); ByteStringOutputStream stream = new ByteStringOutputStream(); ListCoder.of(StringUtf8Coder.of()) @@ -152,10 +185,10 @@ public void testFetchGlobalDataBasic() throws Exception { @Test public void testFetchGlobalDataNull() throws Exception { - SideInputStateFetcher fetcher = - new SideInputStateFetcher( - server::getSideInputData, + SideInputStateFetcherFactory factory = + SideInputStateFetcherFactory.fromOptions( PipelineOptionsFactory.as(DataflowStreamingPipelineOptions.class)); + SideInputStateFetcher fetcher = factory.createSideInputStateFetcher(server::getSideInputData); ByteStringOutputStream stream = new ByteStringOutputStream(); ListCoder.of(VoidCoder.of()) @@ -311,10 +344,10 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { @Test public void testEmptyFetchGlobalData() { - SideInputStateFetcher fetcher = - new SideInputStateFetcher( - 
server::getSideInputData, + SideInputStateFetcherFactory factory = + SideInputStateFetcherFactory.fromOptions( PipelineOptionsFactory.as(DataflowStreamingPipelineOptions.class)); + SideInputStateFetcher fetcher = factory.createSideInputStateFetcher(server::getSideInputData); ByteString encodedIterable = ByteString.EMPTY; @@ -346,38 +379,4 @@ public void testEmptyFetchGlobalData() { verify(server).getSideInputData(buildGlobalDataRequest(tag)); verifyNoMoreInteractions(server); } - - private static Windmill.GlobalData buildGlobalDataResponse( - String tag, boolean isReady, ByteString data) { - Windmill.GlobalData.Builder builder = - Windmill.GlobalData.newBuilder() - .setDataId( - Windmill.GlobalDataId.newBuilder() - .setTag(tag) - .setVersion(ByteString.EMPTY) - .build()); - - if (isReady) { - builder.setIsReady(true).setData(data); - } else { - builder.setIsReady(false); - } - return builder.build(); - } - - private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString version) { - Windmill.GlobalDataId id = - Windmill.GlobalDataId.newBuilder().setTag(tag).setVersion(version).build(); - - return Windmill.GlobalDataRequest.newBuilder() - .setDataId(id) - .setStateFamily(STATE_FAMILY) - .setExistenceWatermarkDeadline( - TimeUnit.MILLISECONDS.toMicros(GlobalWindow.INSTANCE.maxTimestamp().getMillis())) - .build(); - } - - private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag) { - return buildGlobalDataRequest(tag, ByteString.EMPTY); - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java index e08c951975fa..ad77958837a1 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; import java.util.Collections; import java.util.concurrent.CountDownLatch; @@ -31,6 +32,8 @@ import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Instant; @@ -65,24 +68,23 @@ private static ExecutableWork createWork(Consumer executeWorkFn) { Watermarks.builder().setInputDataWatermark(Instant.now()).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + new FakeGetDataClient(), + ignored -> {}, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()), executeWorkFn); } private Runnable createSleepProcessWorkFn(CountDownLatch start, CountDownLatch stop) { - Runnable runnable = - () -> { - start.countDown(); - try { - stop.await(); - } catch 
(Exception e) { - throw new RuntimeException(e); - } - }; - return runnable; + return () -> { + start.countDown(); + try { + stop.await(); + } catch (Exception e) { + throw new RuntimeException(e); + } + }; } @Before diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java index 2421d7faf824..8c6a003cb72b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java @@ -42,7 +42,6 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.util.common.Reiterator; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.checkerframework.checker.nullness.qual.Nullable; import org.junit.After; @@ -130,10 +129,10 @@ private void setCurrentExecutionState(String mockOriginalName) { private static ShuffleEntry shuffleEntry(String key, String value) { return new ShuffleEntry( /* use key itself as position */ - ByteArrayShufflePosition.of(key.getBytes(Charsets.UTF_8)), - ByteString.copyFrom(key.getBytes(Charsets.UTF_8)), + ByteArrayShufflePosition.of(key.getBytes(StandardCharsets.UTF_8)), + ByteString.copyFrom(key.getBytes(StandardCharsets.UTF_8)), ByteString.copyFrom(new byte[0]), - ByteString.copyFrom(value.getBytes(Charsets.UTF_8))); + ByteString.copyFrom(value.getBytes(StandardCharsets.UTF_8))); } @Test diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java index a2f5e71d04c3..bdad382c9af2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java @@ -38,12 +38,12 @@ @RunWith(JUnit4.class) public class WindmillStreamPoolTest { - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private static final int DEFAULT_NUM_STREAMS = 10; private static final int NEW_STREAM_HOLDS = 2; private final ConcurrentHashMap< TestWindmillStream, WindmillStreamPool.StreamData> holds = new ConcurrentHashMap<>(); + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private List> streams; @Before @@ -237,7 +237,7 @@ private TestWindmillStream(Instant startTime) { } @Override - public void close() { + public void halfClose() { closed = true; } @@ -250,5 +250,15 @@ public boolean awaitTermination(int time, TimeUnit unit) { public Instant startTime() { return startTime; } + + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() { + halfClose(); + } } } diff --git 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java index 85e07c3bd797..51cd83d17fab 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingApplianceWorkCommitterTest.java @@ -19,6 +19,7 @@ import static com.google.common.truth.Truth.assertThat; import static org.junit.Assert.assertNotNull; +import static org.mockito.Mockito.mock; import com.google.api.services.dataflow.model.MapTask; import com.google.common.truth.Correspondence; @@ -35,6 +36,8 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Instant; @@ -45,7 +48,6 @@ import org.junit.rules.ErrorCollector; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import org.mockito.Mockito; @RunWith(JUnit4.class) public class StreamingApplianceWorkCommitterTest { @@ -64,10 +66,11 @@ private static Work createMockWork(long workToken) { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + new FakeGetDataClient(), ignored -> { throw new UnsupportedOperationException(); - }), + }, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -76,7 +79,7 @@ private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, new MapTask().setSystemName("system").setStageName("stage"), - Mockito.mock(BoundedQueueExecutor.class), + mock(BoundedQueueExecutor.class), ImmutableMap.of(), null); } @@ -90,7 +93,7 @@ private StreamingApplianceWorkCommitter createWorkCommitter( public void setUp() { fakeWindmillServer = new FakeWindmillServer( - errorCollector, ignored -> Optional.of(Mockito.mock(ComputationState.class))); + errorCollector, ignored -> Optional.of(mock(ComputationState.class))); } @After diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java index d53690938aef..546a2883e3b2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/commits/StreamingEngineWorkCommitterTest.java @@ -21,6 +21,7 @@ import static 
org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitStatus.OK; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; import com.google.api.services.dataflow.model.MapTask; import java.io.IOException; @@ -49,24 +50,24 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.CloseableStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; import org.joda.time.Instant; -import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ErrorCollector; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import org.mockito.Mockito; @RunWith(JUnit4.class) public class StreamingEngineWorkCommitterTest { @Rule public ErrorCollector errorCollector = new ErrorCollector(); - private StreamingEngineWorkCommitter workCommitter; + private WorkCommitter workCommitter; private FakeWindmillServer fakeWindmillServer; private Supplier<CloseableStream<CommitWorkStream>> commitWorkStreamFactory; @@ -81,10 +82,11 @@ private static Work createMockWork(long workToken) { Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), + new FakeGetDataClient(), ignored -> { throw new UnsupportedOperationException(); - }), + }, + mock(HeartbeatSender.class)), Instant::now, Collections.emptyList()); } @@ -93,7 +95,7 @@ private static ComputationState createComputationState(String computationId) { return new ComputationState( computationId, new MapTask().setSystemName("system").setStageName("stage"), - Mockito.mock(BoundedQueueExecutor.class), + mock(BoundedQueueExecutor.class), ImmutableMap.of(), null); } @@ -110,21 +112,18 @@ private static CompleteCommit asCompleteCommit(Commit commit, Windmill.CommitSta public void setUp() throws IOException { fakeWindmillServer = new FakeWindmillServer( - errorCollector, ignored -> Optional.of(Mockito.mock(ComputationState.class))); + errorCollector, ignored -> Optional.of(mock(ComputationState.class))); commitWorkStreamFactory = WindmillStreamPool.create( 1, Duration.standardMinutes(1), fakeWindmillServer::commitWorkStream) ::getCloseableStream; } - @After - public void cleanUp() { - workCommitter.stop(); - } - - private StreamingEngineWorkCommitter createWorkCommitter( - Consumer<CompleteCommit> onCommitComplete) { - return StreamingEngineWorkCommitter.create(commitWorkStreamFactory, 1, onCommitComplete); + private WorkCommitter createWorkCommitter(Consumer<CompleteCommit> onCommitComplete) { + return StreamingEngineWorkCommitter.builder() + .setCommitWorkStreamFactory(commitWorkStreamFactory) + .setOnCommitComplete(onCommitComplete) + .build(); } @Test @@ -156,6 +155,8 @@ public void testCommit_sendsCommitsToStreamingEngine() { assertThat(request).isEqualTo(commit.request()); assertThat(completeCommits).contains(asCompleteCommit(commit, Windmill.CommitStatus.OK)); } + + workCommitter.stop(); } @Test @@ -196,6 +197,8 @@ public void testCommit_handlesFailedCommits() { 
.containsEntry(commit.work().getWorkItem().getWorkToken(), commit.request()); } } + + workCommitter.stop(); } @Test @@ -248,6 +251,8 @@ public void testCommit_handlesCompleteCommits_commitStatusNotOK() { .contains(asCompleteCommit(commit, expectedCommitStatus.get(commit.work().id()))); } assertThat(completeCommits.size()).isEqualTo(commits.size()); + + workCommitter.stop(); } @Test @@ -273,7 +278,7 @@ public void flush() {} } @Override - public void close() {} + public void halfClose() {} @Override public boolean awaitTermination(int time, TimeUnit unit) { @@ -284,6 +289,14 @@ public boolean awaitTermination(int time, TimeUnit unit) { public Instant startTime() { return Instant.now(); } + + @Override + public String backendWorkerToken() { + return ""; + } + + @Override + public void shutdown() {} }; commitWorkStreamFactory = @@ -328,7 +341,12 @@ public void testMultipleCommitSendersSingleStream() { ::getCloseableStream; Set<CompleteCommit> completeCommits = Collections.newSetFromMap(new ConcurrentHashMap<>()); workCommitter = - StreamingEngineWorkCommitter.create(commitWorkStreamFactory, 5, completeCommits::add); + StreamingEngineWorkCommitter.builder() + .setCommitWorkStreamFactory(commitWorkStreamFactory) + .setNumCommitSenders(5) + .setOnCommitComplete(completeCommits::add) + .build(); + List<Commit> commits = new ArrayList<>(); for (int i = 1; i <= 500; i++) { Work work = createMockWork(i); @@ -353,5 +371,7 @@ public void testMultipleCommitSendersSingleStream() { assertThat(request).isEqualTo(commit.request()); assertThat(completeCommits).contains(asCompleteCommit(commit, Windmill.CommitStatus.OK)); } + + workCommitter.stop(); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java new file mode 100644 index 000000000000..ca89e9647153 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/FakeGetDataClient.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import java.io.PrintWriter; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; + +/** Fake {@link GetDataClient} implementation for testing. 
*/ +public final class FakeGetDataClient implements GetDataClient { + @Override + public Windmill.KeyedGetDataResponse getStateData( + String computationId, Windmill.KeyedGetDataRequest request) throws GetDataException { + return Windmill.KeyedGetDataResponse.getDefaultInstance(); + } + + @Override + public Windmill.GlobalData getSideInputData(Windmill.GlobalDataRequest request) + throws GetDataException { + return Windmill.GlobalData.getDefaultInstance(); + } + + @Override + public void printHtml(PrintWriter writer) {} +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java new file mode 100644 index 000000000000..d687434edff4 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/getdata/ThrottlingGetDataMetricTrackerTest.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.getdata; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertFalse; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.mock; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +@SuppressWarnings("FutureReturnValueIgnored") +public class ThrottlingGetDataMetricTrackerTest { + + private final MemoryMonitor memoryMonitor = mock(MemoryMonitor.class); + private final ThrottlingGetDataMetricTracker getDataMetricTracker = + new ThrottlingGetDataMetricTracker(memoryMonitor); + private final ExecutorService getDataProcessor = Executors.newCachedThreadPool(); + + @Test + public void testTrackFetchStateDataWithThrottling() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. + } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(0); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); + } + + @Test + public void testTrackSideInputFetchWithThrottling() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = getDataMetricTracker.trackSideInputFetchWithThrottling()) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. 
+ } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsWhileProcessing.activeSideInputs()).isEqualTo(1); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsAfterProcessing.activeSideInputs()).isEqualTo(0); + } + + @Test + public void testThrottledTrackSingleCallWithThrottling() throws InterruptedException { + CountDownLatch mockThrottler = simulateMemoryPressure(); + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. + } + processingDone.countDown(); + }); + + assertFalse(callProcessing.await(10, TimeUnit.MILLISECONDS)); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsBeforeProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsBeforeProcessing.activeStateReads()).isEqualTo(0); + assertThat(metricsBeforeProcessing.activeHeartbeats()).isEqualTo(0); + assertThat(metricsBeforeProcessing.activeSideInputs()).isEqualTo(0); + + // Stop throttling. + mockThrottler.countDown(); + callProcessing.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + } + + @Test + public void testTrackSingleCall_exceptionThrown() throws InterruptedException { + doNothing().when(memoryMonitor).waitForResources(anyString()); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch beforeException = new CountDownLatch(1); + CountDownLatch afterException = new CountDownLatch(1); + + // Catch the exception outside the try-with-resources block to ensure that + // AutoCloseable.closed() runs in the midst of an exception. 
+ getDataProcessor.submit( + () -> { + try { + try (AutoCloseable ignored = getDataMetricTracker.trackStateDataFetchWithThrottling()) { + callProcessing.countDown(); + beforeException.await(); + throw new RuntimeException("something bad happened"); + } + } catch (RuntimeException e) { + afterException.countDown(); + throw e; + } + }); + + callProcessing.await(); + + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeStateReads()).isEqualTo(1); + beforeException.countDown(); + + // In the midst of an exception, close() should still run. + afterException.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeStateReads()).isEqualTo(0); + } + + @Test + public void testTrackHeartbeats() throws InterruptedException { + CountDownLatch processCall = new CountDownLatch(1); + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch processingDone = new CountDownLatch(1); + int numHeartbeats = 5; + getDataProcessor.submit( + () -> { + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(numHeartbeats)) { + callProcessing.countDown(); + processCall.await(); + } catch (Exception e) { + // Do nothing. + } + processingDone.countDown(); + }); + + callProcessing.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(5); + + // Free the thread inside the AutoCloseable, wait for processingDone and check that metrics gets + // decremented + processCall.countDown(); + processingDone.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + } + + @Test + public void testTrackHeartbeats_exceptionThrown() throws InterruptedException { + CountDownLatch callProcessing = new CountDownLatch(1); + CountDownLatch beforeException = new CountDownLatch(1); + CountDownLatch afterException = new CountDownLatch(1); + int numHeartbeats = 10; + // Catch the exception outside the try-with-resources block to ensure that + // AutoCloseable.closed() runs in the midst of an exception. + getDataProcessor.submit( + () -> { + try { + try (AutoCloseable ignored = getDataMetricTracker.trackHeartbeats(numHeartbeats)) { + callProcessing.countDown(); + beforeException.await(); + throw new RuntimeException("something bad happened"); + } + } catch (RuntimeException e) { + afterException.countDown(); + throw e; + } + }); + + callProcessing.await(); + + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsWhileProcessing = + getDataMetricTracker.getMetricsSnapshot(); + + assertThat(metricsWhileProcessing.activeHeartbeats()).isEqualTo(numHeartbeats); + beforeException.countDown(); + + // In the midst of an exception, close() should still run. + afterException.await(); + ThrottlingGetDataMetricTracker.ReadOnlySnapshot metricsAfterProcessing = + getDataMetricTracker.getMetricsSnapshot(); + assertThat(metricsAfterProcessing.activeHeartbeats()).isEqualTo(0); + } + + /** Have the memory monitor block when waitForResources is called simulating memory pressure. 
*/ + private CountDownLatch simulateMemoryPressure() { + CountDownLatch mockThrottler = new CountDownLatch(1); + doAnswer( + invocationOnMock -> { + mockThrottler.await(); + return null; + }) + .when(memoryMonitor) + .waitForResources(anyString()); + return mockThrottler; + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java index 96c675169a7d..d234cf424767 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java @@ -56,7 +56,8 @@ public void testRendersAllChannels() throws UnsupportedEncodingException { fakeWindmillServer.setWindmillServiceEndpoints( ImmutableSet.of(HostAndPort.fromHost(windmill1), HostAndPort.fromHost(windmill2))); options.setChannelzShowOnlyWindmillServiceChannels(false); - ChannelzServlet channelzServlet = new ChannelzServlet("/channelz", options, fakeWindmillServer); + ChannelzServlet channelzServlet = + new ChannelzServlet("/channelz", options, fakeWindmillServer::getWindmillServiceEndpoints); StringWriter stringWriter = new StringWriter(); PrintWriter writer = new PrintWriter(stringWriter); channelzServlet.captureData(writer); @@ -88,7 +89,8 @@ public void testRendersOnlyWindmillChannels() throws UnsupportedEncodingExceptio fakeWindmillServer.setWindmillServiceEndpoints( ImmutableSet.of(HostAndPort.fromHost(windmill1), HostAndPort.fromHost(windmill2))); options.setChannelzShowOnlyWindmillServiceChannels(true); - ChannelzServlet channelzServlet = new ChannelzServlet("/channelz", options, fakeWindmillServer); + ChannelzServlet channelzServlet = + new ChannelzServlet("/channelz", options, fakeWindmillServer::getWindmillServiceEndpoints); StringWriter stringWriter = new StringWriter(); PrintWriter writer = new PrintWriter(stringWriter); channelzServlet.captureData(writer); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java index 515beba0c88d..4439c409b32f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java @@ -261,7 +261,7 @@ public void testGetWorkerMetadata_correctlyAddsAndRemovesStreamFromRegistry() { .build()); assertTrue(streamRegistry.contains(stream)); - stream.close(); + stream.halfClose(); assertFalse(streamRegistry.contains(stream)); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java index 6473d5527a81..7e5801b65de4 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java @@ -110,14 +110,13 @@ "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) }) public class GrpcWindmillServerTest { - @Rule public transient Timeout globalTimeout = Timeout.seconds(600); - @Rule public GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); - @Rule public ErrorCollector errorCollector = new ErrorCollector(); - private static final Logger LOG = LoggerFactory.getLogger(GrpcWindmillServerTest.class); private static final int STREAM_CHUNK_SIZE = 2 << 20; private final long clientId = 10L; private final MutableHandlerRegistry serviceRegistry = new MutableHandlerRegistry(); + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); + @Rule public GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + @Rule public ErrorCollector errorCollector = new ErrorCollector(); private Server server; private GrpcWindmillServer client; private int remainingErrors = 20; @@ -329,7 +328,7 @@ public void onCompleted() { }); assertTrue(latch.await(30, TimeUnit.SECONDS)); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); } @@ -490,7 +489,7 @@ private void flushResponse() { }); } done.await(); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(60, TimeUnit.SECONDS)); executor.shutdown(); } @@ -688,7 +687,7 @@ public StreamObserver commitWorkStream( // Make the commit requests, waiting for each of them to be verified and acknowledged. CommitWorkStream stream = client.commitWorkStream(); commitWorkTestHelper(stream, commitRequests, 0, 500); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); } @@ -723,7 +722,7 @@ public StreamObserver commitWorkStream( for (Future f : futures) { f.get(); } - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); executor.shutdown(); } @@ -825,7 +824,7 @@ public void onCompleted() { } } - stream.close(); + stream.halfClose(); isClientClosed.set(true); deadline = System.currentTimeMillis() + 60_000; // 1 min @@ -957,13 +956,13 @@ public void onCompleted() { Map> expectedKeyedGetDataRequests = new HashMap<>(); expectedKeyedGetDataRequests.put("Computation1", makeGetDataHeartbeatRequest(computation1Keys)); expectedKeyedGetDataRequests.put("Computation2", makeGetDataHeartbeatRequest(computation2Keys)); - Map> heartbeatsToRefresh = new HashMap<>(); + Map> heartbeatsToRefresh = new HashMap<>(); heartbeatsToRefresh.put("Computation1", makeHeartbeatRequest(computation1Keys)); heartbeatsToRefresh.put("Computation2", makeHeartbeatRequest(computation2Keys)); GetDataStream stream = client.getDataStream(); stream.refreshActiveWork(heartbeatsToRefresh); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(60, TimeUnit.SECONDS)); boolean receivedAllGetDataHeartbeats = false; @@ -1058,13 +1057,13 @@ public void onCompleted() { } expectedHeartbeats.add(comp1Builder.build()); expectedHeartbeats.add(comp2Builder.build()); - Map> heartbeatRequestMap = new HashMap<>(); + Map> heartbeatRequestMap = new HashMap<>(); heartbeatRequestMap.put("Computation1", makeHeartbeatRequest(computation1Keys)); heartbeatRequestMap.put("Computation2", makeHeartbeatRequest(computation2Keys)); GetDataStream 
stream = client.getDataStream(); stream.refreshActiveWork(heartbeatRequestMap); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(60, TimeUnit.SECONDS)); boolean receivedAllHeartbeatRequests = false; @@ -1143,7 +1142,13 @@ public void onNext(StreamingGetWorkRequest request) { StreamingGetWorkResponseChunk.newBuilder() .setStreamId(id) .setSerializedWorkItem(serializedResponse) - .setRemainingBytesForWorkItem(0); + .setRemainingBytesForWorkItem(0) + .setComputationMetadata( + ComputationWorkItemMetadata.newBuilder() + .setComputationId("computation") + .setInputDataWatermark(1L) + .setDependentRealtimeInputWatermark(1L) + .build()); try { responseObserver.onNext(builder.build()); } catch (IllegalStateException e) { @@ -1176,16 +1181,14 @@ public void onCompleted() { @Nullable Instant inputDataWatermark, Instant synchronizedProcessingTime, Windmill.WorkItem workItem, - Collection getWorkStreamLatencies) -> { - latch.countDown(); - }); + Collection getWorkStreamLatencies) -> latch.countDown()); // Wait for 100 items or 30 seconds. assertTrue(latch.await(30, TimeUnit.SECONDS)); // Confirm that we report at least as much throttle time as our server sent errors for. We will // actually report more due to backoff in restarting streams. assertTrue(this.client.getAndResetThrottleTime() > throttleTime); - stream.close(); + stream.halfClose(); assertTrue(stream.awaitTermination(30, TimeUnit.SECONDS)); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java index 33e47623cd0e..d06ed0f526c7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java @@ -80,7 +80,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -225,7 +224,7 @@ public void resetUnderTest() { .forComputation("comp") .forKey( WindmillComputationKey.create( - "comp", ByteString.copyFrom("dummyKey", Charsets.UTF_8), 123), + "comp", ByteString.copyFrom("dummyKey", StandardCharsets.UTF_8), 123), 17L, workToken) .forFamily(STATE_FAMILY), @@ -240,7 +239,7 @@ public void resetUnderTest() { .forComputation("comp") .forKey( WindmillComputationKey.create( - "comp", ByteString.copyFrom("dummyNewKey", Charsets.UTF_8), 123), + "comp", ByteString.copyFrom("dummyNewKey", StandardCharsets.UTF_8), 123), 17L, workToken) .forFamily(STATE_FAMILY), @@ -255,7 +254,7 @@ public void resetUnderTest() { .forComputation("comp") .forKey( WindmillComputationKey.create( - "comp", ByteString.copyFrom("dummyNewKey", Charsets.UTF_8), 123), + "comp", ByteString.copyFrom("dummyNewKey", StandardCharsets.UTF_8), 123), 17L, workToken) .forFamily(STATE_FAMILY), @@ -2004,7 +2003,9 @@ false, key(NAMESPACE, 
tag), STATE_FAMILY, VarIntCoder.of())) } // clear cache and recreate multimapState - cache.forComputation("comp").invalidate(ByteString.copyFrom("dummyKey", Charsets.UTF_8), 123); + cache + .forComputation("comp") + .invalidate(ByteString.copyFrom("dummyKey", StandardCharsets.UTF_8), 123); resetUnderTest(); multimapState = underTest.state(NAMESPACE, addr); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java index 3460fc4cab92..b06d88bf4bc4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java @@ -27,6 +27,7 @@ import com.google.api.client.util.Lists; import com.google.common.collect.Maps; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.AbstractMap; import java.util.ArrayList; import java.util.Arrays; @@ -35,20 +36,19 @@ import java.util.Optional; import java.util.concurrent.Future; import org.apache.beam.runners.dataflow.worker.KeyTokenInvalidException; -import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; import org.apache.beam.runners.dataflow.worker.WindmillStateTestUtils; import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListEntry; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListRange; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.util.ByteStringOutputStream; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.BaseEncoding; @@ -97,7 +97,7 @@ private static void assertNoReader(Object obj) throws Exception { WindmillStateTestUtils.assertNoReference(obj, WindmillStateReader.class); } - @Mock private MetricTrackingWindmillServerStub mockWindmill; + @Mock private GetDataClient mockWindmill; private WindmillStateReader underTest; @@ -1151,8 +1151,8 @@ public void testReadSortedListWithContinuations() throws Exception { .addFetchRanges(SortedListRange.newBuilder().setStart(beginning).setLimit(end)) .setFetchMaxBytes(WindmillStateReader.MAX_ORDERED_LIST_BYTES)); - final ByteString CONT_1 = ByteString.copyFrom("CONTINUATION_1", Charsets.UTF_8); - final ByteString CONT_2 = ByteString.copyFrom("CONTINUATION_2", Charsets.UTF_8); + final ByteString CONT_1 = ByteString.copyFrom("CONTINUATION_1", StandardCharsets.UTF_8); + final ByteString CONT_2 = ByteString.copyFrom("CONTINUATION_2", 
StandardCharsets.UTF_8); Windmill.KeyedGetDataResponse.Builder response1 = Windmill.KeyedGetDataResponse.newBuilder() .setKey(DATA_KEY) @@ -1327,7 +1327,7 @@ public void testReadTagValuePrefixWithContinuations() throws Exception { .setStateFamily(STATE_FAMILY) .setFetchMaxBytes(WindmillStateReader.MAX_TAG_VALUE_PREFIX_BYTES)); - final ByteString CONT = ByteString.copyFrom("CONTINUATION", Charsets.UTF_8); + final ByteString CONT = ByteString.copyFrom("CONTINUATION", StandardCharsets.UTF_8); Windmill.KeyedGetDataResponse.Builder response1 = Windmill.KeyedGetDataResponse.newBuilder() .setKey(DATA_KEY) diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java index 83ae8aa22ce3..3cda4559c100 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java @@ -19,7 +19,6 @@ import static org.mockito.ArgumentMatchers.anyLong; import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; @@ -28,18 +27,8 @@ import java.util.ArrayList; import java.util.List; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; -import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.junit.After; -import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.Timeout; @@ -50,8 +39,6 @@ public class EvenGetWorkBudgetDistributorTest { @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); @Rule public transient Timeout globalTimeout = Timeout.seconds(600); - private ManagedChannel inProcessChannel; - private CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub; private static GetWorkBudgetDistributor createBudgetDistributor(GetWorkBudget activeWorkBudget) { return GetWorkBudgetDistributors.distributeEvenly(() -> activeWorkBudget); @@ -65,20 +52,6 @@ private static GetWorkBudgetDistributor createBudgetDistributor(long activeWorkI .build()); } - @Before - public void setUp() { - inProcessChannel = - grpcCleanup.register( - InProcessChannelBuilder.forName("WindmillStreamSenderTest").directExecutor().build()); - grpcCleanup.register(inProcessChannel); - stub = CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel); - } - - @After - public void cleanUp() { - 
inProcessChannel.shutdownNow(); - } - @Test public void testDistributeBudget_doesNothingWhenPassedInStreamsEmpty() { createBudgetDistributor(1L) @@ -88,38 +61,40 @@ public void testDistributeBudget_doesNothingWhenPassedInStreamsEmpty() { @Test public void testDistributeBudget_doesNothingWithNoBudget() { - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(GetWorkBudget.noBudget())); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(GetWorkBudget.noBudget())); createBudgetDistributor(1L) - .distributeBudget(ImmutableList.of(windmillStreamSender), GetWorkBudget.noBudget()); - verifyNoInteractions(windmillStreamSender); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), GetWorkBudget.noBudget()); + verifyNoInteractions(getWorkBudgetSpender); } @Test public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHighNoActiveWork() { - WindmillStreamSender windmillStreamSender = + GetWorkBudgetSpender getWorkBudgetSpender = spy( - createWindmillStreamSender( + createGetWorkBudgetOwnerWithRemainingBudgetOf( GetWorkBudget.builder().setItems(10L).setBytes(10L).build())); createBudgetDistributor(0L) .distributeBudget( - ImmutableList.of(windmillStreamSender), + ImmutableList.of(getWorkBudgetSpender), GetWorkBudget.builder().setItems(10L).setBytes(10L).build()); - verify(windmillStreamSender, never()).adjustBudget(anyLong(), anyLong()); + verify(getWorkBudgetSpender, never()).adjustBudget(anyLong(), anyLong()); } @Test public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHighWithActiveWork() { - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(GetWorkBudget.builder().setItems(5L).setBytes(5L).build())); + GetWorkBudgetSpender getWorkBudgetSpender = + spy( + createGetWorkBudgetOwnerWithRemainingBudgetOf( + GetWorkBudget.builder().setItems(5L).setBytes(5L).build())); createBudgetDistributor(10L) .distributeBudget( - ImmutableList.of(windmillStreamSender), + ImmutableList.of(getWorkBudgetSpender), GetWorkBudget.builder().setItems(20L).setBytes(20L).build()); - verify(windmillStreamSender, never()).adjustBudget(anyLong(), anyLong()); + verify(getWorkBudgetSpender, never()).adjustBudget(anyLong(), anyLong()); } @Test @@ -128,12 +103,12 @@ public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHig GetWorkBudget streamRemainingBudget = GetWorkBudget.builder().setItems(1L).setBytes(10L).build(); GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(0L) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq(totalGetWorkBudget.items() - streamRemainingBudget.items()), eq(totalGetWorkBudget.bytes() - streamRemainingBudget.bytes())); @@ -146,12 +121,12 @@ public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHig GetWorkBudget.builder().setItems(1L).setBytes(10L).build(); GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); long activeWorkItemsAndBytes = 2L; - WindmillStreamSender 
windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(activeWorkItemsAndBytes) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq( totalGetWorkBudget.items() @@ -165,12 +140,12 @@ public void testDistributeBudget_adjustsStreamBudgetWhenRemainingByteBudgetTooLo GetWorkBudget streamRemainingBudget = GetWorkBudget.builder().setItems(10L).setBytes(1L).build(); GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(0L) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq(totalGetWorkBudget.items() - streamRemainingBudget.items()), eq(totalGetWorkBudget.bytes() - streamRemainingBudget.bytes())); @@ -184,12 +159,12 @@ public void testDistributeBudget_adjustsStreamBudgetWhenRemainingByteBudgetTooLo GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); long activeWorkItemsAndBytes = 2L; - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(activeWorkItemsAndBytes) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq(totalGetWorkBudget.items() - streamRemainingBudget.items()), eq( @@ -201,9 +176,9 @@ public void testDistributeBudget_adjustsStreamBudgetWhenRemainingByteBudgetTooLo @Test public void testDistributeBudget_distributesBudgetEvenlyIfPossible() { long totalItemsAndBytes = 10L; - List streams = new ArrayList<>(); + List streams = new ArrayList<>(); for (int i = 0; i < totalItemsAndBytes; i++) { - streams.add(spy(createWindmillStreamSender(GetWorkBudget.noBudget()))); + streams.add(spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(GetWorkBudget.noBudget()))); } createBudgetDistributor(0L) .distributeBudget( @@ -223,9 +198,9 @@ public void testDistributeBudget_distributesBudgetEvenlyIfPossible() { @Test public void testDistributeBudget_distributesFairlyWhenNotEven() { long totalItemsAndBytes = 10L; - List streams = new ArrayList<>(); + List streams = new ArrayList<>(); for (int i = 0; i < 3; i++) { - streams.add(spy(createWindmillStreamSender(GetWorkBudget.noBudget()))); + streams.add(spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(GetWorkBudget.noBudget()))); } createBudgetDistributor(0L) .distributeBudget( @@ -242,23 +217,17 @@ public void testDistributeBudget_distributesFairlyWhenNotEven() { .adjustBudget(eq(itemsAndBytesPerStream), eq(itemsAndBytesPerStream))); } - private 
WindmillStreamSender createWindmillStreamSender(GetWorkBudget getWorkBudget) { - return WindmillStreamSender.create( - stub, - Windmill.GetWorkRequest.newBuilder() - .setClientId(1L) - .setJobId("job") - .setProjectId("project") - .build(), - getWorkBudget, - GrpcWindmillStreamFactory.of( - JobHeader.newBuilder() - .setJobId("job") - .setProjectId("project") - .setWorkerId("worker") - .build()) - .build(), - (workItem, watermarks, processingContext, ackWorkItemQueued, getWorkStreamLatencies) -> {}, - ignored -> mock(WorkCommitter.class)); + private GetWorkBudgetSpender createGetWorkBudgetOwnerWithRemainingBudgetOf( + GetWorkBudget getWorkBudget) { + return spy( + new GetWorkBudgetSpender() { + @Override + public void adjustBudget(long itemsDelta, long bytesDelta) {} + + @Override + public GetWorkBudget remainingBudget() { + return getWorkBudget; + } + }); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java index bd55595da135..146b05bb7e35 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/failures/WorkFailureProcessorTest.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures; import static com.google.common.truth.Truth.assertThat; +import static org.mockito.Mockito.mock; import java.util.ArrayList; import java.util.HashSet; @@ -34,6 +35,8 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.joda.time.Duration; @@ -86,8 +89,9 @@ private static ExecutableWork createWork(Supplier clock, Consumer Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), + new FakeGetDataClient(), + ignored -> {}, + mock(HeartbeatSender.class)), clock, new ArrayList<>()), processWorkFn); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java similarity index 76% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresherTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java index 13019116767c..5efb2421fe60 100644 --- 
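The refactor above replaces a fully constructed WindmillStreamSender with a stub of the much narrower GetWorkBudgetSpender interface, so the distributor tests only exercise budget accounting. A minimal sketch of the pattern the tests now rely on, using only the two interface methods visible in the diff (adjustBudget and remainingBudget); the variable names are illustrative:

    // Sketch: a spender that reports a fixed remaining budget and records adjustments via Mockito.
    GetWorkBudget remaining = GetWorkBudget.builder().setItems(1L).setBytes(10L).build();
    GetWorkBudget total = GetWorkBudget.builder().setItems(10L).setBytes(10L).build();
    GetWorkBudgetSpender spender =
        spy(
            new GetWorkBudgetSpender() {
              @Override
              public void adjustBudget(long itemsDelta, long bytesDelta) {}

              @Override
              public GetWorkBudget remainingBudget() {
                return remaining;
              }
            });

    createBudgetDistributor(0L).distributeBudget(ImmutableList.of(spender), total);
    // The distributor tops the stream back up to its full share, so the expected deltas are
    // the totals minus whatever the stream still reports as remaining.
    verify(spender)
        .adjustBudget(
            eq(total.items() - remaining.items()), eq(total.bytes() - remaining.bytes()));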
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/DispatchedActiveWorkRefresherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.*; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.doAnswer; @@ -38,6 +39,7 @@ import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.function.Supplier; +import java.util.stream.Collectors; import org.apache.beam.runners.dataflow.worker.DataflowExecutionStateSampler; import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.ExecutableWork; @@ -46,7 +48,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.HeartbeatRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.FakeGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.runners.direct.Clock; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; @@ -59,13 +61,14 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.mockito.ArgumentCaptor; @RunWith(JUnit4.class) -public class DispatchedActiveWorkRefresherTest { - +public class ActiveWorkRefresherTest { private static final Supplier A_LONG_TIME_AGO = () -> Instant.parse("1998-09-04T00:00:00Z"); private static final String COMPUTATION_ID_PREFIX = "ComputationId-"; + private final HeartbeatSender heartbeatSender = mock(HeartbeatSender.class); private static BoundedQueueExecutor workExecutor() { return new BoundedQueueExecutor( @@ -97,15 +100,20 @@ private ActiveWorkRefresher createActiveWorkRefresher( int activeWorkRefreshPeriodMillis, int stuckCommitDurationMillis, Supplier> computations, - Consumer>> activeWorkRefresherFn) { - return new DispatchedActiveWorkRefresher( + ActiveWorkRefresher.HeartbeatTracker heartbeatTracker) { + return new ActiveWorkRefresher( clock, activeWorkRefreshPeriodMillis, stuckCommitDurationMillis, computations, DataflowExecutionStateSampler.instance(), - activeWorkRefresherFn, - Executors.newSingleThreadScheduledExecutor()); + Executors.newSingleThreadScheduledExecutor(), + heartbeatTracker); + } + + private ExecutableWork createOldWork(int workIds, Consumer processWork) { + ShardedKey shardedKey = ShardedKey.create(ByteString.EMPTY, workIds); + return createOldWork(shardedKey, workIds, processWork); } private ExecutableWork createOldWork( @@ -120,10 +128,8 @@ private ExecutableWork createOldWork( .build(), Watermarks.builder().setInputDataWatermark(Instant.EPOCH).build(), Work.createProcessingContext( - "computationId", - (a, b) -> Windmill.KeyedGetDataResponse.getDefaultInstance(), - ignored -> {}), - DispatchedActiveWorkRefresherTest.A_LONG_TIME_AGO, + "computationId", new FakeGetDataClient(), ignored -> {}, heartbeatSender), + A_LONG_TIME_AGO, ImmutableList.of()), processWork); } @@ -147,8 +153,7 @@ public void 
testActiveWorkRefresh() throws InterruptedException { Map> computationsAndWork = new HashMap<>(); for (int i = 0; i < 5; i++) { ComputationState computationState = createComputationState(i); - ExecutableWork fakeWork = - createOldWork(ShardedKey.create(ByteString.EMPTY, i), i, processWork); + ExecutableWork fakeWork = createOldWork(i, processWork); computationState.activateWork(fakeWork); computations.add(computationState); @@ -158,41 +163,45 @@ public void testActiveWorkRefresh() throws InterruptedException { activeWorkForComputation.add(fakeWork); } - Map> expectedHeartbeats = new HashMap<>(); CountDownLatch heartbeatsSent = new CountDownLatch(1); TestClock fakeClock = new TestClock(Instant.now()); - ActiveWorkRefresher activeWorkRefresher = createActiveWorkRefresher( fakeClock::now, activeWorkRefreshPeriodMillis, 0, () -> computations, - heartbeats -> { - expectedHeartbeats.putAll(heartbeats); - heartbeatsSent.countDown(); - }); + heartbeats -> heartbeatsSent::countDown); + ArgumentCaptor heartbeatsCaptor = ArgumentCaptor.forClass(Heartbeats.class); activeWorkRefresher.start(); fakeClock.advance(Duration.millis(activeWorkRefreshPeriodMillis * 2)); heartbeatsSent.await(); activeWorkRefresher.stop(); + verify(heartbeatSender).sendHeartbeats(heartbeatsCaptor.capture()); + Heartbeats fanoutExpectedHeartbeats = heartbeatsCaptor.getValue(); + assertThat(computationsAndWork.size()) + .isEqualTo(fanoutExpectedHeartbeats.heartbeatRequests().size()); - assertThat(computationsAndWork.size()).isEqualTo(expectedHeartbeats.size()); - for (Map.Entry> expectedHeartbeat : - expectedHeartbeats.entrySet()) { + for (Map.Entry> expectedHeartbeat : + fanoutExpectedHeartbeats.heartbeatRequests().asMap().entrySet()) { String computationId = expectedHeartbeat.getKey(); - List heartbeatRequests = expectedHeartbeat.getValue(); - List work = computationsAndWork.get(computationId); - + Collection heartbeatRequests = expectedHeartbeat.getValue(); + List work = + computationsAndWork.get(computationId).stream() + .map(ExecutableWork::work) + .collect(Collectors.toList()); // Compare the heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys. 
assertThat(heartbeatRequests) .comparingElementsUsing( Correspondence.from( - (HeartbeatRequest h, ExecutableWork w) -> - h.getWorkToken() == w.getWorkItem().getWorkToken() - && h.getCacheToken() == w.getWorkItem().getWorkToken() - && h.getShardingKey() == w.getWorkItem().getShardingKey(), + (Windmill.HeartbeatRequest h, Work w) -> { + assert h != null; + assert w != null; + return h.getWorkToken() == w.getWorkItem().getWorkToken() + && h.getCacheToken() == w.getWorkItem().getWorkToken() + && h.getShardingKey() == w.getWorkItem().getShardingKey(); + }, "heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys should be equal.")) .containsExactlyElementsIn(work); } @@ -202,6 +211,32 @@ public void testActiveWorkRefresh() throws InterruptedException { workIsProcessed.countDown(); } + @Test + public void testEmptyActiveWorkRefresh() throws InterruptedException { + int activeWorkRefreshPeriodMillis = 100; + + List computations = new ArrayList<>(); + for (int i = 0; i < 5; i++) { + ComputationState computationState = createComputationState(i); + computations.add(computationState); + } + + CountDownLatch heartbeatsSent = new CountDownLatch(1); + TestClock fakeClock = new TestClock(Instant.now()); + ActiveWorkRefresher activeWorkRefresher = + createActiveWorkRefresher( + fakeClock::now, + activeWorkRefreshPeriodMillis, + 0, + () -> computations, + heartbeats -> heartbeatsSent::countDown); + + activeWorkRefresher.start(); + fakeClock.advance(Duration.millis(activeWorkRefreshPeriodMillis * 2)); + assertFalse(heartbeatsSent.await(500, TimeUnit.MILLISECONDS)); + activeWorkRefresher.stop(); + } + @Test public void testInvalidateStuckCommits() throws InterruptedException { int stuckCommitDurationMillis = 100; @@ -240,7 +275,7 @@ public void testInvalidateStuckCommits() throws InterruptedException { 0, stuckCommitDurationMillis, computations.rowMap()::keySet, - ignored -> {}); + ignored -> () -> {}); activeWorkRefresher.start(); fakeClock.advance(Duration.millis(stuckCommitDurationMillis)); diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java index a8e5e2ab6a88..8c7a356b9939 100644 --- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.OutputStream; import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.HashMap; @@ -56,7 +57,6 @@ import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Status; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.StatusException; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -192,7 +192,7 @@ public void removeStagedArtifacts(String stagingToken) throws IOException { private ResourceId stagingDir(String stagingToken) { return FileSystems.matchNewResource(root, true) .resolve( - 
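The heartbeat assertions above lean on two test utilities worth calling out: Mockito's ArgumentCaptor, which grabs the Heartbeats object handed to the mocked HeartbeatSender, and Truth's Correspondence, which compares HeartbeatRequest protos against Work items without converting one type into the other. A condensed sketch of that pairing, using the types shown in the diff; it assumes `heartbeatRequests` (the captured requests for one computation) and `work` (the Work items activated for that computation) are already in scope, and the cache-token check is omitted:

    // Capture what the refresher actually handed to the mocked sender.
    ArgumentCaptor<Heartbeats> captor = ArgumentCaptor.forClass(Heartbeats.class);
    verify(heartbeatSender).sendHeartbeats(captor.capture());
    Heartbeats sent = captor.getValue();

    // Compare protos to Work items field-by-field via a Correspondence.
    Correspondence<Windmill.HeartbeatRequest, Work> matchesWork =
        Correspondence.from(
            (Windmill.HeartbeatRequest h, Work w) ->
                h.getWorkToken() == w.getWorkItem().getWorkToken()
                    && h.getShardingKey() == w.getWorkItem().getShardingKey(),
            "has the same work token and sharding key as");
    assertThat(heartbeatRequests)
        .comparingElementsUsing(matchesWork)
        .containsExactlyElementsIn(work);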
Hashing.sha256().hashString(stagingToken, Charsets.UTF_8).toString(), + Hashing.sha256().hashString(stagingToken, StandardCharsets.UTF_8).toString(), ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY); } }; diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java index e2c45850dba9..4e1c31744c1a 100644 --- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java @@ -19,6 +19,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.EnumMap; @@ -54,7 +55,6 @@ import org.apache.beam.sdk.util.common.Reiterable; import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; /** @@ -655,7 +655,8 @@ private BagUserStateHandler createHandl } private static BeamFnApi.ProcessBundleRequest.CacheToken createCacheToken() { - ByteString token = ByteString.copyFrom(UUID.randomUUID().toString().getBytes(Charsets.UTF_8)); + ByteString token = + ByteString.copyFrom(UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8)); return BeamFnApi.ProcessBundleRequest.CacheToken.newBuilder() .setUserState(BeamFnApi.ProcessBundleRequest.CacheToken.UserState.getDefaultInstance()) .setToken(token) diff --git a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java index d6b48a936135..4d19e87c3d11 100644 --- a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java +++ b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -33,7 +34,6 @@ import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.Before; @@ -74,7 +74,7 @@ private void stageFiles(Map files) throws IOException { for (Map.Entry entry : files.entrySet()) { Files.write( Paths.get(stagingDir.toString(), entry.getKey()), - entry.getValue().getBytes(Charsets.UTF_8)); + entry.getValue().getBytes(StandardCharsets.UTF_8)); } } diff --git a/runners/portability/java/src/main/java/org/apache/beam/runners/portability/PortableMetrics.java 
b/runners/portability/java/src/main/java/org/apache/beam/runners/portability/PortableMetrics.java index 1d45a83b1e79..7ae57a4b3089 100644 --- a/runners/portability/java/src/main/java/org/apache/beam/runners/portability/PortableMetrics.java +++ b/runners/portability/java/src/main/java/org/apache/beam/runners/portability/PortableMetrics.java @@ -90,6 +90,7 @@ public MetricQueryResults queryMetrics(MetricsFilter filter) { private static PortableMetrics convertMonitoringInfosToMetricResults( JobApi.MetricResults jobMetrics) { List monitoringInfoList = new ArrayList<>(); + // TODO(https://github.com/apache/beam/issues/32001) dedup Attempted and Committed metrics monitoringInfoList.addAll(jobMetrics.getAttemptedList()); monitoringInfoList.addAll(jobMetrics.getCommittedList()); Iterable> countersFromJobMetrics = diff --git a/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java b/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java index 533106869c62..a36c1e8b2efb 100644 --- a/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java +++ b/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java @@ -21,6 +21,7 @@ import com.google.auto.service.AutoService; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; import org.apache.beam.runners.portability.PortableRunner; @@ -33,7 +34,6 @@ import org.apache.beam.sdk.options.PortablePipelineOptions; import org.apache.beam.sdk.runners.PipelineRunnerRegistrar; import org.apache.beam.sdk.testing.TestPipelineOptions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.hamcrest.Matchers; @@ -65,7 +65,8 @@ public PipelineResult run(Pipeline pipeline) { testOptions.setJobEndpoint( "localhost:" + new String( - Files.readAllBytes(Paths.get(localServicePortFilePath)), Charsets.UTF_8) + Files.readAllBytes(Paths.get(localServicePortFilePath)), + StandardCharsets.UTF_8) .trim()); } catch (IOException e) { throw new RuntimeException( diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index dfc863e8f639..96ab4e70a579 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -26,13 +26,20 @@ description = "Apache Beam :: Runners :: Prism :: Java" ext.summary = "Support for executing a pipeline on Prism." 
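The Charsets changes in the files above (ArtifactStagingService, StateRequestHandlers, ArtifactRetrievalServiceTest, TestUniversalRunner) are all the same mechanical substitution: the vendored Guava Charsets.UTF_8 constant is swapped for the JDK's StandardCharsets.UTF_8, dropping an unnecessary vendored-Guava import. In isolation the substitution looks like this (illustrative snippet, not taken from the diff):

    import java.nio.charset.StandardCharsets;

    class Utf8Example {
      // Before: value.getBytes(Charsets.UTF_8) via the vendored Guava Charsets class.
      // After: the equivalent JDK constant; identical behavior, one less vendored dependency.
      static byte[] utf8Bytes(String value) {
        return value.getBytes(StandardCharsets.UTF_8);
      }
    }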
dependencies { + implementation project(path: ":model:job-management", configuration: "shadow") + implementation project(path: ":model:pipeline", configuration: "shadow") implementation project(path: ":sdks:java:core", configuration: "shadow") + implementation project(path: ":sdks:java:harness", configuration: "shadow") + implementation project(":runners:java-fn-execution") implementation project(":runners:portability:java") + implementation library.java.joda_time implementation library.java.slf4j_api + implementation library.java.vendored_grpc_1_60_1 implementation library.java.vendored_guava_32_1_2_jre testImplementation library.java.junit + testImplementation library.java.mockito_core testImplementation library.java.truth } diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java new file mode 100644 index 000000000000..db56bc6047ca --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.util.Optional; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.util.construction.DefaultArtifactResolver; +import org.apache.beam.sdk.util.construction.PipelineTranslation; +import org.apache.beam.sdk.util.construction.SdkComponents; + +/** + * The {@link PrismArtifactResolver} converts a {@link Pipeline} to a {@link RunnerApi.Pipeline} via + * resolving {@link RunnerApi.ArtifactInformation}. + */ +@AutoValue +abstract class PrismArtifactResolver { + + /** + * Instantiates a {@link PrismArtifactResolver} from the {@param pipeline}, applying defaults to + * the remaining dependencies. + */ + static PrismArtifactResolver of(Pipeline pipeline) { + return PrismArtifactResolver.builder().setPipeline(pipeline).build(); + } + + static Builder builder() { + return new AutoValue_PrismArtifactResolver.Builder(); + } + + /** + * Converts the {@link #getPipeline()} using {@link PipelineTranslation#toProto} and {@link + * #getDelegate()}'s {@link + * org.apache.beam.sdk.util.construction.ArtifactResolver#resolveArtifacts}. 
+ */ + RunnerApi.Pipeline resolvePipelineProto() { + RunnerApi.Pipeline result = PipelineTranslation.toProto(getPipeline(), getSdkComponents()); + return getDelegate().resolveArtifacts(result); + } + + /** + * {@link PrismArtifactResolver} delegates to {@link + * org.apache.beam.sdk.util.construction.ArtifactResolver} to transform {@link + * RunnerApi.ArtifactInformation}. Defaults to {@link DefaultArtifactResolver#INSTANCE} if not + * set. + */ + abstract org.apache.beam.sdk.util.construction.ArtifactResolver getDelegate(); + + /** The {@link Pipeline} from which {@link PrismArtifactResolver#resolvePipelineProto()}. */ + abstract Pipeline getPipeline(); + + /** + * SDK objects that will be represented by {@link + * org.apache.beam.model.pipeline.v1.RunnerApi.Components}. Instantiated via {@link + * SdkComponents#create(PipelineOptions)} by default, where {@link PipelineOptions} are acquired + * from {@link #getPipeline}'s {@link Pipeline#getOptions}. + */ + abstract SdkComponents getSdkComponents(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setDelegate( + org.apache.beam.sdk.util.construction.ArtifactResolver artifactResolver); + + abstract Optional getDelegate(); + + abstract Builder setSdkComponents(SdkComponents sdkComponents); + + abstract Optional getSdkComponents(); + + abstract Builder setPipeline(Pipeline pipeline); + + abstract Optional getPipeline(); + + abstract PrismArtifactResolver autoBuild(); + + final PrismArtifactResolver build() { + if (!getDelegate().isPresent()) { + setDelegate(DefaultArtifactResolver.INSTANCE); + } + + if (!getSdkComponents().isPresent()) { + checkState(getPipeline().isPresent()); + setSdkComponents(SdkComponents.create(getPipeline().get().getOptions())); + } + + return autoBuild(); + } + } +} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java new file mode 100644 index 000000000000..f1d99a213eea --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
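PrismArtifactResolver above is the entry point for turning an SDK Pipeline into the portable proto that Prism consumes. A minimal usage sketch, mirroring the PrismArtifactResolverTest that appears later in this diff (the pipeline contents are illustrative):

    Pipeline pipeline = Pipeline.create();
    pipeline.apply(Impulse.create());

    PrismArtifactResolver resolver = PrismArtifactResolver.of(pipeline);
    // Translates the Pipeline via PipelineTranslation.toProto and resolves artifact information
    // through the delegate (DefaultArtifactResolver.INSTANCE unless a different one is set).
    RunnerApi.Pipeline pipelineProto = resolver.resolvePipelineProto();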
+ */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.util.Optional; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import org.apache.beam.model.jobmanagement.v1.ArtifactStagingServiceGrpc; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService; +import org.apache.beam.runners.fnexecution.artifact.ArtifactStagingService; +import org.apache.beam.sdk.fn.channel.ManagedChannelFactory; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Stages {@link org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline} artifacts of prepared jobs. + */ +@AutoValue +abstract class PrismArtifactStager implements AutoCloseable { + + private static final Logger LOG = LoggerFactory.getLogger(PrismArtifactStager.class); + + /** + * Instantiate a {@link PrismArtifactStager} via call to {@link #of(String, String)}, assigning + * {@link Builder#setStagingEndpoint} using {@param prepareJobResponse} {@link + * JobApi.PrepareJobResponse#getArtifactStagingEndpoint} and {@link + * JobApi.PrepareJobResponse#getStagingSessionToken}. + */ + static PrismArtifactStager of(JobApi.PrepareJobResponse prepareJobResponse) { + return of( + prepareJobResponse.getArtifactStagingEndpoint().getUrl(), + prepareJobResponse.getStagingSessionToken()); + } + + /** + * Instantiates a {@link PrismArtifactStager} from the {@param stagingEndpoint} URL and {@param + * stagingSessionToken} to instantiate the {@link #getRetrievalService}, {@link + * #getManagedChannel}, and {@link #getStagingServiceStub} defaults. See the referenced getters + * for more details. + */ + static PrismArtifactStager of(String stagingEndpoint, String stagingSessionToken) { + return PrismArtifactStager.builder() + .setStagingEndpoint(stagingEndpoint) + .setStagingSessionToken(stagingSessionToken) + .build(); + } + + static Builder builder() { + return new AutoValue_PrismArtifactStager.Builder(); + } + + /** + * Stage the {@link org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline} artifacts via {@link + * ArtifactStagingService#offer} supplying {@link #getRetrievalService}, {@link + * #getStagingServiceStub}, and {@link #getStagingSessionToken}. + */ + void stage() throws ExecutionException, InterruptedException { + LOG.info("staging artifacts at {}", getStagingEndpoint()); + ArtifactStagingService.offer( + getRetrievalService(), getStagingServiceStub(), getStagingSessionToken()); + } + + /** The URL of the {@link ArtifactStagingService}. */ + abstract String getStagingEndpoint(); + + /** + * Token associated with a staging session and acquired from a {@link + * JobServiceGrpc.JobServiceStub#prepare}'s {@link JobApi.PrepareJobResponse}. + */ + abstract String getStagingSessionToken(); + + /** + * The service that retrieves artifacts; defaults to instantiating from the default {@link + * ArtifactRetrievalService#ArtifactRetrievalService()} constructor. + */ + abstract ArtifactRetrievalService getRetrievalService(); + + /** + * Used to instantiate the {@link #getStagingServiceStub}. 
By default, instantiates using {@link + * ManagedChannelFactory#forDescriptor(Endpoints.ApiServiceDescriptor)}, where {@link + * Endpoints.ApiServiceDescriptor} is instantiated via {@link + * Endpoints.ApiServiceDescriptor.Builder#setUrl(String)} and the URL provided by {@link + * #getStagingEndpoint}. + */ + abstract ManagedChannel getManagedChannel(); + + /** + * Required by {@link ArtifactStagingService#offer}. By default, instantiates using {@link + * ArtifactStagingServiceGrpc#newStub} and {@link #getManagedChannel}. + */ + abstract ArtifactStagingServiceGrpc.ArtifactStagingServiceStub getStagingServiceStub(); + + @Override + public void close() { + LOG.info("shutting down {}", PrismArtifactStager.class); + getRetrievalService().close(); + getManagedChannel().shutdown(); + try { + getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + } + } + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setStagingEndpoint(String stagingEndpoint); + + abstract Optional getStagingEndpoint(); + + abstract Builder setStagingSessionToken(String stagingSessionToken); + + abstract Builder setRetrievalService(ArtifactRetrievalService retrievalService); + + abstract Optional getRetrievalService(); + + abstract Builder setManagedChannel(ManagedChannel managedChannel); + + abstract Optional getManagedChannel(); + + abstract Builder setStagingServiceStub( + ArtifactStagingServiceGrpc.ArtifactStagingServiceStub stub); + + abstract Optional + getStagingServiceStub(); + + abstract PrismArtifactStager autoBuild(); + + final PrismArtifactStager build() { + + checkState(getStagingEndpoint().isPresent(), "missing staging endpoint"); + ManagedChannelFactory channelFactory = ManagedChannelFactory.createDefault(); + + if (!getManagedChannel().isPresent()) { + Endpoints.ApiServiceDescriptor descriptor = + Endpoints.ApiServiceDescriptor.newBuilder().setUrl(getStagingEndpoint().get()).build(); + setManagedChannel(channelFactory.forDescriptor(descriptor)); + } + + if (!getStagingServiceStub().isPresent()) { + setStagingServiceStub(ArtifactStagingServiceGrpc.newStub(getManagedChannel().get())); + } + + if (!getRetrievalService().isPresent()) { + setRetrievalService(new ArtifactRetrievalService()); + } + + return autoBuild(); + } + } +} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismExecutor.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismExecutor.java index fba2eec99c5c..620d5508f22a 100644 --- a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismExecutor.java +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismExecutor.java @@ -87,6 +87,14 @@ void stop() { } } + /** Reports whether the Prism executable {@link Process#isAlive()}. */ + boolean isAlive() { + if (process == null) { + return false; + } + return process.isAlive(); + } + /** * Execute the {@link ProcessBuilder} that starts the Prism service. Redirects output to STDOUT. */ diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java new file mode 100644 index 000000000000..e461e92c4749 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
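Once a job has been prepared, the PrismArtifactStager defined above pushes the pipeline's artifacts to the staging endpoint advertised by the job service. A short sketch of its intended use; the prepareJobResponse variable is a hypothetical stand-in for a real JobApi.PrepareJobResponse:

    static void stageArtifacts(JobApi.PrepareJobResponse prepareJobResponse)
        throws ExecutionException, InterruptedException {
      // of(...) reads the artifact staging endpoint URL and staging session token from the response.
      try (PrismArtifactStager stager = PrismArtifactStager.of(prepareJobResponse)) {
        stager.stage();
      }
    }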
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.io.Closeable; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.fn.channel.ManagedChannelFactory; +import org.apache.beam.sdk.options.PortablePipelineOptions; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.joda.time.Duration; + +/** + * A wrapper for {@link JobServiceGrpc.JobServiceBlockingStub} that {@link #close}es when {@link + * StateListener#onStateChanged} is invoked with a {@link PipelineResult.State} that is {@link + * PipelineResult.State#isTerminal}. + */ +@AutoValue +abstract class PrismJobManager implements StateListener, Closeable { + + /** + * Instantiate a {@link PrismJobManager} with {@param options}, assigning {@link #getEndpoint} + * from {@link PortablePipelineOptions#getJobEndpoint} and {@link #getTimeout} from {@link + * PortablePipelineOptions#getJobServerTimeout}. Defaults the instantiations of {@link + * #getManagedChannel} and {@link #getBlockingStub}. See respective getters for more details. + */ + static PrismJobManager of(PortablePipelineOptions options) { + return builder() + .setEndpoint(options.getJobEndpoint()) + .setTimeout(Duration.standardSeconds(options.getJobServerTimeout())) + .build(); + } + + static Builder builder() { + return new AutoValue_PrismJobManager.Builder(); + } + + /** + * Executes {@link #getBlockingStub()}'s {@link JobServiceGrpc.JobServiceBlockingStub#prepare} + * method. + */ + JobApi.PrepareJobResponse prepare(JobApi.PrepareJobRequest request) { + return getBlockingStub().prepare(request); + } + + /** + * Executes {@link #getBlockingStub()}'s {@link JobServiceGrpc.JobServiceBlockingStub#run} method. + */ + JobApi.RunJobResponse run(JobApi.RunJobRequest request) { + return getBlockingStub().run(request); + } + + /** The {@link JobServiceGrpc} endpoint. */ + abstract String getEndpoint(); + + /** The {@link JobServiceGrpc} timeout. */ + abstract Duration getTimeout(); + + /** The {@link #getBlockingStub}'s channel. Defaulted from the {@link #getEndpoint()}. */ + abstract ManagedChannel getManagedChannel(); + + /** The wrapped service defaulted using the {@link #getManagedChannel}. */ + abstract JobServiceGrpc.JobServiceBlockingStub getBlockingStub(); + + /** Shuts down {@link #getManagedChannel}, if not {@link #isShutdown}. 
*/ + @Override + public void close() { + if (isShutdown()) { + return; + } + getManagedChannel().shutdown(); + try { + getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + } + } + + /** Queries whether {@link #getManagedChannel} {@link ManagedChannel#isShutdown}. */ + boolean isShutdown() { + return getManagedChannel().isShutdown(); + } + + /** + * Override of {@link StateListener#onStateChanged}. Invokes {@link #close} when {@link + * PipelineResult.State} {@link PipelineResult.State#isTerminal}. + */ + @Override + public void onStateChanged(PipelineResult.State state) { + if (state.isTerminal()) { + close(); + } + } + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setEndpoint(String endpoint); + + abstract Optional getEndpoint(); + + abstract Builder setTimeout(Duration timeout); + + abstract Optional getTimeout(); + + abstract Builder setManagedChannel(ManagedChannel managedChannel); + + abstract Optional getManagedChannel(); + + abstract Builder setBlockingStub(JobServiceGrpc.JobServiceBlockingStub blockingStub); + + abstract Optional getBlockingStub(); + + abstract PrismJobManager autoBuild(); + + final PrismJobManager build() { + + checkState(getEndpoint().isPresent(), "endpoint is not set"); + checkState(getTimeout().isPresent(), "timeout is not set"); + + if (!getManagedChannel().isPresent()) { + ManagedChannelFactory channelFactory = ManagedChannelFactory.createDefault(); + + setManagedChannel( + channelFactory.forDescriptor( + Endpoints.ApiServiceDescriptor.newBuilder().setUrl(getEndpoint().get()).build())); + } + + if (!getBlockingStub().isPresent()) { + setBlockingStub( + JobServiceGrpc.newBlockingStub(getManagedChannel().get()) + .withDeadlineAfter(getTimeout().get().getMillis(), TimeUnit.MILLISECONDS) + .withWaitForReady()); + } + + return autoBuild(); + } + } +} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismPipelineResult.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismPipelineResult.java new file mode 100644 index 000000000000..a551196c9b6f --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismPipelineResult.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import java.io.IOException; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.metrics.MetricResults; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; + +/** + * The {@link PipelineResult} of executing a {@link org.apache.beam.sdk.Pipeline} using the {@link + * PrismRunner} and an internal {@link PipelineResult} delegate. 
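PrismJobManager above wraps the blocking job-service stub for the two calls the runner needs. A sketch of the prepare-then-run flow, assuming `options` points at a running Prism job endpoint and `pipelineProto` came from PrismArtifactResolver (both are placeholders here):

    PrismJobManager jobManager = PrismJobManager.of(options);

    JobApi.PrepareJobResponse prepared =
        jobManager.prepare(
            JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineProto).build());

    // Artifacts would be staged here, e.g. with PrismArtifactStager.of(prepared).

    JobApi.RunJobResponse running =
        jobManager.run(
            JobApi.RunJobRequest.newBuilder()
                .setPreparationId(prepared.getPreparationId())
                .build());
    // The manager shuts its channel down on its own once onStateChanged sees a terminal state.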
+ */ +class PrismPipelineResult implements PipelineResult { + + static PrismPipelineResult of(PipelineResult delegate, PrismExecutor executor) { + return new PrismPipelineResult(delegate, executor::stop); + } + + private final PipelineResult delegate; + private final Runnable cancel; + private @Nullable MetricResults terminalMetrics; + private @Nullable State terminalState; + + /** + * Instantiate the {@link PipelineResult} from the {@param delegate} and a {@param cancel} to be + * called when stopping the underlying executable Job management service. + */ + PrismPipelineResult(PipelineResult delegate, Runnable cancel) { + this.delegate = delegate; + this.cancel = cancel; + } + + /** Forwards the result of the delegate {@link PipelineResult#getState}. */ + @Override + public State getState() { + if (terminalState != null) { + return terminalState; + } + return delegate.getState(); + } + + /** + * Forwards the result of the delegate {@link PipelineResult#cancel}. Invokes {@link + * PrismExecutor#stop()} before returning the resulting {@link + * org.apache.beam.sdk.PipelineResult.State}. + */ + @Override + public State cancel() throws IOException { + State state = delegate.cancel(); + this.terminalMetrics = delegate.metrics(); + this.terminalState = state; + this.cancel.run(); + return state; + } + + /** + * Forwards the result of the delegate {@link PipelineResult#waitUntilFinish(Duration)}. Invokes + * {@link PrismExecutor#stop()} before returning the resulting {@link + * org.apache.beam.sdk.PipelineResult.State}. + */ + @Override + public State waitUntilFinish(Duration duration) { + State state = delegate.waitUntilFinish(duration); + this.terminalMetrics = delegate.metrics(); + this.terminalState = state; + this.cancel.run(); + return state; + } + + /** + * Forwards the result of the delegate {@link PipelineResult#waitUntilFinish}. Invokes {@link + * PrismExecutor#stop()} before returning the resulting {@link + * org.apache.beam.sdk.PipelineResult.State}. + */ + @Override + public State waitUntilFinish() { + State state = delegate.waitUntilFinish(); + this.terminalMetrics = delegate.metrics(); + this.terminalState = state; + this.cancel.run(); + return state; + } + + /** Forwards the result of the delegate {@link PipelineResult#metrics}. */ + @Override + public MetricResults metrics() { + if (terminalMetrics != null) { + return terminalMetrics; + } + return delegate.metrics(); + } +} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateListener.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateListener.java new file mode 100644 index 000000000000..89f537e4f812 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateListener.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
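PrismPipelineResult above is a thin delegate that snapshots the terminal state and metrics before running the cancel hook, so callers can still read results after the Prism process has stopped. How the pieces are expected to fit together (sketch; `portableResult` and `executor` are hypothetical stand-ins for the delegate PipelineResult and the running PrismExecutor):

    PrismPipelineResult result = PrismPipelineResult.of(portableResult, executor);
    PipelineResult.State state = result.waitUntilFinish();
    // After a terminal wait, metrics() serves the snapshot captured before executor::stop ran.
    MetricResults metrics = result.metrics();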
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import org.apache.beam.sdk.PipelineResult; + +/** Listens for {@link PipelineResult.State} changes reported by the {@link StateWatcher}. */ +interface StateListener { + + /** Callback invoked when {@link StateWatcher} discovers a {@link PipelineResult.State} change. */ + void onStateChanged(PipelineResult.State state); +} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateWatcher.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateWatcher.java new file mode 100644 index 000000000000..fe9eb84a72b5 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/StateWatcher.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import com.google.auto.value.AutoValue; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ChannelCredentials; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.InsecureChannelCredentials; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.netty.NettyChannelBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; + +/** + * {@link StateWatcher} {@link #watch}es for and reports {@link PipelineResult.State} changes to + * {@link StateListener}s. + */ +@AutoValue +abstract class StateWatcher implements AutoCloseable { + + private Optional latestState = Optional.empty(); + + /** + * Instantiates a {@link StateWatcher} with {@link InsecureChannelCredentials}. {@link + * StateWatcher} will report to each {@link StateListener} of {@param listeners} of any changed + * {@link PipelineResult.State}. + */ + static StateWatcher insecure(String endpoint, StateListener... listeners) { + return StateWatcher.builder() + .setEndpoint(HostAndPort.fromString(endpoint)) + .setCredentials(InsecureChannelCredentials.create()) + .setListeners(Arrays.asList(listeners)) + .build(); + } + + /** + * Watch for a Job's {@link PipelineResult.State} change. A {@link + * org.apache.beam.model.jobmanagement.v1.JobApi.GetJobStateRequest} identifies a Job to watch via + * its {@link JobApi.GetJobStateRequest#getJobId()}. 
The method is blocking until the {@link + * JobApi.JobStateEvent} {@link StreamObserver#onCompleted()}. + */ + void watch(String jobId) { + JobApi.GetJobStateRequest request = + JobApi.GetJobStateRequest.newBuilder().setJobId(jobId).build(); + Iterator iterator = getJobServiceBlockingStub().getStateStream(request); + while (iterator.hasNext()) { + JobApi.JobStateEvent event = iterator.next(); + PipelineResult.State state = PipelineResult.State.valueOf(event.getState().name()); + publish(state); + } + } + + private void publish(PipelineResult.State state) { + if (latestState.isPresent() && latestState.get().equals(state)) { + return; + } + latestState = Optional.of(state); + for (StateListener listener : getListeners()) { + listener.onStateChanged(state); + } + } + + static Builder builder() { + return new AutoValue_StateWatcher.Builder(); + } + + abstract HostAndPort getEndpoint(); + + abstract ChannelCredentials getCredentials(); + + abstract List getListeners(); + + abstract ManagedChannel getManagedChannel(); + + abstract JobServiceGrpc.JobServiceBlockingStub getJobServiceBlockingStub(); + + @Override + public void close() { + getManagedChannel().shutdown(); + try { + getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + } + } + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setEndpoint(HostAndPort endpoint); + + abstract Optional getEndpoint(); + + abstract Builder setCredentials(ChannelCredentials credentials); + + abstract Optional getCredentials(); + + abstract Builder setListeners(List listeners); + + abstract Builder setManagedChannel(ManagedChannel managedChannel); + + abstract Builder setJobServiceBlockingStub( + JobServiceGrpc.JobServiceBlockingStub jobServiceBlockingStub); + + abstract StateWatcher autoBuild(); + + final StateWatcher build() { + if (!getEndpoint().isPresent()) { + throw new IllegalStateException("missing endpoint"); + } + if (!getCredentials().isPresent()) { + throw new IllegalStateException("missing credentials"); + } + HostAndPort endpoint = getEndpoint().get(); + ManagedChannel channel = + NettyChannelBuilder.forAddress( + endpoint.getHost(), endpoint.getPort(), getCredentials().get()) + .build(); + setManagedChannel(channel); + setJobServiceBlockingStub(JobServiceGrpc.newBlockingStub(channel)); + + return autoBuild(); + } + } +} diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java new file mode 100644 index 000000000000..289ffac64f8a --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
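StateWatcher above streams job-state events and fans distinct changes out to StateListeners, which are single-method interfaces and therefore usable as lambdas. A usage sketch; the endpoint and job id are placeholders:

    StateListener printer = state -> System.out.println("job state: " + state);
    try (StateWatcher watcher = StateWatcher.insecure("localhost:8073", printer)) {
      // Blocks until the job's state stream completes, publishing each distinct state change
      // to every registered listener.
      watcher.watch("job-001");
    }

In the runner itself, PrismJobManager and WorkerService are the listeners, each closing itself down once the reported state is terminal.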
+ */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import org.apache.beam.fn.harness.ExternalWorkerService; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.fn.server.GrpcFnServer; +import org.apache.beam.sdk.options.PortablePipelineOptions; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An {@link ExternalWorkerService} {@link GrpcFnServer} encapsulation that {@link #stop}s when + * {@link StateListener#onStateChanged} is invoked with a {@link PipelineResult.State} that is + * {@link PipelineResult.State#isTerminal}. + */ +class WorkerService implements StateListener { + + private static final Logger LOG = LoggerFactory.getLogger(WorkerService.class); + + private final ExternalWorkerService worker; + private @MonotonicNonNull GrpcFnServer server; + + WorkerService(PortablePipelineOptions options) { + this.worker = new ExternalWorkerService(options); + } + + /** Start the {@link ExternalWorkerService}. */ + void start() throws Exception { + if (server != null && !server.getServer().isShutdown()) { + return; + } + + server = worker.start(); + LOG.info("Starting worker service at {}", getApiServiceDescriptorUrl()); + } + + /** + * Queries whether the {@link ExternalWorkerService} {@link GrpcFnServer}'s {@link Server} is + * running. + */ + boolean isRunning() { + if (server == null) { + return false; + } + return !server.getServer().isShutdown(); + } + + /** + * Queries the {@link Endpoints.ApiServiceDescriptor#getUrl} of the {@link ExternalWorkerService} + * {@link GrpcFnServer}'s {@link Server}. Throws an exception if the {@link WorkerService} has not + * {@link WorkerService#start}ed. + */ + String getApiServiceDescriptorUrl() { + return checkStateNotNull(server, "worker service not started") + .getApiServiceDescriptor() + .getUrl(); + } + + /** + * Updates {@link PortablePipelineOptions#getDefaultEnvironmentConfig} with {@link + * #getApiServiceDescriptorUrl}. Throws an exception if the {@link WorkerService} has not {@link + * WorkerService#start}ed. + */ + PortablePipelineOptions updateDefaultEnvironmentConfig(PortablePipelineOptions options) { + options.setDefaultEnvironmentConfig(getApiServiceDescriptorUrl()); + return options; + } + + /** + * Overrides {@link StateListener#onStateChanged}, invoking {@link #stop} when {@link + * PipelineResult.State#isTerminal}. + */ + @Override + public void onStateChanged(PipelineResult.State state) { + if (state.isTerminal()) { + stop(); + } + } + + /** + * Stops the {@link ExternalWorkerService} {@link GrpcFnServer}'s {@link Server}. If not {@link + * WorkerService#isRunning()}, then calling stop is a noop. 
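WorkerService above hosts the in-process SDK harness (ExternalWorkerService) and rewrites the pipeline options so the environment points at it. A lifecycle sketch under the assumption that `options` is the job's PortablePipelineOptions:

    static PortablePipelineOptions startWorker(PortablePipelineOptions options) throws Exception {
      WorkerService workerService = new WorkerService(options);
      workerService.start();
      // Point the default environment config at the freshly started worker endpoint.
      return workerService.updateDefaultEnvironmentConfig(options);
    }
    // The service stops itself when onStateChanged observes a terminal state, or via stop().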
+ */ + void stop() { + if (server == null || server.getServer().isShutdown()) { + return; + } + LOG.info("Stopping worker service at {}", getApiServiceDescriptorUrl()); + try { + server.close(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java new file mode 100644 index 000000000000..ef4646f02347 --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; + +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.transforms.Impulse; +import org.apache.beam.sdk.util.construction.BeamUrns; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismArtifactResolver}. */ +@RunWith(JUnit4.class) +public class PrismArtifactResolverTest { + @Test + public void resolvesPipeline() { + Pipeline pipeline = Pipeline.create(); + pipeline.apply(Impulse.create()); + PrismArtifactResolver underTest = PrismArtifactResolver.of(pipeline); + RunnerApi.Pipeline pipelineProto = underTest.resolvePipelineProto(); + RunnerApi.Components components = pipelineProto.getComponents(); + assertThat(components.getTransformsMap()).containsKey("Impulse"); + assertThat(components.getCodersMap()).containsKey("ByteArrayCoder"); + assertThat(components.getEnvironmentsMap()) + .containsKey(BeamUrns.getUrn(RunnerApi.StandardEnvironments.Environments.DOCKER)); + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java new file mode 100644 index 000000000000..d3ac8a72eafb --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService.EMBEDDED_ARTIFACT_URN; +import static org.junit.Assert.assertThrows; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService; +import org.apache.beam.runners.fnexecution.artifact.ArtifactStagingService; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.commons.io.output.ByteArrayOutputStream; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismArtifactStager}. */ +@RunWith(JUnit4.class) +public class PrismArtifactStagerTest { + + @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + + final ArtifactStagingService stagingService = + new ArtifactStagingService(new TestDestinationProvider()); + + @Test + public void givenValidArtifacts_stages() + throws IOException, ExecutionException, InterruptedException { + PrismArtifactStager underTest = prismArtifactStager(validArtifacts()); + assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); + underTest.stage(); + assertThat(stagingService.getStagedArtifacts(underTest.getStagingSessionToken())).isNotEmpty(); + underTest.close(); + assertThat(underTest.getManagedChannel().isShutdown()).isTrue(); + } + + @Test + public void givenErrors_performsGracefulCleanup() throws IOException { + PrismArtifactStager underTest = prismArtifactStager(invalidArtifacts()); + assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); + ExecutionException error = assertThrows(ExecutionException.class, underTest::stage); + assertThat(error.getMessage()).contains("Unexpected artifact type: invalid-type-urn"); + assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); + underTest.close(); + assertThat(underTest.getManagedChannel().isShutdown()).isTrue(); + } + + private PrismArtifactStager prismArtifactStager( + Map> artifacts) throws IOException { + String serverName = InProcessServerBuilder.generateName(); + ArtifactRetrievalService retrievalService = new ArtifactRetrievalService(); + String stagingToken = "staging-token"; + stagingService.registerJob(stagingToken, artifacts); + + grpcCleanup.register( + InProcessServerBuilder.forName(serverName) + .directExecutor() + .addService(stagingService) + .addService(retrievalService) + .build() + .start()); + + 
ManagedChannel channel = + grpcCleanup.register(InProcessChannelBuilder.forName(serverName).build()); + + return PrismArtifactStager.builder() + .setStagingEndpoint("ignore") + .setStagingSessionToken(stagingToken) + .setManagedChannel(channel) + .build(); + } + + private Map<String, List<RunnerApi.ArtifactInformation>> validArtifacts() { + return ImmutableMap.of( + "env1", + Collections.singletonList( + RunnerApi.ArtifactInformation.newBuilder() + .setTypeUrn(EMBEDDED_ARTIFACT_URN) + .setTypePayload( + RunnerApi.EmbeddedFilePayload.newBuilder() + .setData(ByteString.copyFromUtf8("type-payload")) + .build() + .toByteString()) + .setRoleUrn("role-urn") + .build())); + } + + private Map<String, List<RunnerApi.ArtifactInformation>> invalidArtifacts() { + return ImmutableMap.of( + "env1", + Collections.singletonList( + RunnerApi.ArtifactInformation.newBuilder() + .setTypeUrn("invalid-type-urn") + .setTypePayload( + RunnerApi.EmbeddedFilePayload.newBuilder() + .setData(ByteString.copyFromUtf8("type-payload")) + .build() + .toByteString()) + .setRoleUrn("role-urn") + .build())); + } + + private static class TestDestinationProvider + implements ArtifactStagingService.ArtifactDestinationProvider { + + @Override + public ArtifactStagingService.ArtifactDestination getDestination( + String stagingToken, String name) throws IOException { + return ArtifactStagingService.ArtifactDestination.create( + EMBEDDED_ARTIFACT_URN, ByteString.EMPTY, new ByteArrayOutputStream()); + } + + @Override + public void removeStagedArtifacts(String stagingToken) throws IOException {} + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java new file mode 100644 index 000000000000..1e38e4f8d12e --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import java.io.IOException; +import java.util.Optional; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.transforms.Impulse; +import org.apache.beam.sdk.util.construction.PipelineTranslation; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; +import org.joda.time.Duration; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismJobManager}. */ +@RunWith(JUnit4.class) +public class PrismJobManagerTest { + @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + + @Rule public TestName testName = new TestName(); + + @Test + public void givenPrepareError_forwardsException_canGracefulShutdown() { + TestJobService service = + new TestJobService().withErrorResponse(new RuntimeException(testName.getMethodName())); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + assertThrows( + RuntimeException.class, + () -> + underTest.prepare( + JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineOf()).build())); + assertThat(underTest.isShutdown()).isFalse(); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenPrepareSuccess_forwardsResponse_canGracefulShutdown() { + TestJobService service = + new TestJobService() + .withPrepareJobResponse( + JobApi.PrepareJobResponse.newBuilder() + .setStagingSessionToken("token") + .setPreparationId("preparationId") + .setArtifactStagingEndpoint( + Endpoints.ApiServiceDescriptor.newBuilder() + .setUrl("localhost:1234") + .build()) + .build()); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + JobApi.PrepareJobResponse response = + underTest.prepare(JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineOf()).build()); + assertThat(underTest.isShutdown()).isFalse(); + assertThat(response.getStagingSessionToken()).isEqualTo("token"); + assertThat(response.getPreparationId()).isEqualTo("preparationId"); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenRunError_forwardsException_canGracefulShutdown() { + TestJobService service = + new TestJobService().withErrorResponse(new RuntimeException(testName.getMethodName())); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + assertThrows( + RuntimeException.class, + () -> + underTest.run(JobApi.RunJobRequest.newBuilder().setPreparationId("prepareId").build())); + assertThat(underTest.isShutdown()).isFalse(); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenRunSuccess_forwardsResponse_canGracefulShutdown() { + 
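+ // The stubbed job service below returns a canned RunJobResponse; the manager is expected
+ // to forward it unchanged and to keep its channel open until close() is called.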
TestJobService service = + new TestJobService() + .withRunJobResponse(JobApi.RunJobResponse.newBuilder().setJobId("jobId").build()); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + JobApi.RunJobResponse runJobResponse = + underTest.run(JobApi.RunJobRequest.newBuilder().setPreparationId("preparationId").build()); + assertThat(underTest.isShutdown()).isFalse(); + assertThat(runJobResponse.getJobId()).isEqualTo("jobId"); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenTerminalState_closes() { + PrismJobManager underTest = prismJobManager(new TestJobService()); + assertThat(underTest.isShutdown()).isFalse(); + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isShutdown()).isFalse(); + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isShutdown()).isFalse(); + underTest.onStateChanged(PipelineResult.State.CANCELLED); + assertThat(underTest.isShutdown()).isTrue(); + + underTest.close(); + } + + private PrismJobManager prismJobManager(TestJobService service) { + String serverName = InProcessServerBuilder.generateName(); + try { + grpcCleanup.register( + InProcessServerBuilder.forName(serverName) + .directExecutor() + .addService(service) + .build() + .start()); + } catch (IOException e) { + throw new RuntimeException(e); + } + + ManagedChannel channel = + grpcCleanup.register(InProcessChannelBuilder.forName(serverName).build()); + + return PrismJobManager.builder() + .setTimeout(Duration.millis(3000L)) + .setEndpoint("ignore") + .setManagedChannel(channel) + .build(); + } + + private static class TestJobService extends JobServiceGrpc.JobServiceImplBase { + + private Optional<JobApi.PrepareJobResponse> prepareJobResponse = Optional.empty(); + private Optional<JobApi.RunJobResponse> runJobResponse = Optional.empty(); + private Optional<RuntimeException> error = Optional.empty(); + + TestJobService withPrepareJobResponse(JobApi.PrepareJobResponse prepareJobResponse) { + this.prepareJobResponse = Optional.of(prepareJobResponse); + return this; + } + + TestJobService withRunJobResponse(JobApi.RunJobResponse runJobResponse) { + this.runJobResponse = Optional.of(runJobResponse); + return this; + } + + TestJobService withErrorResponse(RuntimeException error) { + this.error = Optional.of(error); + return this; + } + + @Override + public void prepare( + JobApi.PrepareJobRequest request, + StreamObserver<JobApi.PrepareJobResponse> responseObserver) { + if (prepareJobResponse.isPresent()) { + responseObserver.onNext(prepareJobResponse.get()); + responseObserver.onCompleted(); + } + if (error.isPresent()) { + responseObserver.onError(error.get()); + } + } + + @Override + public void run( + JobApi.RunJobRequest request, StreamObserver<JobApi.RunJobResponse> responseObserver) { + if (runJobResponse.isPresent()) { + responseObserver.onNext(runJobResponse.get()); + responseObserver.onCompleted(); + } + if (error.isPresent()) { + responseObserver.onError(error.get()); + } + } + } + + private static RunnerApi.Pipeline pipelineOf() { + Pipeline pipeline = Pipeline.create(); + pipeline.apply(Impulse.create()); + return PipelineTranslation.toProto(pipeline); + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismPipelineResultTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismPipelineResultTest.java new file mode 100644 index 000000000000..2ad7e2eb3dd9 --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismPipelineResultTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.runners.prism.PrismRunnerTest.getLocalPrismBuildOrIgnoreTest; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.metrics.MetricResults; +import org.joda.time.Duration; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismPipelineResult}. */ +@RunWith(JUnit4.class) +public class PrismPipelineResultTest { + + final PrismExecutor exec = executor(); + + @Before + public void setUp() throws IOException { + exec.execute(); + assertThat(exec.isAlive()).isTrue(); + } + + @After + public void tearDown() { + assertThat(exec.isAlive()).isFalse(); + } + + @Test + public void givenTerminated_reportsState() { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.waitUntilFinish()).thenReturn(PipelineResult.State.FAILED); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + // Assigns terminal state. 
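+ // waitUntilFinish() resolves the delegate's terminal state; the exec::stop cleanup passed
+ // above is expected to run as a result, which tearDown() verifies via exec.isAlive().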
+ underTest.waitUntilFinish(); + assertThat(underTest.getState()).isEqualTo(PipelineResult.State.FAILED); + } + + @Test + public void givenNotTerminated_reportsState() { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.getState()).thenReturn(PipelineResult.State.RUNNING); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + assertThat(underTest.getState()).isEqualTo(PipelineResult.State.RUNNING); + exec.stop(); + } + + @Test + public void cancelStopsExecutable_reportsTerminalState() throws IOException { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.cancel()).thenReturn(PipelineResult.State.CANCELLED); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + assertThat(underTest.cancel()).isEqualTo(PipelineResult.State.CANCELLED); + } + + @Test + public void givenTerminated_cancelIsNoop_reportsTerminalState() throws IOException { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.cancel()).thenReturn(PipelineResult.State.FAILED); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + assertThat(underTest.cancel()).isEqualTo(PipelineResult.State.FAILED); + } + + @Test + public void givenPipelineRunWithDuration_waitUntilFinish_reportsTerminalState() { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.waitUntilFinish(Duration.millis(3000L))) + .thenReturn(PipelineResult.State.CANCELLED); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + assertThat(underTest.waitUntilFinish(Duration.millis(3000L))) + .isEqualTo(PipelineResult.State.CANCELLED); + } + + @Test + public void givenTerminated_waitUntilFinishIsNoop_reportsTerminalState() { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.waitUntilFinish()).thenReturn(PipelineResult.State.DONE); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + // Terminate Job as setup for additional call. + underTest.waitUntilFinish(); + assertThat(underTest.waitUntilFinish()).isEqualTo(PipelineResult.State.DONE); + } + + @Test + public void givenNotTerminated_reportsMetrics() { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.metrics()).thenReturn(mock(MetricResults.class)); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + assertThat(underTest.metrics()).isNotNull(); + exec.stop(); + } + + @Test + public void givenTerminated_reportsTerminatedMetrics() { + PipelineResult delegate = mock(PipelineResult.class); + when(delegate.metrics()).thenReturn(mock(MetricResults.class)); + when(delegate.waitUntilFinish()).thenReturn(PipelineResult.State.DONE); + PrismPipelineResult underTest = new PrismPipelineResult(delegate, exec::stop); + // Terminate Job as setup for additional call. 
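+ // After the terminal state is reached, metrics() should still surface the delegate's
+ // MetricResults.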
+ underTest.waitUntilFinish(); + assertThat(underTest.metrics()).isNotNull(); + } + + private static PrismExecutor executor() { + return PrismExecutor.builder().setCommand(getLocalPrismBuildOrIgnoreTest()).build(); + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/StateWatcherTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/StateWatcherTest.java new file mode 100644 index 000000000000..cfc420046206 --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/StateWatcherTest.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Grpc; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.InsecureServerCredentials; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class StateWatcherTest { + + @Test + public void givenSingleListener_watches() { + Server server = serverOf(PipelineResult.State.RUNNING, PipelineResult.State.DONE); + TestStateListener listener = new TestStateListener(); + try (StateWatcher underTest = StateWatcher.insecure("0.0.0.0:" + server.getPort(), listener)) { + underTest.watch("job-001"); + assertThat(listener.states) + .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); + shutdown(server); + } + } + + @Test + public void givenMultipleListeners_watches() { + Server server = serverOf(PipelineResult.State.RUNNING, PipelineResult.State.DONE); + TestStateListener listenerA = new TestStateListener(); + TestStateListener listenerB = new TestStateListener(); + try (StateWatcher underTest = + StateWatcher.insecure("0.0.0.0:" + server.getPort(), listenerA, listenerB)) { + underTest.watch("job-001"); + assertThat(listenerA.states) + .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); + assertThat(listenerB.states) + .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); + shutdown(server); + } + } + + @Test + public void publishesOnlyChangedState() { + Server server = + serverOf( + PipelineResult.State.RUNNING, + PipelineResult.State.RUNNING, + PipelineResult.State.RUNNING, + PipelineResult.State.RUNNING, + PipelineResult.State.RUNNING, + 
PipelineResult.State.RUNNING, + PipelineResult.State.RUNNING, + PipelineResult.State.DONE); + TestStateListener listener = new TestStateListener(); + try (StateWatcher underTest = StateWatcher.insecure("0.0.0.0:" + server.getPort(), listener)) { + underTest.watch("job-001"); + assertThat(listener.states) + .containsExactly(PipelineResult.State.RUNNING, PipelineResult.State.DONE); + shutdown(server); + } + } + + private static class TestStateListener implements StateListener { + private final List<PipelineResult.State> states = new ArrayList<>(); + + @Override + public void onStateChanged(PipelineResult.State state) { + states.add(state); + } + } + + private static class TestJobServiceStateStream extends JobServiceGrpc.JobServiceImplBase { + private final List<PipelineResult.State> states; + + TestJobServiceStateStream(PipelineResult.State... states) { + this.states = Arrays.asList(states); + } + + @Override + public void getStateStream( + JobApi.GetJobStateRequest request, StreamObserver<JobApi.JobStateEvent> responseObserver) { + for (PipelineResult.State state : states) { + responseObserver.onNext( + JobApi.JobStateEvent.newBuilder() + .setState(JobApi.JobState.Enum.valueOf(state.name())) + .build()); + } + responseObserver.onCompleted(); + } + } + + private static Server serverOf(PipelineResult.State... states) { + try { + return Grpc.newServerBuilderForPort(0, InsecureServerCredentials.create()) + .addService(new TestJobServiceStateStream(states)) + .build() + .start(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void shutdown(Server server) { + server.shutdownNow(); + try { + server.awaitTermination(); + } catch (InterruptedException ignored) { + } + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java new file mode 100644 index 000000000000..7fc05d7747cd --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.PortablePipelineOptions; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link WorkerService}.
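+ *
+ * <p>As exercised below: start() binds the service to a dynamic localhost port,
+ * updateDefaultEnvironmentConfig(options) points the portable pipeline options at that
+ * address, and a terminal job state observed via onStateChanged stops the service.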
*/ +@RunWith(JUnit4.class) +public class WorkerServiceTest { + @Test + public void testStartStop() throws Exception { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + WorkerService underTest = new WorkerService(options); + underTest.start(); + assertThat(underTest.isRunning()).isTrue(); + assertThat(underTest.getApiServiceDescriptorUrl()).matches("localhost:\\d+"); + underTest.stop(); + assertThat(underTest.isRunning()).isFalse(); + } + + @Test + public void givenStarted_updateDefaultEnvironmentConfig() throws Exception { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + assertThat(options.getDefaultEnvironmentConfig()).isNull(); + WorkerService underTest = new WorkerService(options); + underTest.start(); + options = underTest.updateDefaultEnvironmentConfig(options); + assertThat(options.getDefaultEnvironmentConfig()) + .isEqualTo(underTest.getApiServiceDescriptorUrl()); + underTest.stop(); + } + + @Test + public void givenNotStarted_updateDefaultEnvironmentConfig_throws() { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + WorkerService underTest = new WorkerService(options); + assertThrows( + IllegalStateException.class, () -> underTest.updateDefaultEnvironmentConfig(options)); + } + + @Test + public void whenStateIsTerminal_thenStop() throws Exception { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + WorkerService underTest = new WorkerService(options); + assertThat(underTest.isRunning()).isFalse(); + underTest.start(); + assertThat(underTest.isRunning()).isTrue(); + + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isRunning()).isTrue(); + + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isRunning()).isTrue(); + + underTest.onStateChanged(PipelineResult.State.CANCELLED); + assertThat(underTest.isRunning()).isFalse(); + } +} diff --git a/runners/samza/job-server/build.gradle b/runners/samza/job-server/build.gradle index 4be206727121..f972f376e5c8 100644 --- a/runners/samza/job-server/build.gradle +++ b/runners/samza/job-server/build.gradle @@ -90,7 +90,6 @@ def portableValidatesRunnerTask(String name, boolean docker) { excludeCategories 'org.apache.beam.sdk.testing.UsesCustomWindowMerging' excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' - excludeCategories 'org.apache.beam.sdk.testing.UsesStringSetMetrics' excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' excludeCategories 'org.apache.beam.sdk.testing.UsesMultimapState' diff --git a/runners/spark/job-server/spark_job_server.gradle b/runners/spark/job-server/spark_job_server.gradle index bd00c8cf52ac..6d2d4b2bafbf 100644 --- a/runners/spark/job-server/spark_job_server.gradle +++ b/runners/spark/job-server/spark_job_server.gradle @@ -117,7 +117,6 @@ def portableValidatesRunnerTask(String name, boolean streaming, boolean docker, excludeCategories 'org.apache.beam.sdk.testing.UsesCustomWindowMerging' excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' - excludeCategories 'org.apache.beam.sdk.testing.UsesStringSetMetrics' excludeCategories 'org.apache.beam.sdk.testing.UsesPerKeyOrderedDelivery' excludeCategories 
'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' @@ -186,7 +185,6 @@ def portableValidatesRunnerTask(String name, boolean streaming, boolean docker, excludeCategories 'org.apache.beam.sdk.testing.UsesCustomWindowMerging' excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' - excludeCategories 'org.apache.beam.sdk.testing.UsesStringSetMetrics' excludeCategories 'org.apache.beam.sdk.testing.UsesPerKeyOrderedDelivery' excludeCategories 'org.apache.beam.sdk.testing.UsesPerKeyOrderInBundle' excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' diff --git a/runners/spark/spark_runner.gradle b/runners/spark/spark_runner.gradle index 8b6f1117019a..f4e6bf740189 100644 --- a/runners/spark/spark_runner.gradle +++ b/runners/spark/spark_runner.gradle @@ -290,6 +290,8 @@ def validatesRunnerBatch = tasks.register("validatesRunnerBatch", Test) { // TODO(https://github.com/apache/beam/issues/31231 it.filter { excludeTestsMatching 'org.apache.beam.sdk.transforms.RedistributeTest.testRedistributePreservesMetadata' + // TODO(https://github.com/apache/beam/issues/32021) + excludeTestsMatching 'org.apache.beam.sdk.metrics.MetricsTest$AttemptedMetricTests.testBoundedSourceMetricsInSplit' } } @@ -329,6 +331,8 @@ def validatesRunnerStreaming = tasks.register("validatesRunnerStreaming", Test) excludeTestsMatching 'org.apache.beam.sdk.transforms.ReshuffleTest.testReshufflePreservesMetadata' // TODO(https://github.com/apache/beam/issues/31231 excludeTestsMatching 'org.apache.beam.sdk.transforms.RedistributeTest.testRedistributePreservesMetadata' + // TODO(https://github.com/apache/beam/issues/32021) + excludeTestsMatching 'org.apache.beam.sdk.metrics.MetricsTest$AttemptedMetricTests.testBoundedSourceMetricsInSplit' } // TestStream using processing time is not supported in Spark @@ -428,6 +432,8 @@ tasks.register("validatesStructuredStreamingRunnerBatch", Test) { excludeTestsMatching 'org.apache.beam.sdk.transforms.SplittableDoFnTest.testLifecycleMethodsBounded' // https://github.com/apache/beam/issues/29972 excludeTestsMatching 'org.apache.beam.sdk.transforms.CombineTest$CombineWithContextTests.testHotKeyCombineWithSideInputs' + // TODO(https://github.com/apache/beam/issues/32021) + excludeTestsMatching 'org.apache.beam.sdk.metrics.MetricsTest$AttemptedMetricTests.testBoundedSourceMetricsInSplit' } } diff --git a/scripts/ci/pr-bot/package-lock.json b/scripts/ci/pr-bot/package-lock.json index 336a8d45677d..7cb764a43795 100644 --- a/scripts/ci/pr-bot/package-lock.json +++ b/scripts/ci/pr-bot/package-lock.json @@ -273,12 +273,12 @@ } }, "node_modules/braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dev": true, "dependencies": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" }, "engines": { "node": ">=8" @@ -469,9 +469,9 @@ } }, "node_modules/fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": 
"sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dev": true, "dependencies": { "to-regex-range": "^5.0.1" @@ -1421,12 +1421,12 @@ } }, "braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dev": true, "requires": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" } }, "browser-stdout": { @@ -1563,9 +1563,9 @@ "dev": true }, "fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dev": true, "requires": { "to-regex-range": "^5.0.1" diff --git a/sdks/go.mod b/sdks/go.mod index 612292c472a3..624cc0ab1ce8 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -20,48 +20,47 @@ // directory. module github.com/apache/beam/sdks/v2 -go 1.20 +go 1.21 require ( - cloud.google.com/go/bigquery v1.61.0 - cloud.google.com/go/bigtable v1.25.0 + cloud.google.com/go/bigquery v1.62.0 + cloud.google.com/go/bigtable v1.28.0 cloud.google.com/go/datastore v1.17.1 - cloud.google.com/go/profiler v0.4.0 - cloud.google.com/go/pubsub v1.39.0 - cloud.google.com/go/spanner v1.63.0 + cloud.google.com/go/profiler v0.4.1 + cloud.google.com/go/pubsub v1.40.0 + cloud.google.com/go/spanner v1.66.0 cloud.google.com/go/storage v1.43.0 - github.com/aws/aws-sdk-go-v2 v1.30.0 - github.com/aws/aws-sdk-go-v2/config v1.27.4 - github.com/aws/aws-sdk-go-v2/credentials v1.17.18 - github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 - github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 - github.com/aws/smithy-go v1.20.2 + github.com/aws/aws-sdk-go-v2 v1.30.3 + github.com/aws/aws-sdk-go-v2/config v1.27.27 + github.com/aws/aws-sdk-go-v2/credentials v1.17.27 + github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 + github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 + github.com/aws/smithy-go v1.20.3 github.com/docker/go-connections v0.5.0 github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.8.1 - github.com/golang/protobuf v1.5.4 // TODO(danoliveira): Fully replace this with google.golang.org/protobuf github.com/google/go-cmp v0.6.0 github.com/google/uuid v1.6.0 github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 github.com/lib/pq v1.10.9 github.com/linkedin/goavro/v2 v2.13.0 - github.com/nats-io/nats-server/v2 v2.10.12 - github.com/nats-io/nats.go v1.33.1 - github.com/proullon/ramsql v0.1.3 + github.com/nats-io/nats-server/v2 v2.10.18 + github.com/nats-io/nats.go v1.36.0 + 
github.com/proullon/ramsql v0.1.4 github.com/spf13/cobra v1.8.1 - github.com/testcontainers/testcontainers-go v0.26.0 + github.com/testcontainers/testcontainers-go v0.32.0 github.com/tetratelabs/wazero v1.7.3 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c - go.mongodb.org/mongo-driver v1.13.1 - golang.org/x/net v0.26.0 - golang.org/x/oauth2 v0.21.0 - golang.org/x/sync v0.7.0 - golang.org/x/sys v0.21.0 - golang.org/x/text v0.16.0 - google.golang.org/api v0.187.0 - google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d - google.golang.org/grpc v1.64.1 + go.mongodb.org/mongo-driver v1.16.1 + golang.org/x/net v0.28.0 + golang.org/x/oauth2 v0.22.0 + golang.org/x/sync v0.8.0 + golang.org/x/sys v0.23.0 + golang.org/x/text v0.17.0 + google.golang.org/api v0.189.0 + google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f + google.golang.org/grpc v1.65.0 google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 @@ -69,31 +68,36 @@ require ( require ( github.com/avast/retry-go/v4 v4.6.0 - github.com/fsouza/fake-gcs-server v1.47.7 + github.com/fsouza/fake-gcs-server v1.49.2 golang.org/x/exp v0.0.0-20231006140011-7918f672742d ) require ( - cloud.google.com/go/auth v0.6.1 // indirect - cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect + cel.dev/expr v0.15.0 // indirect + cloud.google.com/go/auth v0.7.2 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect + cloud.google.com/go/monitoring v1.20.2 // indirect dario.cat/mergo v1.0.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 // indirect - github.com/Microsoft/hcsshim v0.11.4 // indirect + github.com/Microsoft/hcsshim v0.11.5 // indirect github.com/apache/arrow/go/v15 v15.0.2 // indirect + github.com/containerd/errdefs v0.1.0 // indirect github.com/containerd/log v0.1.0 // indirect - github.com/distribution/reference v0.5.0 // indirect - github.com/go-logr/logr v1.4.1 // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect + github.com/golang/protobuf v1.5.4 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect - github.com/minio/highwayhash v1.0.2 // indirect + github.com/minio/highwayhash v1.0.3 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect github.com/moby/sys/user v0.1.0 // indirect - github.com/nats-io/jwt/v2 v2.5.5 // indirect + github.com/nats-io/jwt/v2 v2.5.8 // indirect github.com/nats-io/nkeys v0.4.7 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/shirou/gopsutil/v3 v3.23.9 // indirect + github.com/shirou/gopsutil/v3 v3.23.12 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect @@ -104,40 +108,41 @@ require ( go.opentelemetry.io/otel v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/sdk v1.24.0 // indirect + 
go.opentelemetry.io/otel/sdk/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect golang.org/x/time v0.5.0 // indirect ) require ( cloud.google.com/go v0.115.0 // indirect - cloud.google.com/go/compute/metadata v0.3.0 // indirect - cloud.google.com/go/iam v1.1.8 // indirect - cloud.google.com/go/longrunning v0.5.7 // indirect + cloud.google.com/go/compute/metadata v0.5.0 // indirect + cloud.google.com/go/iam v1.1.11 // indirect + cloud.google.com/go/longrunning v0.5.10 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect - github.com/Microsoft/go-winio v0.6.1 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect github.com/apache/thrift v0.17.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.5 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.9 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.9 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.11 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.20.11 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.24.5 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.28.12 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 // indirect - github.com/containerd/containerd v1.7.11 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b // indirect + github.com/containerd/containerd v1.7.18 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect - github.com/docker/docker 
v25.0.5+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v27.1.1+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect github.com/envoyproxy/go-control-plane v0.12.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect @@ -147,25 +152,24 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v23.5.26+incompatible // indirect - github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 // indirect + github.com/google/pprof v0.0.0-20240528025155-186aa0362fba // indirect github.com/google/renameio/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect - github.com/googleapis/gax-go/v2 v2.12.5 // indirect + github.com/googleapis/gax-go/v2 v2.13.0 // indirect github.com/gorilla/handlers v1.5.2 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jmespath/go-jmespath v0.4.0 // indirect - github.com/klauspost/compress v1.17.7 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/klauspost/cpuid/v2 v2.2.6 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/moby/patternmatcher v0.6.0 // indirect github.com/moby/sys/sequential v0.5.0 // indirect github.com/moby/term v0.5.0 // indirect - github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect + github.com/montanaflynn/stats v0.7.1 // indirect github.com/morikuni/aec v1.0.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.0-rc5 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.18 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pkg/xattr v0.4.9 // indirect @@ -179,10 +183,10 @@ require ( github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect github.com/zeebo/xxh3 v1.0.2 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.24.0 // indirect - golang.org/x/mod v0.17.0 // indirect - golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect + golang.org/x/crypto v0.26.0 // indirect + golang.org/x/mod v0.18.0 // indirect + golang.org/x/tools v0.22.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index 39d23d40df06..67686da8e408 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1,3 +1,5 @@ +cel.dev/expr v0.15.0 h1:O1jzfJCQBfL5BFoYktaxwIhuttaQPsVWerH9/EEKx0w= +cel.dev/expr v0.15.0/go.mod h1:TRSuuV7DlVCE/uwv5QbAiW/v8l5O8C4eEPHeu7gf7Sg= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -99,10 +101,10 @@ cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVo cloud.google.com/go/assuredworkloads v1.8.0/go.mod h1:AsX2cqyNCOvEQC8RMPnoc0yEarXQk6WEKkxYfL6kGIo= cloud.google.com/go/assuredworkloads v1.9.0/go.mod h1:kFuI1P78bplYtT77Tb1hi0FMxM0vVpRC7VVoJC3ZoT0= cloud.google.com/go/assuredworkloads v1.10.0/go.mod h1:kwdUQuXcedVdsIaKgKTp9t0UJkE5+PAVNhdQm4ZVq2E= -cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38= -cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4= -cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= -cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= +cloud.google.com/go/auth v0.7.2 h1:uiha352VrCDMXg+yoBtaD0tUF4Kv9vrtrWPYXwutnDE= +cloud.google.com/go/auth v0.7.2/go.mod h1:VEc4p5NNxycWQTMQEDQF0bd6aTMb6VgYDXEwiJJQAbs= +cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= +cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= cloud.google.com/go/automl v1.6.0/go.mod h1:ugf8a6Fx+zP0D59WLhqgTDsQI9w07o64uf/Is3Nh5p8= cloud.google.com/go/automl v1.7.0/go.mod h1:RL9MYCCsJEOmt0Wf3z9uzG0a7adTT1fe+aObgSpkCt8= @@ -131,10 +133,10 @@ cloud.google.com/go/bigquery v1.47.0/go.mod h1:sA9XOgy0A8vQK9+MWhEQTY6Tix87M/Zur cloud.google.com/go/bigquery v1.48.0/go.mod h1:QAwSz+ipNgfL5jxiaK7weyOhzdoAy1zFm0Nf1fysJac= cloud.google.com/go/bigquery v1.49.0/go.mod h1:Sv8hMmTFFYBlt/ftw2uN6dFdQPzBlREY9yBh7Oy7/4Q= cloud.google.com/go/bigquery v1.50.0/go.mod h1:YrleYEh2pSEbgTBZYMJ5SuSr0ML3ypjRB1zgf7pvQLU= -cloud.google.com/go/bigquery v1.61.0 h1:w2Goy9n6gh91LVi6B2Sc+HpBl8WbWhIyzdvVvrAuEIw= -cloud.google.com/go/bigquery v1.61.0/go.mod h1:PjZUje0IocbuTOdq4DBOJLNYB0WF3pAKBHzAYyxCwFo= -cloud.google.com/go/bigtable v1.25.0 h1:P3J0qFd2BUpvnamJOaTW9KkgqAiUXsFtFAW33sxj/hU= -cloud.google.com/go/bigtable v1.25.0/go.mod h1:NOwb5o8cw2LCEMP8SthXGxpZAjbQXc4Gb7V6A3TvsJc= +cloud.google.com/go/bigquery v1.62.0 h1:SYEA2f7fKqbSRRBHb7g0iHTtZvtPSPYdXfmqsjpsBwo= +cloud.google.com/go/bigquery v1.62.0/go.mod h1:5ee+ZkF1x/ntgCsFQJAQTM3QkAZOecfCmvxhkJsWRSA= +cloud.google.com/go/bigtable v1.28.0 h1:c0wc/wy+9Chj8BooqW/zgaeslXsA5YEYl84VBmvwp+4= +cloud.google.com/go/bigtable v1.28.0/go.mod h1:avmXcmxVbLJAo9moICRYMgDyTTPoV0MA0lHKnyqV4fQ= cloud.google.com/go/billing v1.4.0/go.mod h1:g9IdKBEFlItS8bTtlrZdVLWSSdSyFUZKXNS02zKMOZY= cloud.google.com/go/billing v1.5.0/go.mod h1:mztb1tBc3QekhjSgmpf/CV4LzWXLzCArwpLmP2Gm88s= cloud.google.com/go/billing v1.6.0/go.mod h1:WoXzguj+BeHXPbKfNWkqVtDdzORazmCjraY+vrxcyvI= @@ -186,8 +188,8 @@ cloud.google.com/go/compute/metadata v0.1.0/go.mod h1:Z1VN+bulIf6bt4P/C37K4DyZYZ cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= cloud.google.com/go/compute/metadata v0.2.1/go.mod h1:jgHgmJd2RKBGzXqF5LR2EZMGxBkeanZ9wwa75XHJgOM= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= -cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= -cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= +cloud.google.com/go/compute/metadata 
v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= +cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= cloud.google.com/go/contactcenterinsights v1.3.0/go.mod h1:Eu2oemoePuEFc/xKFPjbTuPSj0fYJcPls9TFlPNnHHY= cloud.google.com/go/contactcenterinsights v1.4.0/go.mod h1:L2YzkGbPsv+vMQMCADxJoT9YiTTnSEd6fEvCeHTYVck= cloud.google.com/go/contactcenterinsights v1.6.0/go.mod h1:IIDlT6CLcDoyv79kDv8iWxMSTZhLxSCofVV5W6YFM/w= @@ -208,7 +210,8 @@ cloud.google.com/go/datacatalog v1.8.0/go.mod h1:KYuoVOv9BM8EYz/4eMFxrr4DUKhGIOX cloud.google.com/go/datacatalog v1.8.1/go.mod h1:RJ58z4rMp3gvETA465Vg+ag8BGgBdnRPEMMSTr5Uv+M= cloud.google.com/go/datacatalog v1.12.0/go.mod h1:CWae8rFkfp6LzLumKOnmVh4+Zle4A3NXLzVJ1d1mRm0= cloud.google.com/go/datacatalog v1.13.0/go.mod h1:E4Rj9a5ZtAxcQJlEBTLgMTphfP11/lNaAshpoBgemX8= -cloud.google.com/go/datacatalog v1.20.1 h1:czcba5mxwRM5V//jSadyig0y+8aOHmN7gUl9GbHu59E= +cloud.google.com/go/datacatalog v1.20.4 h1:nUR7JBPZezl1+o+86N01VxAQQHY+It/D8tmNipcdVjI= +cloud.google.com/go/datacatalog v1.20.4/go.mod h1:71PDwywIYkNgSXdUU3H0mkTp3j15aahfYJ1CY3DogtU= cloud.google.com/go/dataflow v0.6.0/go.mod h1:9QwV89cGoxjjSR9/r7eFDqqjtvbKxAK2BaYU6PVk9UM= cloud.google.com/go/dataflow v0.7.0/go.mod h1:PX526vb4ijFMesO1o202EaUmouZKBpjHsTlCtB4parQ= cloud.google.com/go/dataflow v0.8.0/go.mod h1:Rcf5YgTKPtQyYz8bLYhFoIV/vP39eL7fWNcSOyFfLJE= @@ -324,8 +327,8 @@ cloud.google.com/go/iam v0.8.0/go.mod h1:lga0/y3iH6CX7sYqypWJ33hf7kkfXJag67naqGE cloud.google.com/go/iam v0.11.0/go.mod h1:9PiLDanza5D+oWFZiH1uG+RnRCfEGKoyl6yo4cgWZGY= cloud.google.com/go/iam v0.12.0/go.mod h1:knyHGviacl11zrtZUoDuYpDgLjvr28sLQaG0YB2GYAY= cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0= -cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0= -cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE= +cloud.google.com/go/iam v1.1.11 h1:0mQ8UKSfdHLut6pH9FM3bI55KWR46ketn0PuXleDyxw= +cloud.google.com/go/iam v1.1.11/go.mod h1:biXoiLWYIKntto2joP+62sd9uW5EpkZmKIvfNcTWlnQ= cloud.google.com/go/iap v1.4.0/go.mod h1:RGFwRJdihTINIe4wZ2iCP0zF/qu18ZwyKxrhMhygBEc= cloud.google.com/go/iap v1.5.0/go.mod h1:UH/CGgKd4KyohZL5Pt0jSKE4m3FR51qg6FKQ/z/Ix9A= cloud.google.com/go/iap v1.6.0/go.mod h1:NSuvI9C/j7UdjGjIde7t7HBz+QTwBcapPE07+sSRcLk= @@ -345,7 +348,8 @@ cloud.google.com/go/kms v1.8.0/go.mod h1:4xFEhYFqvW+4VMELtZyxomGSYtSQKzM178ylFW4 cloud.google.com/go/kms v1.9.0/go.mod h1:qb1tPTgfF9RQP8e1wq4cLFErVuTJv7UsSC915J8dh3w= cloud.google.com/go/kms v1.10.0/go.mod h1:ng3KTUtQQU9bPX3+QGLsflZIHlkbn8amFAMY63m8d24= cloud.google.com/go/kms v1.10.1/go.mod h1:rIWk/TryCkR59GMC3YtHtXeLzd634lBbKenvyySAyYI= -cloud.google.com/go/kms v1.18.0 h1:pqNdaVmZJFP+i8OVLocjfpdTWETTYa20FWOegSCdrRo= +cloud.google.com/go/kms v1.18.3 h1:8+Z2S4bQDSCdghB5ZA5dVDDJTLmnkRlowtFiXqMFd74= +cloud.google.com/go/kms v1.18.3/go.mod h1:y/Lcf6fyhbdn7MrG1VaDqXxM8rhOBc5rWcWAhcvZjQU= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/language v1.7.0/go.mod h1:DJ6dYN/W+SQOjF8e1hLQXMF21AkH2w9wiPzPCJa2MIE= @@ -359,8 +363,8 @@ cloud.google.com/go/logging v1.7.0/go.mod h1:3xjP2CjkM3ZkO73aj4ASA5wRPGGCRrPIAeN cloud.google.com/go/longrunning v0.1.1/go.mod h1:UUFxuDWkv22EuY93jjmDMFT5GPQKeFVJBIF6QlTqdsE= cloud.google.com/go/longrunning v0.3.0/go.mod h1:qth9Y41RRSUE69rDcOn6DdK3HfQfsUI0YSmW3iIlLJc= 
cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo= -cloud.google.com/go/longrunning v0.5.7 h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU= -cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng= +cloud.google.com/go/longrunning v0.5.10 h1:eB/BniENNRKhjz/xgiillrdcH3G74TGSl3BXinGlI7E= +cloud.google.com/go/longrunning v0.5.10/go.mod h1:tljz5guTr5oc/qhlUjBlk7UAIFMOGuPNxkNDZXlLics= cloud.google.com/go/managedidentities v1.3.0/go.mod h1:UzlW3cBOiPrzucO5qWkNkh0w33KFtBJU281hacNvsdE= cloud.google.com/go/managedidentities v1.4.0/go.mod h1:NWSBYbEMgqmbZsLIyKvxrYbtqOsxY1ZrGM+9RgDqInM= cloud.google.com/go/managedidentities v1.5.0/go.mod h1:+dWcZ0JlUmpuxpIDfyP5pP5y0bLdRwOS4Lp7gMni/LA= @@ -384,6 +388,8 @@ cloud.google.com/go/monitoring v1.7.0/go.mod h1:HpYse6kkGo//7p6sT0wsIC6IBDET0RhI cloud.google.com/go/monitoring v1.8.0/go.mod h1:E7PtoMJ1kQXWxPjB6mv2fhC5/15jInuulFdYYtlcvT4= cloud.google.com/go/monitoring v1.12.0/go.mod h1:yx8Jj2fZNEkL/GYZyTLS4ZtZEZN8WtDEiEqG4kLK50w= cloud.google.com/go/monitoring v1.13.0/go.mod h1:k2yMBAB1H9JT/QETjNkgdCGD9bPF712XiLTVr+cBrpw= +cloud.google.com/go/monitoring v1.20.2 h1:B/L+xrw9PYO7ywh37sgnjI/6dzEE+yQTAwfytDcpPto= +cloud.google.com/go/monitoring v1.20.2/go.mod h1:36rpg/7fdQ7NX5pG5x1FA7cXTVXusOp6Zg9r9e1+oek= cloud.google.com/go/networkconnectivity v1.4.0/go.mod h1:nOl7YL8odKyAOtzNX73/M5/mGZgqqMeryi6UPZTk/rA= cloud.google.com/go/networkconnectivity v1.5.0/go.mod h1:3GzqJx7uhtlM3kln0+x5wyFvuVH1pIBJjhCpjzSt75o= cloud.google.com/go/networkconnectivity v1.6.0/go.mod h1:OJOoEXW+0LAxHh89nXd64uGG+FbQoeH8DtxCHVOMlaM= @@ -433,8 +439,8 @@ cloud.google.com/go/privatecatalog v0.5.0/go.mod h1:XgosMUvvPyxDjAVNDYxJ7wBW8//h cloud.google.com/go/privatecatalog v0.6.0/go.mod h1:i/fbkZR0hLN29eEWiiwue8Pb+GforiEIBnV9yrRUOKI= cloud.google.com/go/privatecatalog v0.7.0/go.mod h1:2s5ssIFO69F5csTXcwBP7NPFTZvps26xGzvQ2PQaBYg= cloud.google.com/go/privatecatalog v0.8.0/go.mod h1:nQ6pfaegeDAq/Q5lrfCQzQLhubPiZhSaNhIgfJlnIXs= -cloud.google.com/go/profiler v0.4.0 h1:ZeRDZbsOBDyRG0OiK0Op1/XWZ3xeLwJc9zjkzczUxyY= -cloud.google.com/go/profiler v0.4.0/go.mod h1:RvPlm4dilIr3oJtAOeFQU9Lrt5RoySHSDj4pTd6TWeU= +cloud.google.com/go/profiler v0.4.1 h1:Q7+lOvikTGMJ/IAWocpYYGit4SIIoILmVZfEEWTORSY= +cloud.google.com/go/profiler v0.4.1/go.mod h1:LBrtEX6nbvhv1w/e5CPZmX9ajGG9BGLtGbv56Tg4SHs= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= @@ -443,8 +449,8 @@ cloud.google.com/go/pubsub v1.26.0/go.mod h1:QgBH3U/jdJy/ftjPhTkyXNj543Tin1pRYcd cloud.google.com/go/pubsub v1.27.1/go.mod h1:hQN39ymbV9geqBnfQq6Xf63yNhUAhv9CZhzp5O6qsW0= cloud.google.com/go/pubsub v1.28.0/go.mod h1:vuXFpwaVoIPQMGXqRyUQigu/AX1S3IWugR9xznmcXX8= cloud.google.com/go/pubsub v1.30.0/go.mod h1:qWi1OPS0B+b5L+Sg6Gmc9zD1Y+HaM0MdUr7LsupY1P4= -cloud.google.com/go/pubsub v1.39.0 h1:qt1+S6H+wwW8Q/YvDwM8lJnq+iIFgFEgaD/7h3lMsAI= -cloud.google.com/go/pubsub v1.39.0/go.mod h1:FrEnrSGU6L0Kh3iBaAbIUM8KMR7LqyEkMboVxGXCT+s= +cloud.google.com/go/pubsub v1.40.0 h1:0LdP+zj5XaPAGtWr2V6r88VXJlmtaB/+fde1q3TU8M0= +cloud.google.com/go/pubsub v1.40.0/go.mod h1:BVJI4sI2FyXp36KFKvFwcfDRDfR8MiLT8mMhmIhdAeA= cloud.google.com/go/pubsublite v1.5.0/go.mod h1:xapqNQ1CuLfGi23Yda/9l4bBCKz/wC3KIJ5gKcxveZg= cloud.google.com/go/pubsublite v1.6.0/go.mod h1:1eFCS0U11xlOuMFV/0iBqw3zP12kddMeCbj/F3FSj9k= 
cloud.google.com/go/pubsublite v1.7.0/go.mod h1:8hVMwRXfDfvGm3fahVbtDbiLePT3gpoiJYJY+vxWxVM= @@ -534,8 +540,8 @@ cloud.google.com/go/shell v1.6.0/go.mod h1:oHO8QACS90luWgxP3N9iZVuEiSF84zNyLytb+ cloud.google.com/go/spanner v1.41.0/go.mod h1:MLYDBJR/dY4Wt7ZaMIQ7rXOTLjYrmxLE/5ve9vFfWos= cloud.google.com/go/spanner v1.44.0/go.mod h1:G8XIgYdOK+Fbcpbs7p2fiprDw4CaZX63whnSMLVBxjk= cloud.google.com/go/spanner v1.45.0/go.mod h1:FIws5LowYz8YAE1J8fOS7DJup8ff7xJeetWEo5REA2M= -cloud.google.com/go/spanner v1.63.0 h1:P6+BY70Wtol4MtryBgnXZVTZfsdySEvWfz0EpyLwHi4= -cloud.google.com/go/spanner v1.63.0/go.mod h1:iqDx7urZpgD7RekZ+CFvBRH6kVTW1ZSEb2HMDKOp5Cc= +cloud.google.com/go/spanner v1.66.0 h1:PF1upR8n+DVUO9mUpCc1j5kyHn1Xfq0A53ZrnM0AmeU= +cloud.google.com/go/spanner v1.66.0/go.mod h1:gu+weqqrnoBsVlxOmMG5pzDZ2nkpqqJx4MsnmIacH5w= cloud.google.com/go/speech v1.6.0/go.mod h1:79tcr4FHCimOp56lwC01xnt/WPJZc4v3gzyT7FoBkCM= cloud.google.com/go/speech v1.7.0/go.mod h1:KptqL+BAQIhMsj1kOP2la5DSEEerPDuOP/2mmkhHhZQ= cloud.google.com/go/speech v1.8.0/go.mod h1:9bYIl1/tjsAnMgKGHKmBZzXKEkGgtU+MpdDPTE9f7y0= @@ -628,6 +634,7 @@ filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4 gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod h1:RSH6KIUZ0p2xy5zHDxgAM4zumjgTw83q2ge/PI+yyw8= git.sr.ht/~sbinet/gg v0.3.1/go.mod h1:KGYtlADtqsqANL9ueOFkWymvzUvLMQllU5Ixo+8v3pc= github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/azure-pipeline-go v0.2.3/go.mod h1:x841ezTBIMG6O3lAcl8ATHnsOPVl2bqk7S3ta6S6u4k= github.com/Azure/azure-storage-blob-go v0.14.0/go.mod h1:SMqIBi+SuiQH32bvyjngEewEeXoPfKMgWlBDaYf6fck= github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= @@ -643,10 +650,10 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 h1:oVLqHXhnYtUwM89y9T1fXGaK9wTkXHgNp8/ZNMQzUxE= github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0/go.mod h1:dppbR7CwXD4pgtV9t3wD1812RaLDcBjtblcDF5f1vI0= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= -github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= -github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= -github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= -github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/Microsoft/hcsshim v0.11.5 h1:haEcLNpj9Ka1gd3B3tAEs9CpE0c+1IhoL59w/exYU38= +github.com/Microsoft/hcsshim v0.11.5/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY= github.com/ajstarks/deck/generate v0.0.0-20210309230005-c3f852c02e19/go.mod 
h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk= @@ -672,56 +679,56 @@ github.com/aws/aws-sdk-go v1.30.19/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.34.0 h1:brux2dRrlwCF5JhTL7MUT3WUwo9zfDHZZp3+g3Mvlmo= github.com/aws/aws-sdk-go v1.34.0/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aws/aws-sdk-go-v2 v1.7.1/go.mod h1:L5LuPC1ZgDr2xQS7AmIec/Jlc7O/Y1u2KxJyNVab250= -github.com/aws/aws-sdk-go-v2 v1.30.0 h1:6qAwtzlfcTtcL8NHtbDQAqgM5s6NDipQTkPxyH/6kAA= -github.com/aws/aws-sdk-go-v2 v1.30.0/go.mod h1:ffIFB97e2yNsv4aTSGkqtHnppsIJzw7G7BReUZ3jCXM= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 h1:ZY3108YtBNq96jNZTICHxN1gSBSbnvIdYwwqnvCV4Mc= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1/go.mod h1:t8PYl/6LzdAqsU4/9tz28V/kU+asFePvpOMkdul0gEQ= +github.com/aws/aws-sdk-go-v2 v1.30.3 h1:jUeBtG0Ih+ZIFH0F4UkmL9w3cSpaMv9tYYDbzILP8dY= +github.com/aws/aws-sdk-go-v2 v1.30.3/go.mod h1:nIQjQVp5sfpQcTc9mPSr1B0PaWK5ByX9MOoDadSN4lc= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 h1:tW1/Rkad38LA15X4UQtjXZXNKsCgkshC3EbmcUmghTg= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3/go.mod h1:UbnqO+zjqk3uIt9yCACHJ9IVNhyhOCnYk8yA19SAWrM= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= -github.com/aws/aws-sdk-go-v2/config v1.27.4 h1:AhfWb5ZwimdsYTgP7Od8E9L1u4sKmDW2ZVeLcf2O42M= -github.com/aws/aws-sdk-go-v2/config v1.27.4/go.mod h1:zq2FFXK3A416kiukwpsd+rD4ny6JC7QSkp4QdN1Mp2g= +github.com/aws/aws-sdk-go-v2/config v1.27.27 h1:HdqgGt1OAP0HkEDDShEl0oSYa9ZZBSOmKpdpsDMdO90= +github.com/aws/aws-sdk-go-v2/config v1.27.27/go.mod h1:MVYamCg76dFNINkZFu4n4RjDixhVr51HLj4ErWzrVwg= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= -github.com/aws/aws-sdk-go-v2/credentials v1.17.18 h1:D/ALDWqK4JdY3OFgA2thcPO1c9aYTT5STS/CvnkqY1c= -github.com/aws/aws-sdk-go-v2/credentials v1.17.18/go.mod h1:JuitCWq+F5QGUrmMPsk945rop6bB57jdscu+Glozdnc= +github.com/aws/aws-sdk-go-v2/credentials v1.17.27 h1:2raNba6gr2IfA0eqqiP2XiQ0UVOpGPgDSi0I9iAP+UI= +github.com/aws/aws-sdk-go-v2/credentials v1.17.27/go.mod h1:gniiwbGahQByxan6YjQUMcW4Aov6bLC3m+evgcoN4r4= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDucWfA2zqQCYCOMCDHiCOciALyNw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.5 h1:dDgptDO9dxeFkXy+tEgVkzSClHZje/6JkPW5aZyEvrQ= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.5/go.mod h1:gjvE2KBUgUQhcv89jqxrIxH9GaKs1JbZzWejj/DaHGA= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 h1:KreluoV8FZDEtI6Co2xuNk/UqI9iwMrOx/87PBNIKqw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11/go.mod h1:SeSUYBLsMYFoRvHE0Tjvn7kbxaUhl75CJi1sbfhMxkU= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 h1:wuOjvalpd2CnXffks74Vq6n3yv9vunKCoy4R1sjStGk= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8/go.mod h1:vywwjy6VnrR48Izg136JoSUXC4mH9QeUi3g0EH9DSrA= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.9 h1:cy8ahBJuhtM8GTTSyOkfy6WVPV1IE+SS5/wfXUYuulw= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.9/go.mod 
h1:CZBXGLaJnEZI6EVNcPd7a6B5IC5cA/GkRWtu9fp3S6Y= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.9 h1:A4SYk07ef04+vxZToz9LWvAXl9LW0NClpPpMsi31cz0= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.9/go.mod h1:5jJcHuwDagxN+ErjQ3PU3ocf6Ylc/p9x+BLO/+X4iXw= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 h1:zeN9UtUlA6FTx0vFSayxSX32HDw73Yb6Hh2izDSFxXY= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10/go.mod h1:3HKuexPDcwLWPaqpW2UR/9n8N/u/3CKcGAzSs8p8u8g= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 h1:SoNJ4RlFEQEbtDcCEt+QG56MY4fm4W8rYirAmq+/DdU= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15/go.mod h1:U9ke74k1n2bf+RIgoX1SXFed1HLs51OgUSs+Ph0KJP8= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 h1:C6WHdGnTDIYETAm5iErQUiVNsclNx9qbJVPIt03B6bI= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15/go.mod h1:ZQLZqhcu+JhSrA9/NXRm8SkDvsycE+JkV3WGY41e+IM= github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7jMrYJVDWI+f+VxU= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 h1:lMwCXiWJlrtZot0NJTjbC8G9zl+V3i68gBTBBvDeEXA= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3/go.mod h1:5yzAuE9i2RkVAttBl8yxZgQr5OCq4D5yDnG7j9x2L0U= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15 h1:Z5r7SycxmSllHYmaAZPpmN8GviDrSGhMS6bldqtXZPw= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15/go.mod h1:CetW7bDE00QoGEmPUoZuRog07SGVAUVW6LFpNP0YfIg= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2 h1:Ji0DY1xUsUr3I8cHps0G+XM3WWU16lP6yG8qu1GAZAs= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2/go.mod h1:5CsjAbs3NlGQyZNFACh+zztPDI7fU6eW9QsxjfnuBKg= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 h1:xbwRyCy7kXrOj89iIKLB6NfE2WCpP9HoKyk8dMDvnIQ= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3/go.mod h1:R+/S1O4TYpcktbVwddeOYg+uwUfLhADP2S/x4QwsCTM= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 h1:dT3MqvGhSoaIhRseqw2I0yH81l7wiR2vjs57O51EAm8= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3/go.mod h1:GlAeCkHwugxdHaueRr4nhPuY+WW+gR8UjlcqzPr1SPI= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17 h1:YPYe6ZmvUfDDDELqEKtAd6bo8zxhkm+XEFEzQisqUIE= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17/go.mod h1:oBtcnYua/CgzCWYN7NZ5j7PotFDaFSUjCYVTtfyn7vw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.11 h1:o4T+fKxA3gTMcluBNZZXE9DNaMkJuUL1O3mffCUjoJo= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.11/go.mod h1:84oZdJ+VjuJKs9v1UTC9NaodRZRseOXCTgku+vQJWR8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 h1:HGErhhrxZlQ044RiM+WdoZxp0p+EGM62y3L6pwA4olE= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17/go.mod 
h1:RkZEx4l0EHYDJpWppMJ3nD9wZJAa8/0lq9aVC+r2UII= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 h1:KV0z2RDc7euMtg8aUT1czv5p29zcLlXALNFsd3jkkEc= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3/go.mod h1:KZgs2ny8HsxRIRbDwgvJcHHBZPOzQr/+NtGwnP+w2ec= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 h1:246A4lSTXWJw/rmlQI+TT2OcqeDMKBdyjEQrafMaQdA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15/go.mod h1:haVfg3761/WF7YPuJOER2MP0k4UAXyHaLclKXB6usDg= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 h1:NnduxUd9+Fq9DcCDdJK8v6l9lR1xDX4usvog+JuQAno= -github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2/go.mod h1:NXRKkiRF+erX2hnybnVU660cYT5/KChRD4iUgJ97cI8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 h1:hT8ZAZRIfqBqHbzKTII+CIiY8G2oC9OpLedkZ51DWl8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3/go.mod h1:Lcxzg5rojyVPU/0eFwLtcyTaek/6Mtic5B1gJo7e/zE= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= -github.com/aws/aws-sdk-go-v2/service/sso v1.20.11 h1:gEYM2GSpr4YNWc6hCd5nod4+d4kd9vWIAWrmGuLdlMw= -github.com/aws/aws-sdk-go-v2/service/sso v1.20.11/go.mod h1:gVvwPdPNYehHSP9Rs7q27U1EU+3Or2ZpXvzAYJNh63w= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.24.5 h1:iXjh3uaH3vsVcnyZX7MqCoCfcyxIrVE9iOQruRaWPrQ= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.24.5/go.mod h1:5ZXesEuy/QcO0WUnt+4sDkxhdXRHTu2yG0uCSH8B6os= +github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 h1:BXx0ZIxvrJdSgSvKTZ+yRBeSqqgPM89VPlulEcl37tM= +github.com/aws/aws-sdk-go-v2/service/sso v1.22.4/go.mod h1:ooyCOXjvJEsUw7x+ZDHeISPMhtwI3ZCB7ggFMcFfWLU= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 h1:yiwVzJW2ZxZTurVbYWA7QOrAaCYQR72t0wrSBfoesUE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4/go.mod h1:0oxfLkpz3rQ/CHlx5hB7H69YUpFiI1tql6Q6Ne+1bCw= github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg= -github.com/aws/aws-sdk-go-v2/service/sts v1.28.12 h1:M/1u4HBpwLuMtjlxuI2y6HoVLzF5e2mfxHCg7ZVMYmk= -github.com/aws/aws-sdk-go-v2/service/sts v1.28.12/go.mod h1:kcfd+eTdEi/40FIbLq4Hif3XMXnl5b/+t/KTfLt9xIk= +github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 h1:ZsDKRLXGWHk8WdtyYMoGNO7bTudrvuKpDKgMVRlepGE= +github.com/aws/aws-sdk-go-v2/service/sts v1.30.3/go.mod h1:zwySh8fpFyXp9yOr/KVzxOl8SRqgf/IDw5aUt9UKFcQ= github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= -github.com/aws/smithy-go v1.20.2 h1:tbp628ireGtzcHDDmLT/6ADHidqnwgF57XOXZe6tp4Q= -github.com/aws/smithy-go v1.20.2/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E= +github.com/aws/smithy-go v1.20.3 h1:ryHwveWzPV5BIof6fyDvor6V3iUL7nTfiTKXHiW05nE= +github.com/aws/smithy-go v1.20.3/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= @@ -732,8 +739,9 @@ 
github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMr github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= @@ -751,11 +759,13 @@ github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20220314180256-7f1daf1720fc/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 h1:DBmgJDC9dTfkVyGgipamEh2BpGYxScCH1TOF1LL1cXc= -github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50/go.mod h1:5e1+Vvlzido69INQaVO6d87Qn543Xr6nooe9Kz7oBFM= +github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b h1:ga8SEFjZ60pxLcmhnThWgvH2wg8376yUJmPhEH4H3kw= +github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/colinmarc/hdfs/v2 v2.1.1/go.mod h1:M3x+k8UKKmxtFu++uAZ0OtDU8jR3jnaZIAc6yK4Ue0c= -github.com/containerd/containerd v1.7.11 h1:lfGKw3eU35sjV0aG2eYZTiwFEY1pCzxdzicHP3SZILw= -github.com/containerd/containerd v1.7.11/go.mod h1:5UluHxHTX2rdvYuZ5OJTC5m/KJNs0Zs9wVoJm9zf5ZE= +github.com/containerd/containerd v1.7.18 h1:jqjZTQNfXGoEaZdW1WwPU0RqSn1Bm2Ay/KJPUuO8nao= +github.com/containerd/containerd v1.7.18/go.mod h1:IYEk9/IO6wAPUz2bCMVUbsfXjzw5UNP5fLz4PsUygQ4= +github.com/containerd/errdefs v0.1.0 h1:m0wCRBiu1WJT/Fr+iOoQHMQS/eP5myQ8lCv4Dz5ZURM= +github.com/containerd/errdefs v0.1.0/go.mod h1:YgWiiHtLmSeBrvpw+UfPijzbLaB77mEG1WwJTDETIV0= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= @@ -763,13 +773,14 @@ github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHf github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= +github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= 
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/distribution/reference v0.5.0 h1:/FUIFXtfc/x2gpa5/VGfiGLuOIdYa1t65IKK2OFGvA0= -github.com/distribution/reference v0.5.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v25.0.5+incompatible h1:UmQydMduGkrD5nQde1mecF/YnSbTOaPeFIeP5C4W+DE= -github.com/docker/docker v25.0.5+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/docker v27.1.1+incompatible h1:hO/M4MtV36kzKldqnA37IWhebRA+LnqqcqDja6kVaKY= +github.com/docker/docker v27.1.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= @@ -802,8 +813,8 @@ github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= -github.com/fsouza/fake-gcs-server v1.47.7 h1:56/U4rKY081TaNbq0gHWi7/71UxC2KROqcnrD9BRJhs= -github.com/fsouza/fake-gcs-server v1.47.7/go.mod h1:4vPUynN8/zZlxk5Jpy6LvvTTxItdTAObK4DYnp89Jys= +github.com/fsouza/fake-gcs-server v1.49.2 h1:fukDqzEQM50QkA0jAbl6cLqeDu3maQjwZBuys759TR4= +github.com/fsouza/fake-gcs-server v1.49.2/go.mod h1:17SYzJEXRcaAA5ATwwvgBkSIqIy7r1icnGM0y/y4foY= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-fonts/dejavu v0.1.0/go.mod h1:4Wt4I4OU2Nq9asgDCteaAaWZOV24E+0/Pwo0gppep4g= github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3T0ecnM9pNujks= @@ -814,11 +825,12 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gorp/gorp v2.2.0+incompatible h1:xAUh4QgEeqPPhK3vxZN+bzrim1z5Av6q837gtjUlshc= +github.com/go-gorp/gorp v2.2.0+incompatible/go.mod h1:7IfkAQnO7jfT/9IQ3R9wL1dFhukN6aQxzKTHnkxzA/E= github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U= github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81/go.mod h1:SX0U8uGpxhq9o2S/CELCSUxEWWAuoCUcVCQWv7G2OCk= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.1 
h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= @@ -881,6 +893,7 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= +github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/flatbuffers v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8ioaQmyPLg1b8VwK5WJg= @@ -909,6 +922,7 @@ github.com/google/martian/v3 v3.1.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIG github.com/google/martian/v3 v3.2.1/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/martian/v3 v3.3.2/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc= +github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= @@ -924,8 +938,8 @@ github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 h1:hR7/MlvK23p6+lIw9SN1TigNLn9ZnF3W4SYRKq2gAHs= -github.com/google/pprof v0.0.0-20230602150820-91b7bce49751/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA= +github.com/google/pprof v0.0.0-20240528025155-186aa0362fba h1:ql1qNgCyOB7iAEk8JTNM+zJrgIbnyCKX/wdlyPufP5g= +github.com/google/pprof v0.0.0-20240528025155-186aa0362fba/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/renameio/v2 v2.0.0 h1:UifI23ZTGY8Tt29JbYFiuyIU3eX+RNFtUwefq9qAhxg= github.com/google/renameio/v2 v2.0.0/go.mod h1:BtmJXm5YlszgC+TD4HOEEUFgkJP3nLxehU6hfe7jRt4= @@ -954,8 +968,8 @@ 
github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqE github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY= github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57QpBWsYpwqHJx8= github.com/googleapis/gax-go/v2 v2.7.1/go.mod h1:4orTrqY6hXxxaUL4LHIPl6lGo8vAE38/qKbhSAKP6QI= -github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA= -github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E= +github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s= +github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= @@ -967,6 +981,7 @@ github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFb github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU0GSB0f8Nhgmxx0VIRUvaC0w= github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= github.com/hashicorp/go-uuid v0.0.0-20180228145832-27454136f036/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -976,20 +991,23 @@ github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1: github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= github.com/jackc/pgx/v5 v5.3.1 h1:Fcr8QJ1ZeLi5zsPZqQeUZhNhxfkkKBOgJuYkJHoBOtU= +github.com/jackc/pgx/v5 v5.3.1/go.mod h1:t3JDKnCBlYIc0ewLF0Q7B8MXmoIaBOZj/ic7iHozM/8= github.com/jcmturner/gofork v0.0.0-20180107083740-2aebee971930/go.mod h1:MK8+TM0La+2rjBD4jE12Kj1pCCxK7d2LK/UM3ncEo0o= github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= +github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod 
h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= -github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 h1:eQGUsj2LcsLzfrHY1noKDSU7h+c9/rw9pQPwbQ9g1jQ= github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6/go.mod h1:LIAXxPvcUXwOcTIj9LSNSUpE9/eMHalTWxsP/kmWxQI= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= @@ -1000,10 +1018,9 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/klauspost/compress v1.9.7/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= -github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= -github.com/klauspost/compress v1.17.7 h1:ehO88t2UGzQK66LMdE8tibEd1ErmzZjNEqWkjLAKQQg= -github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.6 h1:ndNyv040zDGIDh8thGkXYjnFtiN02M1PVVF+JE/48xc= github.com/klauspost/cpuid/v2 v2.2.6/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= @@ -1012,6 +1029,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -1033,11 +1051,14 @@ github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/ github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= github.com/minio/asm2plan9s 
v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= -github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= -github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= +github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q= +github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= -github.com/minio/minio-go/v7 v7.0.66 h1:bnTOXOHjOqv/gcMuiVbN9o2ngRItvqE774dG9nq0Dzw= -github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= +github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= +github.com/minio/minio-go/v7 v7.0.71 h1:No9XfOKTYi6i0GnBj+WZwD8WP5GZfL7n7GOjRqCdAjA= +github.com/minio/minio-go/v7 v7.0.71/go.mod h1:4yBA8v80xGA30cfM3fz0DKYMXunWl/AV/6tWEs9ryzo= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= @@ -1046,18 +1067,16 @@ github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg= github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU= github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0= -github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= -github.com/nats-io/jwt/v2 v2.5.5 h1:ROfXb50elFq5c9+1ztaUbdlrArNFl2+fQWP6B8HGEq4= -github.com/nats-io/jwt/v2 v2.5.5/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A= -github.com/nats-io/nats-server/v2 v2.10.12 h1:G6u+RDrHkw4bkwn7I911O5jqys7jJVRY6MwgndyUsnE= -github.com/nats-io/nats-server/v2 v2.10.12/go.mod h1:H1n6zXtYLFCgXcf/SF8QNTSIFuS8tyZQMN9NguUHdEs= -github.com/nats-io/nats.go v1.33.1 h1:8TxLZZ/seeEfR97qV0/Bl939tpDnt2Z2fK3HkPypj70= -github.com/nats-io/nats.go v1.33.1/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= +github.com/nats-io/jwt/v2 v2.5.8 
h1:uvdSzwWiEGWGXf+0Q+70qv6AQdvcvxrv9hPM0RiPamE= +github.com/nats-io/jwt/v2 v2.5.8/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A= +github.com/nats-io/nats-server/v2 v2.10.18 h1:tRdZmBuWKVAFYtayqlBB2BuCHNGAQPvoQIXOKwU3WSM= +github.com/nats-io/nats-server/v2 v2.10.18/go.mod h1:97Qyg7YydD8blKlR8yBsUlPlWyZKjA7Bp5cl3MUE9K8= +github.com/nats-io/nats.go v1.36.0 h1:suEUPuWzTSse/XhESwqLxXGuj8vGRuPRoG7MoRN/qyU= +github.com/nats-io/nats.go v1.36.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= @@ -1065,8 +1084,8 @@ github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OS github.com/ncw/swift v1.0.52/go.mod h1:23YIA4yWVnGwv2dQlN4bB7egfYX6YLn0Yo/S6zZO/ZM= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= -github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= +github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= +github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= github.com/pborman/getopt v0.0.0-20180729010549-6fdd0a2c7117/go.mod h1:85jBQOZwpVEaDAr341tbn15RS4fCAsIst0qp7i8ex1o= github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= @@ -1090,8 +1109,8 @@ github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:Om github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= -github.com/proullon/ramsql v0.1.3 h1:/LRcXJf4lEmhdb4tYcci473I2VynjcZSzh2hsjJ8rSk= -github.com/proullon/ramsql v0.1.3/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= +github.com/proullon/ramsql v0.1.4 h1:yTFRTn46gFH/kPbzCx+mGjuFlyTBUeDr3h2ldwxddl0= +github.com/proullon/ramsql v0.1.4/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -1099,6 +1118,7 @@ github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTE github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc= +github.com/rs/xid 
v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w= github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= @@ -1106,8 +1126,8 @@ github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46 h1:GHRpF1pTW19a github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46/go.mod h1:uAQ5PCi+MFsC7HjREoAz1BU+Mq60+05gifQSsHSDG/8= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63 h1:J6qvD6rbmOil46orKqJaRPG+zTpoGlBTUdyv8ki63L0= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63/go.mod h1:n+VKSARF5y/tS9XFSP7vWDfS+GUC5vs/YT7M5XDTUEM= -github.com/shirou/gopsutil/v3 v3.23.9 h1:ZI5bWVeu2ep4/DIxB4U9okeYJ7zp/QLTO4auRb/ty/E= -github.com/shirou/gopsutil/v3 v3.23.9/go.mod h1:x/NWSb71eMcjFIO0vhyGW5nZ7oSIgVjrCnADckb85GA= +github.com/shirou/gopsutil/v3 v3.23.12 h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4= +github.com/shirou/gopsutil/v3 v3.23.12/go.mod h1:1FrWgea594Jp7qmjHUUPlJDTPgcsb9mGnXDxavtikzM= github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= @@ -1141,8 +1161,9 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c= -github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/testcontainers/testcontainers-go v0.32.0 h1:ug1aK08L3gCHdhknlTTwWjPHPS+/alvLJU/DRxTD/ME= +github.com/testcontainers/testcontainers-go v0.32.0/go.mod h1:CRHrzHLQhlXUsa5gXjTOfqIEJcrK5+xMDmBr/WMI88E= github.com/tetratelabs/wazero v1.7.3 h1:PBH5KVahrt3S2AHgEjKu4u+LlDbbk+nsGE3KLucy6Rw= github.com/tetratelabs/wazero v1.7.3/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= @@ -1180,8 +1201,8 @@ github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaD go.einride.tech/aip v0.67.1 h1:d/4TW92OxXBngkSOwWS2CH5rez869KpKMaN44mdxkFI= go.einride.tech/aip v0.67.1/go.mod h1:ZGX4/zKw8dcgzdLsrvpOOGxfxI2QSk12SlP7d6c0/XI= go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= -go.mongodb.org/mongo-driver v1.13.1 h1:YIc7HTYsKndGK4RFzJ3covLz1byri52x0IoMB0Pt/vk= -go.mongodb.org/mongo-driver v1.13.1/go.mod h1:wcDf1JBCXy2mOW0bWHwO/IOYqdca1MPCwDtFu/Z9+eo= +go.mongodb.org/mongo-driver v1.16.1 h1:rIVLL3q0IHM39dvE+z2ulZLp9ENZKThVfuvN/IiN4l8= +go.mongodb.org/mongo-driver v1.16.1/go.mod h1:oB6AhJQvFQL4LEHyXi6aJzQJtBiTQHiAd83l0GdFaiw= go.opencensus.io 
v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -1198,17 +1219,22 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1: go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 h1:IeMeyr1aBvBiPVYihXIaeIZba6b8E1bYp7lbdxK8CQg= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod h1:oVdCUtjq9MK9BlS7TtucsQwUcXcymNiEDjgDD2jMtZU= go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= +go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= +go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I= +go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM= golang.org/x/crypto v0.0.0-20180723164146-c126467f60eb/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -1221,9 +1247,8 @@ golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= -golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod 
h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1283,8 +1308,8 @@ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91 golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0= +golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1325,7 +1350,6 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= @@ -1345,8 +1369,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1376,8 +1400,8 @@ golang.org/x/oauth2 v0.4.0/go.mod h1:RznEsdpjGAINPTOF0UH/t+xJ75L18YO3Ho6Pyn+uRec golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= 
+golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -1394,10 +1418,9 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1482,9 +1505,10 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -1493,6 +1517,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= +golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1509,8 +1535,8 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1583,8 +1609,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.3.0/go.mod h1:/rWhSS2+zyEVwoJf8YAX6L2f0ntZ7Kn/mGgAWcipA5k= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA= +golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -1600,6 +1626,7 @@ gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o= +gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= @@ -1661,8 +1688,8 @@ google.golang.org/api v0.108.0/go.mod h1:2Ts0XTHNVWxypznxWOYUeI4g3WdP9Pk2Qk58+a/ google.golang.org/api v0.110.0/go.mod h1:7FC4Vvx1Mooxh8C5HWjzZHcavuS2f6pmJpZx60ca7iI= google.golang.org/api v0.111.0/go.mod h1:qtFHvU9mhgTJegR31csQ+rwxyUTHOKFqCKWp1J0fdw0= google.golang.org/api v0.114.0/go.mod h1:ifYI2ZsFK6/uGddGfAD5BMxlnkBqCmqHSDUVi45N5Yg= -google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo= -google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk= +google.golang.org/api v0.189.0 h1:equMo30LypAkdkLMBqfeIqtyAnlyig1JSZArl4XPwdI= +google.golang.org/api v0.189.0/go.mod h1:FLWGJKb0hb+pU2j+rJqwbnsF+ym+fQs73rbJ+KAUgy8= google.golang.org/appengine v1.1.0/go.mod 
h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1802,12 +1829,12 @@ google.golang.org/genproto v0.0.0-20230323212658-478b75c54725/go.mod h1:UUQDJDOl google.golang.org/genproto v0.0.0-20230330154414-c0448cd141ea/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= -google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls= -google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M= -google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc= -google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f h1:htT2I9bZvGm+110zq8bIErMX+WgBWxCzV3ChwbvnKnc= +google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:Sk3mLpoDFTAp6R4OvlcUgaG4ISTspKeFsIAXMn9Bm4Y= +google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade h1:WxZOF2yayUHpHSbUE6NMzumUzBxYc3YGwo0YHnbzsJY= +google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade/go.mod h1:mw8MG/Qz5wfgYr6VqVCiZcHe/GJEfI+oGGDCohaVgB0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade h1:oCRSWfwGXQsqlVdErcyTt4A93Y8fo0/9D4b1gnI++qo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1849,8 +1876,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= -google.golang.org/grpc v1.64.1 h1:LKtvyfbX3UGVPFcGqJ9ItpVWW6oN/2XqTxfAnwRRXiA= -google.golang.org/grpc v1.64.1/go.mod h1:hiQF4LFZelK2WKaP6W0L92zGHtiQdZxk8CrSdvyjeP0= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= @@ -1877,6 +1904,7 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c 
h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/jcmturner/aescts.v1 v1.0.1/go.mod h1:nsR8qBOg+OucoIW+WMhB3GspUQXq9XorLnQb9XtvcOo= gopkg.in/jcmturner/dnsutils.v1 v1.0.1/go.mod h1:m3v+5svpVOhtFAP/wSz+yzh4Mc0Fg7eRhxkJMWSIz9Q= gopkg.in/jcmturner/goidentity.v3 v3.0.0/go.mod h1:oG2kH0IvSYNIu80dVAyu/yoefjq1mNfM5bm88whjWx4= @@ -1892,8 +1920,11 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorm.io/driver/postgres v1.5.2 h1:ytTDxxEv+MplXOfFe3Lzm7SjG09fcdb3Z/c056DTBx0= +gorm.io/driver/postgres v1.5.2/go.mod h1:fmpX0m2I1PKuR7mKZiEluwrP3hbs+ps7JIGMUBpCgl8= gorm.io/gorm v1.25.2 h1:gs1o6Vsa+oVKG/a9ElL3XgyGfghFfkKA2SInQaCyMho= +gorm.io/gorm v1.25.2/go.mod h1:L4uxeKpfBml98NYqVqwAdmV1a2nBtAec/cf3fpucW/k= gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= +gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/sdks/go/cmd/beamctl/cmd/provision.go b/sdks/go/cmd/beamctl/cmd/provision.go index cab82f7bf9db..878c9a77da82 100644 --- a/sdks/go/cmd/beamctl/cmd/provision.go +++ b/sdks/go/cmd/beamctl/cmd/provision.go @@ -17,7 +17,6 @@ package cmd import ( fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" - "github.com/golang/protobuf/proto" "github.com/spf13/cobra" ) @@ -53,6 +52,6 @@ func infoFn(cmd *cobra.Command, args []string) error { return err } - cmd.Print(proto.MarshalTextString(info.GetInfo())) + cmd.Print(info.GetInfo().String()) return nil } diff --git a/sdks/go/container/boot_test.go b/sdks/go/container/boot_test.go index e799e5d65b0c..49c78047249e 100644 --- a/sdks/go/container/boot_test.go +++ b/sdks/go/container/boot_test.go @@ -25,7 +25,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/artifact" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) func TestEnsureEndpointsSet_AllSet(t *testing.T) { diff --git a/sdks/go/container/tools/provision.go b/sdks/go/container/tools/provision.go index dab3383fc171..6b370a5c2e66 100644 --- a/sdks/go/container/tools/provision.go +++ b/sdks/go/container/tools/provision.go @@ -29,8 +29,8 @@ import ( fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/jsonpb" - google_pb "github.com/golang/protobuf/ptypes/struct" + "google.golang.org/protobuf/encoding/protojson" + google_pb "google.golang.org/protobuf/types/known/structpb" ) // ProvisionInfo returns the runtime provisioning info for the worker. 
@@ -65,7 +65,8 @@ func OptionsToProto(v any) (*google_pb.Struct, error) { // JSONToProto converts JSON-encoded pipeline options to a proto struct. func JSONToProto(data string) (*google_pb.Struct, error) { var out google_pb.Struct - if err := jsonpb.UnmarshalString(string(data), &out); err != nil { + + if err := protojson.Unmarshal([]byte(data), &out); err != nil { return nil, err } return &out, nil @@ -85,5 +86,9 @@ func ProtoToJSON(opt *google_pb.Struct) (string, error) { if opt == nil { return "{}", nil } - return (&jsonpb.Marshaler{}).MarshalToString(opt) + bytes, err := protojson.Marshal(opt) + if err != nil { + return "", err + } + return string(bytes), err } diff --git a/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go b/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go index 15c2d9e2954a..ceb8a319be98 100644 --- a/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go +++ b/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go @@ -22,8 +22,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/gcsx" - "github.com/golang/protobuf/proto" "golang.org/x/net/context" + "google.golang.org/protobuf/proto" ) // RetrievalServer is a artifact retrieval server backed by Google diff --git a/sdks/go/pkg/beam/artifact/gcsproxy/staging.go b/sdks/go/pkg/beam/artifact/gcsproxy/staging.go index a29508439807..9113e780f339 100644 --- a/sdks/go/pkg/beam/artifact/gcsproxy/staging.go +++ b/sdks/go/pkg/beam/artifact/gcsproxy/staging.go @@ -28,8 +28,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/gcsx" - "github.com/golang/protobuf/proto" "golang.org/x/net/context" + "google.golang.org/protobuf/proto" ) // StagingServer is a artifact staging server backed by Google Cloud Storage diff --git a/sdks/go/pkg/beam/artifact/materialize.go b/sdks/go/pkg/beam/artifact/materialize.go index 866e0dd99b9f..624e30efcd2b 100644 --- a/sdks/go/pkg/beam/artifact/materialize.go +++ b/sdks/go/pkg/beam/artifact/materialize.go @@ -38,7 +38,7 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/errorx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // TODO(lostluck): 2018/05/28 Extract these from their enum descriptors in the pipeline_v1 proto diff --git a/sdks/go/pkg/beam/artifact/materialize_test.go b/sdks/go/pkg/beam/artifact/materialize_test.go index 35223c908b77..31890ed045cc 100644 --- a/sdks/go/pkg/beam/artifact/materialize_test.go +++ b/sdks/go/pkg/beam/artifact/materialize_test.go @@ -29,9 +29,9 @@ import ( jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/proto" "google.golang.org/grpc" "google.golang.org/grpc/metadata" + "google.golang.org/protobuf/proto" ) // TestRetrieve tests that we can successfully retrieve fresh files. 
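// Illustrative sketch, not part of the patch above: the jsonpb -> protojson
// migration used in JSONToProto/ProtoToJSON, shown as a standalone round trip
// through structpb.Struct. The sample JSON literal is made up for the example;
// only google.golang.org/protobuf packages that the patch already imports are used.
package main

import (
	"fmt"

	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/types/known/structpb"
)

func main() {
	// Decode JSON-encoded pipeline options into a proto Struct, the same
	// call JSONToProto now makes instead of jsonpb.UnmarshalString.
	var s structpb.Struct
	if err := protojson.Unmarshal([]byte(`{"runner":"direct","workers":2}`), &s); err != nil {
		panic(err)
	}

	// Encode it back to JSON; protojson.Marshal replaces
	// (&jsonpb.Marshaler{}).MarshalToString in ProtoToJSON.
	b, err := protojson.Marshal(&s)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
}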
diff --git a/sdks/go/pkg/beam/coder.go b/sdks/go/pkg/beam/coder.go index 062bb337e8d8..b03b739ed7be 100644 --- a/sdks/go/pkg/beam/coder.go +++ b/sdks/go/pkg/beam/coder.go @@ -30,8 +30,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/jsonx" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" - protov1 "github.com/golang/protobuf/proto" protov2 "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/protoadapt" "google.golang.org/protobuf/reflect/protoreflect" ) @@ -51,7 +51,7 @@ type jsonCoder interface { json.Unmarshaler } -var protoMessageType = reflect.TypeOf((*protov1.Message)(nil)).Elem() +var protoMessageType = reflect.TypeOf((*protoadapt.MessageV1)(nil)).Elem() var protoReflectMessageType = reflect.TypeOf((*protoreflect.ProtoMessage)(nil)).Elem() var jsonCoderType = reflect.TypeOf((*jsonCoder)(nil)).Elem() @@ -276,8 +276,8 @@ func protoEnc(in T) ([]byte, error) { switch it := in.(type) { case protoreflect.ProtoMessage: p = it - case protov1.Message: - p = protov1.MessageV2(it) + case protoadapt.MessageV1: + p = protoadapt.MessageV2Of(it) } b, err := protov2.MarshalOptions{Deterministic: true}.Marshal(p) if err != nil { @@ -293,8 +293,8 @@ func protoDec(t reflect.Type, in []byte) (T, error) { switch it := reflect.New(t.Elem()).Interface().(type) { case protoreflect.ProtoMessage: p = it - case protov1.Message: - p = protov1.MessageV2(it) + case protoadapt.MessageV1: + p = protoadapt.MessageV2Of(it) } err := protov2.UnmarshalOptions{}.Unmarshal(in, p) if err != nil { diff --git a/sdks/go/pkg/beam/core/runtime/exec/hash.go b/sdks/go/pkg/beam/core/runtime/exec/hash.go index 353d203d4e48..7b540bcf1594 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/hash.go +++ b/sdks/go/pkg/beam/core/runtime/exec/hash.go @@ -37,6 +37,13 @@ type elementHasher interface { func makeElementHasher(c *coder.Coder, wc *coder.WindowCoder) elementHasher { hasher := &maphash.Hash{} we := MakeWindowEncoder(wc) + + // Unwrap length prefix coders. + // A length prefix changes the hash itself, but shouldn't affect + // that identical elements have the same hash, so skip them here. 
+ if c.Kind == coder.LP { + c = c.Components[0] + } switch c.Kind { case coder.Bytes: return &bytesHasher{hash: hasher, we: we} diff --git a/sdks/go/pkg/beam/core/runtime/exec/translate.go b/sdks/go/pkg/beam/core/runtime/exec/translate.go index 72af9e80c405..b74ede228fd9 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/translate.go +++ b/sdks/go/pkg/beam/core/runtime/exec/translate.go @@ -33,7 +33,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // TODO(lostluck): 2018/05/28 Extract these from the canonical enums in beam_runner_api.proto diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder.go b/sdks/go/pkg/beam/core/runtime/graphx/coder.go index 87b3771e5756..99ca5517d3d3 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/coder.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/coder.go @@ -27,7 +27,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/protox" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) const ( @@ -615,8 +615,8 @@ func (b *CoderMarshaller) internRowCoder(schema *pipepb.Schema) string { } func (b *CoderMarshaller) internCoder(coder *pipepb.Coder) string { - key := proto.MarshalTextString(coder) - if id, exists := b.coder2id[key]; exists { + key := coder.String() + if id, exists := b.coder2id[(key)]; exists { return id } @@ -626,7 +626,7 @@ func (b *CoderMarshaller) internCoder(coder *pipepb.Coder) string { } else { id = fmt.Sprintf("c%v@%v", len(b.coder2id), b.Namespace) } - b.coder2id[key] = id + b.coder2id[string(key)] = id b.coders[id] = coder return id } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go index fdd9355e1cb8..0d44e68285b5 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go @@ -37,8 +37,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/uuid" + "google.golang.org/protobuf/proto" ) // Initialize registered schemas. For use by the beam package at beam.Init time. 
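[Editorial aside, not part of the patch] The coder.go hunk above replaces the legacy protov1.Message interface with protoadapt.MessageV1 and bridges old-style messages into the v2 API with protoadapt.MessageV2Of before marshaling. A hedged sketch of that bridging pattern; the message type is again only a placeholder:

package main

import (
	"fmt"

	protov2 "google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/protoadapt"
	"google.golang.org/protobuf/reflect/protoreflect"
	"google.golang.org/protobuf/types/known/structpb"
)

// marshalAny accepts either a v2 message or a legacy v1 message and
// marshals it deterministically, mirroring the protoEnc change above.
func marshalAny(in any) ([]byte, error) {
	var p protov2.Message
	switch it := in.(type) {
	case protoreflect.ProtoMessage:
		p = it
	case protoadapt.MessageV1:
		// Wrap legacy (github.com/golang/protobuf) messages for the v2 API.
		p = protoadapt.MessageV2Of(it)
	default:
		return nil, fmt.Errorf("not a proto message: %T", in)
	}
	return protov2.MarshalOptions{Deterministic: true}.Marshal(p)
}

func main() {
	msg, _ := structpb.NewStruct(map[string]any{"k": "v"})
	b, err := marshalAny(msg)
	fmt.Println(len(b), err)
}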
diff --git a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go index 37b3e79f8f50..367d70e81d17 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go @@ -24,7 +24,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/testing/protocmp" @@ -806,7 +805,7 @@ func TestSchemaConversion(t *testing.T) { } if d := cmp.Diff(test.st, got, protocmp.Transform(), - protocmp.IgnoreFields(proto.MessageV2(&pipepb.Schema{}), "id"), + protocmp.IgnoreFields(&pipepb.Schema{}, "id"), ); d != "" { t.Errorf("diff (-want, +got): %v", d) } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate.go b/sdks/go/pkg/beam/core/runtime/graphx/translate.go index b05292546133..65280ef6b930 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate.go @@ -34,7 +34,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/options/resource" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/durationpb" ) @@ -1209,13 +1209,13 @@ func (m *marshaller) addWindowingStrategy(w *window.WindowingStrategy) (string, } func (m *marshaller) internWindowingStrategy(w *pipepb.WindowingStrategy) string { - key := proto.MarshalTextString(w) - if id, exists := m.windowing2id[key]; exists { + key := w.String() + if id, exists := m.windowing2id[(key)]; exists { return id } id := fmt.Sprintf("w%v", len(m.windowing2id)) - m.windowing2id[key] = id + m.windowing2id[string(key)] = id m.windowing[id] = w return id } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go b/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go index a331aedd585d..e18a5f97796b 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go @@ -34,8 +34,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/protox" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/proto" ) func init() { @@ -181,13 +181,13 @@ func TestMarshal(t *testing.T) { } if got, want := len(p.GetComponents().GetTransforms()), test.transforms; got != want { - t.Errorf("got %d transforms, want %d : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("got %d transforms, want %d : %v", got, want, p.String()) } if got, want := len(p.GetRootTransformIds()), test.roots; got != want { - t.Errorf("got %d roots, want %d : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("got %d roots, want %d : %v", got, want, p.String()) } if got, want := p.GetRequirements(), test.requirements; !cmp.Equal(got, want, cmpopts.SortSlices(func(a, b string) bool { return a < b })) { - t.Errorf("incorrect requirements: got %v, want %v : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("incorrect requirements: got %v, 
want %v : %v", got, want, p.String()) } }) } @@ -248,7 +248,7 @@ func TestMarshal_PTransformAnnotations(t *testing.T) { pts := p.GetComponents().GetTransforms() if got, want := len(pts), test.transforms; got != want { - t.Errorf("got %d transforms, want %d : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("got %d transforms, want %d : %v", got, want, p.String()) } for _, pt := range pts { // Context annotations only apply to composites, and are not duplicated to leaves. diff --git a/sdks/go/pkg/beam/core/runtime/harness/harness_test.go b/sdks/go/pkg/beam/core/runtime/harness/harness_test.go index 91dd3c591d5b..8c25db613eba 100644 --- a/sdks/go/pkg/beam/core/runtime/harness/harness_test.go +++ b/sdks/go/pkg/beam/core/runtime/harness/harness_test.go @@ -23,7 +23,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // validDescriptor describes a valid pipeline with a source and a sink, but doesn't do anything else. diff --git a/sdks/go/pkg/beam/core/runtime/harness/statemgr.go b/sdks/go/pkg/beam/core/runtime/harness/statemgr.go index 76d4e1f32c23..061cfca011f5 100644 --- a/sdks/go/pkg/beam/core/runtime/harness/statemgr.go +++ b/sdks/go/pkg/beam/core/runtime/harness/statemgr.go @@ -28,7 +28,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" "github.com/apache/beam/sdks/v2/go/pkg/beam/log" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" - "github.com/golang/protobuf/proto" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) @@ -633,7 +632,7 @@ func (c *StateChannel) read(ctx context.Context) { if !ok { // This can happen if Send returns an error that write handles, but // the message was actually sent. 
- log.Errorf(ctx, "StateChannel[%v].read: no consumer for state response: %v", c.id, proto.MarshalTextString(msg)) + log.Errorf(ctx, "StateChannel[%v].read: no consumer for state response: %v", c.id, msg.String()) continue } @@ -641,7 +640,7 @@ func (c *StateChannel) read(ctx context.Context) { case ch <- msg: // ok default: - panic(fmt.Sprintf("StateChannel[%v].read: failed to consume state response: %v", c.id, proto.MarshalTextString(msg))) + panic(fmt.Sprintf("StateChannel[%v].read: failed to consume state response: %v", c.id, msg.String())) } } } diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go b/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go index 695830a483c0..b58a30983797 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go @@ -19,8 +19,8 @@ import ( "testing" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/proto" ) func TestShallowClonePTransform(t *testing.T) { diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go b/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go index cfcce88675be..9e527f2fd322 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go @@ -28,7 +28,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // Update merges a pipeline with the given components, which may add, replace diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go b/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go index 79bfd43958af..3024787e6163 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go @@ -20,8 +20,8 @@ import ( "testing" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/proto" "google.golang.org/protobuf/testing/protocmp" ) diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/util.go b/sdks/go/pkg/beam/core/runtime/pipelinex/util.go index 5fe9def9b227..4735e7b77d20 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/util.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/util.go @@ -19,7 +19,7 @@ import ( "sort" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // Bounded returns true iff all PCollections are bounded. 
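[Editorial aside, not part of the patch] The test hunks above (schema_test.go, replace_test.go) standardize proto comparisons on cmp.Diff with options from google.golang.org/protobuf/testing/protocmp, dropping the proto.MessageV2 shim. An illustrative sketch of that comparison style; the ignored field here is structpb's "fields" purely so the example is self-contained (the actual hunk ignores "id" on pipepb.Schema):

package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
	"google.golang.org/protobuf/testing/protocmp"
	"google.golang.org/protobuf/types/known/structpb"
)

func main() {
	want, _ := structpb.NewStruct(map[string]any{"a": 1.0})
	got, _ := structpb.NewStruct(map[string]any{"a": 2.0})

	// protocmp.Transform makes cmp understand protobuf semantics;
	// IgnoreFields drops fields that are not relevant to the assertion.
	d := cmp.Diff(want, got,
		protocmp.Transform(),
		protocmp.IgnoreFields(&structpb.Struct{}, "fields"),
	)
	fmt.Println(d == "") // true: the only differing field is ignored.
}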
diff --git a/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go b/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go index 1f18b333541b..eec13c451a13 100644 --- a/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go +++ b/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go @@ -20,7 +20,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) func createExternalEdge(typeUrn string, typePayload []byte) *graph.MultiEdge { diff --git a/sdks/go/pkg/beam/core/util/protox/any.go b/sdks/go/pkg/beam/core/util/protox/any.go index e539a8c19dec..46bd08b1aff1 100644 --- a/sdks/go/pkg/beam/core/util/protox/any.go +++ b/sdks/go/pkg/beam/core/util/protox/any.go @@ -17,9 +17,9 @@ package protox import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" - "github.com/golang/protobuf/proto" - protobuf "github.com/golang/protobuf/ptypes/any" - protobufw "github.com/golang/protobuf/ptypes/wrappers" + "google.golang.org/protobuf/proto" + protobuf "google.golang.org/protobuf/types/known/anypb" + protobufw "google.golang.org/protobuf/types/known/wrapperspb" ) const ( diff --git a/sdks/go/pkg/beam/core/util/protox/any_test.go b/sdks/go/pkg/beam/core/util/protox/any_test.go index 1975bec405cb..9eb7621db351 100644 --- a/sdks/go/pkg/beam/core/util/protox/any_test.go +++ b/sdks/go/pkg/beam/core/util/protox/any_test.go @@ -19,8 +19,8 @@ import ( "bytes" "testing" - "github.com/golang/protobuf/proto" - protobufw "github.com/golang/protobuf/ptypes/wrappers" + "google.golang.org/protobuf/proto" + protobufw "google.golang.org/protobuf/types/known/wrapperspb" ) func TestProtoPackingInvertibility(t *testing.T) { diff --git a/sdks/go/pkg/beam/core/util/protox/base64.go b/sdks/go/pkg/beam/core/util/protox/base64.go index 7f0f5a4bdeea..79ea8a025f7c 100644 --- a/sdks/go/pkg/beam/core/util/protox/base64.go +++ b/sdks/go/pkg/beam/core/util/protox/base64.go @@ -19,7 +19,7 @@ import ( "encoding/base64" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // MustEncodeBase64 encodes a proto wrapped in base64 and panics on failure. diff --git a/sdks/go/pkg/beam/core/util/protox/protox.go b/sdks/go/pkg/beam/core/util/protox/protox.go index 3555886eefc9..892a2ba97d03 100644 --- a/sdks/go/pkg/beam/core/util/protox/protox.go +++ b/sdks/go/pkg/beam/core/util/protox/protox.go @@ -16,7 +16,7 @@ // Package protox contains utilities for working with protobufs. package protox -import "github.com/golang/protobuf/proto" +import "google.golang.org/protobuf/proto" // MustEncode encode the message and panics on failure. 
func MustEncode(msg proto.Message) []byte { diff --git a/sdks/go/pkg/beam/create_test.go b/sdks/go/pkg/beam/create_test.go index 785c3b33db62..e65fefc7f2d8 100644 --- a/sdks/go/pkg/beam/create_test.go +++ b/sdks/go/pkg/beam/create_test.go @@ -23,7 +23,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam" "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert" "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/protoadapt" ) func TestMain(m *testing.M) { @@ -157,7 +157,5 @@ func (t *testProto) Unmarshal(b []byte) error { // Ensure testProto is detected as a proto.Message and can be (un)marshalled by // the proto library. var ( - _ proto.Message = &testProto{} - _ proto.Marshaler = &testProto{} - _ proto.Unmarshaler = &testProto{} + _ protoadapt.MessageV1 = &testProto{} ) diff --git a/sdks/go/pkg/beam/io/filesystem/s3/s3.go b/sdks/go/pkg/beam/io/filesystem/s3/s3.go index 97a2c9aada14..40fde0a300b2 100644 --- a/sdks/go/pkg/beam/io/filesystem/s3/s3.go +++ b/sdks/go/pkg/beam/io/filesystem/s3/s3.go @@ -149,7 +149,7 @@ func (f *fs) OpenWrite(ctx context.Context, filename string) (io.WriteCloser, er func (f *fs) Size(ctx context.Context, filename string) (int64, error) { bucket, key, err := parseURI(filename) if err != nil { - return -1, fmt.Errorf("error parsing S3 uri %s: %v", filename, err) + return -1, fmt.Errorf("error parsing S3 uri %s: %w", filename, err) } params := &s3.HeadObjectInput{ @@ -158,10 +158,14 @@ func (f *fs) Size(ctx context.Context, filename string) (int64, error) { } output, err := f.client.HeadObject(ctx, params) if err != nil { - return -1, fmt.Errorf("error getting metadata for object %s: %v", filename, err) + return -1, fmt.Errorf("error getting metadata for object %s: %w", filename, err) } - return output.ContentLength, err + if output.ContentLength != nil { + return *output.ContentLength, nil + } + + return -1, fmt.Errorf("content length for object %s was nil", filename) } // LastModified returns the time at which the file was last modified. diff --git a/sdks/go/pkg/beam/provision/provision.go b/sdks/go/pkg/beam/provision/provision.go index 3c36973535e7..58a8f5ee8292 100644 --- a/sdks/go/pkg/beam/provision/provision.go +++ b/sdks/go/pkg/beam/provision/provision.go @@ -24,7 +24,7 @@ import ( "github.com/apache/beam/sdks/v2/go/container/tools" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" - google_pb "github.com/golang/protobuf/ptypes/struct" + google_pb "google.golang.org/protobuf/types/known/structpb" ) // Info returns the runtime provisioning info for the worker. diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflow.go b/sdks/go/pkg/beam/runners/dataflow/dataflow.go index ca701979497a..73667fb8ee6e 100644 --- a/sdks/go/pkg/beam/runners/dataflow/dataflow.go +++ b/sdks/go/pkg/beam/runners/dataflow/dataflow.go @@ -47,7 +47,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dataflow/dataflowlib" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/gcsx" "github.com/apache/beam/sdks/v2/go/pkg/beam/x/hooks/perf" - "github.com/golang/protobuf/proto" ) // TODO(herohde) 5/16/2017: the Dataflow flags should match the other SDKs. 
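[Editorial aside, not part of the patch] Two things change in the s3 filesystem hunk above: errors are wrapped with %w so callers can inspect them with errors.Is/errors.As, and HeadObject's ContentLength is treated as a *int64 (newer aws-sdk-go-v2 releases model optional numbers as pointers), so it must be nil-checked before use. A minimal sketch of that pattern with a stand-in struct rather than the real SDK types:

package main

import (
	"errors"
	"fmt"
)

// headOutput stands in for s3.HeadObjectOutput, where optional numeric
// fields are pointers in recent aws-sdk-go-v2 releases.
type headOutput struct {
	ContentLength *int64
}

var errMissingLength = errors.New("content length was nil")

func objectSize(filename string, out *headOutput) (int64, error) {
	if out.ContentLength != nil {
		return *out.ContentLength, nil
	}
	// %w keeps the sentinel recoverable via errors.Is at the call site.
	return -1, fmt.Errorf("object %s: %w", filename, errMissingLength)
}

func main() {
	n := int64(42)
	size, _ := objectSize("s3://bucket/key", &headOutput{ContentLength: &n})
	fmt.Println(size) // 42

	_, err := objectSize("s3://bucket/missing", &headOutput{})
	fmt.Println(errors.Is(err, errMissingLength)) // true
}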
@@ -235,7 +234,7 @@ func Execute(ctx context.Context, p *beam.Pipeline) (beam.PipelineResult, error) if *dryRun { log.Info(ctx, "Dry-run: not submitting job!") - log.Info(ctx, proto.MarshalTextString(model)) + log.Info(ctx, model.String()) job, err := dataflowlib.Translate(ctx, model, opts, workerURL, modelURL) if err != nil { return nil, err diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go b/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go index 9a1641e314d1..806b8940ae99 100644 --- a/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go +++ b/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go @@ -30,7 +30,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/log" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal/runnerlib" - "github.com/golang/protobuf/proto" df "google.golang.org/api/dataflow/v1b3" "google.golang.org/api/googleapi" ) @@ -82,7 +81,7 @@ func Execute(ctx context.Context, raw *pipepb.Pipeline, opts *JobOptions, worker } // (2) Upload model to GCS - log.Info(ctx, proto.MarshalTextString(raw)) + log.Info(ctx, raw.String()) if err := StageModel(ctx, opts.Project, modelURL, protox.MustEncode(raw)); err != nil { return presult, err diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go index 2c4e08bcd094..bc8449c72b39 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go @@ -45,6 +45,14 @@ type element struct { holdTimestamp mtime.Time // only used for Timers pane typex.PaneInfo transform, family, tag string // only used for Timers. + // Used to ensure ordering within a key when sorting the heap, + // which isn't using a stable sort. + // Since ordering is weak across multiple bundles, it needs only + // be consistent between exiting a stage and entering a stateful stage. + // No synchronization is required in specifying this, + // since keyed elements are only processed by a single bundle at a time, + // if stateful stages are concerned. + sequence int elmBytes []byte // When nil, indicates this is a timer. keyBytes []byte @@ -103,7 +111,8 @@ func (h elementHeap) Less(i, j int) bool { } else if h[i].IsData() && h[j].IsTimer() { return true // i before j. } - // They're the same kind, fall through to timestamp less for consistency. + // They're the same kind, so compare by the sequence value. + return h[i].sequence < h[j].sequence } // Otherwise compare by timestamp. return h[i].timestamp < h[j].timestamp @@ -211,6 +220,16 @@ func (em *ElementManager) AddStage(ID string, inputIDs, outputIDs []string, side for _, input := range inputIDs { em.consumers[input] = append(em.consumers[input], ss.ID) } + + // In very rare cases, we can have a stage without any inputs, such as a flatten. + // In that case, there's nothing that will start the watermark refresh cycle, + // so we must do it here. + if len(inputIDs) == 0 { + refreshes := singleSet(ss.ID) + em.addToTestStreamImpulseSet(refreshes) + em.addRefreshes(refreshes) + } + for _, side := range ss.sides { // Note that we use the StageID as the global ID in the value since we need // to be able to look up the consuming stage, from the global PCollectionID. 
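[Editorial aside, not part of the patch] The elementmanager hunks above add a per-element sequence number so the heap keeps a deterministic order for same-timestamp elements of the same kind, since container/heap is not a stable sort. A stripped-down sketch of that comparison; element here is a simplified stand-in for prism's internal type:

package main

import (
	"container/heap"
	"fmt"
)

// element is a reduced stand-in: data vs. timer, an event timestamp,
// and the sequence number the hunks above introduce for tie-breaking.
type element struct {
	isTimer   bool
	timestamp int64
	sequence  int
}

type elementHeap []element

func (h elementHeap) Len() int      { return len(h) }
func (h elementHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h elementHeap) Less(i, j int) bool {
	if h[i].timestamp == h[j].timestamp {
		// Data sorts before timers at the same timestamp, as in the hunk above.
		if !h[i].isTimer && h[j].isTimer {
			return true
		}
		if h[i].isTimer && !h[j].isTimer {
			return false
		}
		// Same kind: fall back to insertion sequence for a consistent order.
		return h[i].sequence < h[j].sequence
	}
	return h[i].timestamp < h[j].timestamp
}
func (h *elementHeap) Push(x any) { *h = append(*h, x.(element)) }
func (h *elementHeap) Pop() any {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

func main() {
	h := &elementHeap{{false, 10, 1}, {false, 10, 0}, {true, 10, 2}}
	heap.Init(h)
	for h.Len() > 0 {
		fmt.Printf("%+v\n", heap.Pop(h))
	}
}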
@@ -265,14 +284,19 @@ func (em *ElementManager) Impulse(stageID string) { } refreshes := stage.updateWatermarks(em) - // Since impulses are synthetic, we need to simulate them properly - // if a pipeline is only test stream driven. + em.addToTestStreamImpulseSet(refreshes) + em.addRefreshes(refreshes) +} + +// addToTestStreamImpulseSet adds to the set of stages to refresh on pipeline start. +// We keep this separate since impulses are synthetic. In a test stream driven pipeline +// these will need to be stimulated separately, to ensure the test stream has progressed. +func (em *ElementManager) addToTestStreamImpulseSet(refreshes set[string]) { if em.impulses == nil { em.impulses = refreshes } else { em.impulses.merge(refreshes) } - em.addRefreshes(refreshes) } type RunBundle struct { @@ -470,7 +494,19 @@ func (em *ElementManager) checkForQuiescence(advanced set[string]) error { outW := ss.OutputWatermark() upPCol, upW := ss.UpstreamWatermark() upS := em.pcolParents[upPCol] + if upS == "" { + upS = "IMPULSE " // (extra spaces to allow print to align better.) + } stageState = append(stageState, fmt.Sprintln(id, "watermark in", inW, "out", outW, "upstream", upW, "from", upS, "pending", ss.pending, "byKey", ss.pendingByKeys, "inprogressKeys", ss.inprogressKeys, "byBundle", ss.inprogressKeysByBundle, "holds", ss.watermarkHolds.heap, "holdCounts", ss.watermarkHolds.counts, "holdsInBundle", ss.inprogressHoldsByBundle, "pttEvents", ss.processingTimeTimers.toFire)) + + var outputConsumers, sideConsumers []string + for _, col := range ss.outputIDs { + outputConsumers = append(outputConsumers, em.consumers[col]...) + for _, l := range em.sideConsumers[col] { + sideConsumers = append(sideConsumers, l.Global) + } + } + stageState = append(stageState, fmt.Sprintf("\tsideInputs: %v outputCols: %v outputConsumers: %v sideConsumers: %v\n", ss.sides, ss.outputIDs, outputConsumers, sideConsumers)) } return errors.Errorf("nothing in progress and no refreshes with non zero pending elements: %v\n%v", v, strings.Join(stageState, "")) } @@ -661,6 +697,7 @@ func reElementResiduals(residuals []Residual, inputInfo PColInfo, rb RunBundle) pane: pn, elmBytes: elmBytes, keyBytes: keyBytes, + sequence: len(unprocessedElements), }) } } @@ -677,6 +714,7 @@ func reElementResiduals(residuals []Residual, inputInfo PColInfo, rb RunBundle) // PersistBundle takes in the stage ID, ID of the bundle associated with the pending // input elements, and the committed output elements. 
func (em *ElementManager) PersistBundle(rb RunBundle, col2Coders map[string]PColInfo, d TentativeData, inputInfo PColInfo, residuals Residuals) { + var seq int for output, data := range d.Raw { info := col2Coders[output] var newPending []element @@ -716,18 +754,20 @@ func (em *ElementManager) PersistBundle(rb RunBundle, col2Coders map[string]PCol pane: pn, elmBytes: elmBytes, keyBytes: keyBytes, + sequence: seq, }) + seq++ } } } consumers := em.consumers[output] - slog.Debug("PersistBundle: bundle has downstream consumers.", "bundle", rb, slog.Int("newPending", len(newPending)), "consumers", consumers) + sideConsumers := em.sideConsumers[output] + slog.Debug("PersistBundle: bundle has downstream consumers.", "bundle", rb, slog.Int("newPending", len(newPending)), "consumers", consumers, "sideConsumers", sideConsumers) for _, sID := range consumers { consumer := em.stages[sID] count := consumer.AddPending(newPending) em.addPending(count) } - sideConsumers := em.sideConsumers[output] for _, link := range sideConsumers { consumer := em.stages[link.Global] consumer.AddPendingSide(newPending, link.Transform, link.Local) diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go b/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go index 3f52ebc4510c..787d27858a0e 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go @@ -74,6 +74,7 @@ func decodeTimer(keyDec func(io.Reader) []byte, usesGlobalWindow bool, raw []byt timestamp: firing, holdTimestamp: hold, pane: pane, + sequence: len(ret), }) } return keyBytes, tag, ret diff --git a/sdks/go/pkg/beam/runners/prism/internal/environments.go b/sdks/go/pkg/beam/runners/prism/internal/environments.go index 3a429920fb28..add7f769a702 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/environments.go +++ b/sdks/go/pkg/beam/runners/prism/internal/environments.go @@ -32,8 +32,8 @@ import ( "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/proto" - dtyp "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/image" "github.com/docker/docker/api/types/mount" dcli "github.com/docker/docker/client" "github.com/docker/docker/pkg/stdcopy" @@ -132,7 +132,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock } if _, _, err := cli.ImageInspectWithRaw(ctx, dp.GetContainerImage()); err != nil { // We don't have a local image, so we should pull it. - if rc, err := cli.ImagePull(ctx, dp.GetContainerImage(), dtyp.ImagePullOptions{}); err == nil { + if rc, err := cli.ImagePull(ctx, dp.GetContainerImage(), image.PullOptions{}); err == nil { // Copy the output, but discard it so we can wait until the image pull is finished. 
io.Copy(io.Discard, rc) rc.Close() @@ -164,7 +164,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock containerID := ccr.ID logger = logger.With("container", containerID) - if err := cli.ContainerStart(ctx, containerID, dtyp.ContainerStartOptions{}); err != nil { cli.Close() return fmt.Errorf("unable to start container image %v with docker for env %v, err: %w", dp.GetContainerImage(), wk.Env, err) } + if err := cli.ContainerStart(ctx, containerID, container.StartOptions{}); err != nil { cli.Close() return fmt.Errorf("unable to start container image %v with docker for env %v, err: %w", dp.GetContainerImage(), wk.Env, err) } @@ -189,7 +189,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock case resp := <-statusCh: logger.Info("docker container has self terminated", "status_code", resp.StatusCode) - rc, err := cli.ContainerLogs(ctx, containerID, dtyp.ContainerLogsOptions{Details: true, ShowStdout: true, ShowStderr: true}) + rc, err := cli.ContainerLogs(ctx, containerID, container.LogsOptions{Details: true, ShowStdout: true, ShowStderr: true}) if err != nil { logger.Error("docker container logs error", "error", err) } diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go index c60a8bf2a3f5..2d3425af33c6 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go @@ -178,7 +178,7 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb ckvERSID: coder(urns.CoderKV, ckvERID, cSID), } - // PCollections only have two new ones. + // There are only two new PCollections. // INPUT -> same as ordinary DoFn // PWR, uses ckvER // SPLITnSIZED, uses ckvERS @@ -201,7 +201,7 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb nSPLITnSIZEDID: pcol(nSPLITnSIZEDID, ckvERSID), } - // PTransforms have 3 new ones, with process sized elements and restrictions + // There are 3 new PTransforms, with process sized elements and restrictions // taking the brunt of the complexity, consuming the inputs ePWRID := "e" + tid + "_pwr" @@ -209,15 +209,19 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb eProcessID := "e" + tid + "_processandsplit" tform := func(name, urn, in, out string) *pipepb.PTransform { + // Apparently we also send side inputs to PairWithRestriction + // and SplitAndSize. We should consider whether we could simply + // drop the side inputs from the ParDo payload instead, which + // could lead to an additional fusion opportunity. 
+ newInputs := maps.Clone(t.GetInputs()) + newInputs[inputLocalID] = in return &pipepb.PTransform{ UniqueName: name, Spec: &pipepb.FunctionSpec{ Urn: urn, Payload: pardoPayload, }, - Inputs: map[string]string{ - inputLocalID: in, - }, + Inputs: newInputs, Outputs: map[string]string{ "i0": out, }, diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go index 910fa0a0ca87..eecebde3d693 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go @@ -244,7 +244,7 @@ func (h *runner) ExecuteTransform(stageID, tid string, t *pipepb.PTransform, com kc := coders[kcID] ec := coders[ecID] - data = append(data, gbkBytes(ws, wc, kc, ec, inputData, coders, watermark)) + data = append(data, gbkBytes(ws, wc, kc, ec, inputData, coders)) if len(data[0]) == 0 { panic("no data for GBK") } @@ -290,21 +290,33 @@ func windowingStrategy(comps *pipepb.Components, tid string) *pipepb.WindowingSt } // gbkBytes re-encodes gbk inputs in a gbk result. -func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregate [][]byte, coders map[string]*pipepb.Coder, watermark mtime.Time) []byte { - var outputTime func(typex.Window, mtime.Time) mtime.Time +func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregate [][]byte, coders map[string]*pipepb.Coder) []byte { + // Pick how the timestamp of the aggregated output is computed. + var outputTime func(typex.Window, mtime.Time, mtime.Time) mtime.Time switch ws.GetOutputTime() { case pipepb.OutputTime_END_OF_WINDOW: - outputTime = func(w typex.Window, et mtime.Time) mtime.Time { + outputTime = func(w typex.Window, _, _ mtime.Time) mtime.Time { return w.MaxTimestamp() } - case pipepb.OutputTime_EARLIEST_IN_PANE, pipepb.OutputTime_LATEST_IN_PANE: - outputTime = func(w typex.Window, et mtime.Time) mtime.Time { - return et + case pipepb.OutputTime_EARLIEST_IN_PANE: + outputTime = func(_ typex.Window, cur, et mtime.Time) mtime.Time { + if et < cur { + return et + } + return cur + } + case pipepb.OutputTime_LATEST_IN_PANE: + outputTime = func(_ typex.Window, cur, et mtime.Time) mtime.Time { + if et > cur { + return et + } + return cur } default: // TODO need to correct session logic if output time is different. panic(fmt.Sprintf("unsupported OutputTime behavior: %v", ws.GetOutputTime())) } + wDec, wEnc := makeWindowCoders(wc) type keyTime struct { @@ -321,9 +333,8 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat kd := pullDecoder(kc, coders) vd := pullDecoder(vc, coders) - // Right, need to get the key coder, and the element coder. - // Cus I'll need to pull out anything the runner knows how to deal with. - // And repeat. + // Aggregate by windows and keys, using the window coder and KV coders. + // We need to extract and split the key bytes from the element bytes. for _, data := range toAggregate { // Parse out each element's data, and repeat. buf := bytes.NewBuffer(data) @@ -340,14 +351,18 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat key := string(keyByt) value := vd(buf) for _, w := range ws { - ft := outputTime(w, tm) wk, ok := windows[w] if !ok { wk = make(map[string]keyTime) windows[w] = wk } - kt := wk[key] - kt.time = ft + kt, ok := wk[key] + if !ok { + // If the window+key map doesn't have a value, inititialize time with the element time. 
+ // This allows earliest or latest to work properly in the outputTime function's first use. + kt.time = tm + } + kt.time = outputTime(w, kt.time, tm) kt.key = keyByt kt.w = w kt.values = append(kt.values, value) @@ -372,34 +387,41 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat } // Use a decreasing sort (latest to earliest) so we can correct // the output timestamp to the new end of window immeadiately. - // TODO need to correct this if output time is different. sort.Slice(ordered, func(i, j int) bool { return ordered[i].MaxTimestamp() > ordered[j].MaxTimestamp() }) cur := ordered[0] sessionData := windows[cur] + delete(windows, cur) for _, iw := range ordered[1:] { - // If they overlap, then we merge the data. + // Check if the gap between windows is less than the gapSize. + // If not, this window is done, and we start a next window. if iw.End+gapSize < cur.Start { - // Start a new session. + // Store current data with the current window. windows[cur] = sessionData + // Use the incoming window instead, and clear it from the map. cur = iw sessionData = windows[iw] + delete(windows, cur) + // There's nothing to merge, since we've just started with this windowed data. continue } - // Extend the session + // Extend the session with the incoming window, and merge the the incoming window's data. cur.Start = iw.Start toMerge := windows[iw] delete(windows, iw) for k, kt := range toMerge { skt := sessionData[k] + // Ensure the output time matches the given function. + skt.time = outputTime(cur, kt.time, skt.time) skt.key = kt.key skt.w = cur skt.values = append(skt.values, kt.values...) sessionData[k] = skt } } + windows[cur] = sessionData } // Everything's aggregated! // Time to turn things into a windowed KV> diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go index 3efe48e23119..7676d958031c 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go @@ -180,12 +180,21 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (*jo default: // Composites can often have some unknown urn, permit those. - // Eg. The Python SDK has urns "beam:transform:generic_composite:v1", "beam:transform:pickled_python:v1", as well as the deprecated "beam:transform:read:v1", - // but they are composites. Since we don't do anything special with the high level, we simply use their internal subgraph. + // Eg. The Python SDK has urns "beam:transform:generic_composite:v1", "beam:transform:pickled_python:v1", + // as well as the deprecated "beam:transform:read:v1", but they are composites. + // We don't do anything special with these high level composites, but + // we may be dealing with their internal subgraph already, so we ignore this transform. if len(t.GetSubtransforms()) > 0 { continue } - // But if not, fail. + // This may be an "empty" composite without subtransforms or a payload. + // These just do PCollection manipulation which is already represented in the Pipeline graph. + // Simply ignore the composite at this stage, since the runner does nothing with them. + if len(t.GetSpec().GetPayload()) == 0 { + continue + } + // Otherwise fail. 
+ slog.Warn("unknown transform, with payload", "urn", urn, "name", t.GetUniqueName(), "payload", t.GetSpec().GetPayload()) check("PTransform.Spec.Urn", urn+" "+t.GetUniqueName(), "") } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go index 95f6af18ac74..ed7f168e36ee 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go +++ b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go @@ -18,6 +18,7 @@ package internal import ( "fmt" "sort" + "strings" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/pipelinex" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" @@ -438,7 +439,8 @@ func finalizeStage(stg *stage, comps *pipepb.Components, pipelineFacts *fusionFa t := comps.GetTransforms()[link.Transform] var sis map[string]*pipepb.SideInput - if t.GetSpec().GetUrn() == urns.TransformParDo { + switch t.GetSpec().GetUrn() { + case urns.TransformParDo, urns.TransformProcessSizedElements, urns.TransformPairWithRestriction, urns.TransformSplitAndSize, urns.TransformTruncate: pardo := &pipepb.ParDoPayload{} if err := (proto.UnmarshalOptions{}).Unmarshal(t.GetSpec().GetPayload(), pardo); err != nil { return fmt.Errorf("unable to decode ParDoPayload for %v", link.Transform) @@ -485,7 +487,17 @@ func finalizeStage(stg *stage, comps *pipepb.Components, pipelineFacts *fusionFa // Quick check that this is lead by a flatten node, and that it's handled runner side. t := comps.GetTransforms()[stg.transforms[0]] if !(t.GetSpec().GetUrn() == urns.TransformFlatten && t.GetEnvironmentId() == "") { - return fmt.Errorf("expected runner flatten node, but wasn't: %v -- %v", stg.transforms, mainInputs) + formatMap := func(in map[string]string) string { + var b strings.Builder + for k, v := range in { + b.WriteString(k) + b.WriteString(" : ") + b.WriteString(v) + b.WriteString("\n\t") + } + return b.String() + } + return fmt.Errorf("stage requires multiple parallel inputs but wasn't a flatten:\n\ttransforms\n\t%v\n\tmain inputs\n\t%v\n\tsidinputs\n\t%v", strings.Join(stg.transforms, "\n\t\t"), formatMap(mainInputs), sideInputs) } } return nil diff --git a/sdks/go/pkg/beam/runners/prism/internal/stage.go b/sdks/go/pkg/beam/runners/prism/internal/stage.go index 9d1c8481d65e..d4abed293534 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/stage.go +++ b/sdks/go/pkg/beam/runners/prism/internal/stage.go @@ -275,7 +275,11 @@ progress: } func getSideInputs(t *pipepb.PTransform) (map[string]*pipepb.SideInput, error) { - if t.GetSpec().GetUrn() != urns.TransformParDo { + switch t.GetSpec().GetUrn() { + case urns.TransformParDo, urns.TransformProcessSizedElements, urns.TransformPairWithRestriction, urns.TransformSplitAndSize, urns.TransformTruncate: + // Intentionally empty since these are permitted to have side inputs. + default: + // Nothing else is allowed to have side inputs. return nil, nil } // TODO, memoize this, so we don't need to repeatedly unmarshal. @@ -326,7 +330,15 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng coders := map[string]*pipepb.Coder{} transforms := map[string]*pipepb.PTransform{} + pcollections := map[string]*pipepb.PCollection{} + + clonePColToBundle := func(pid string) *pipepb.PCollection { + col := proto.Clone(comps.GetPcollections()[pid]).(*pipepb.PCollection) + pcollections[pid] = col + return col + } + // Update coders for Stateful transforms. 
for _, tid := range stg.transforms { t := comps.GetTransforms()[tid] @@ -408,7 +420,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng sink2Col := map[string]string{} col2Coders := map[string]engine.PColInfo{} for _, o := range stg.outputs { - col := comps.GetPcollections()[o.Global] + col := clonePColToBundle(o.Global) wOutCid, err := makeWindowedValueCoder(o.Global, comps, coders) if err != nil { return fmt.Errorf("buildDescriptor: failed to handle coder on stage %v for output %+v, pcol %q %v:\n%w %v", stg.ID, o, o.Global, prototext.Format(col), err, stg.transforms) @@ -435,7 +447,8 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng var prepareSides []func(b *worker.B, watermark mtime.Time) for _, si := range stg.sideInputs { - col := comps.GetPcollections()[si.Global] + col := clonePColToBundle(si.Global) + oCID := col.GetCoderId() nCID, err := lpUnknownCoders(oCID, coders, comps.GetCoders()) if err != nil { @@ -444,7 +457,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng if oCID != nCID { // Add a synthetic PCollection set with the new coder. newGlobal := si.Global + "_prismside" - comps.GetPcollections()[newGlobal] = &pipepb.PCollection{ + pcollections[newGlobal] = &pipepb.PCollection{ DisplayData: col.GetDisplayData(), UniqueName: col.GetUniqueName(), CoderId: nCID, @@ -453,10 +466,11 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng } // Update side inputs to point to new PCollection with any replaced coders. transforms[si.Transform].GetInputs()[si.Local] = newGlobal + // TODO: replace si.Global with newGlobal? } - prepSide, err := handleSideInput(si, comps, coders, em) + prepSide, err := handleSideInput(si, comps, transforms, pcollections, coders, em) if err != nil { - slog.Error("buildDescriptor: handleSideInputs", err, slog.String("transformID", si.Transform)) + slog.Error("buildDescriptor: handleSideInputs", "error", err, slog.String("transformID", si.Transform)) return err } prepareSides = append(prepareSides, prepSide) @@ -467,7 +481,13 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng // coders used by side inputs to the coders map for the bundle, so // needs to be run for every ID. - col := comps.GetPcollections()[stg.primaryInput] + col := clonePColToBundle(stg.primaryInput) + if newCID, err := lpUnknownCoders(col.GetCoderId(), coders, comps.GetCoders()); err == nil && col.GetCoderId() != newCID { + col.CoderId = newCID + } else if err != nil { + return fmt.Errorf("buildDescriptor: couldn't rewrite coder %q for primary input pcollection %q: %w", col.GetCoderId(), stg.primaryInput, err) + } + wInCid, err := makeWindowedValueCoder(stg.primaryInput, comps, coders) if err != nil { return fmt.Errorf("buildDescriptor: failed to handle coder on stage %v for primary input, pcol %q %v:\n%w\n%v", stg.ID, stg.primaryInput, prototext.Format(col), err, stg.transforms) @@ -491,9 +511,14 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng stg.inputTransformID = stg.ID + "_source" transforms[stg.inputTransformID] = sourceTransform(stg.inputTransformID, portFor(wInCid, wk), stg.primaryInput) - // Add coders for internal collections. + // Update coders for internal collections, and add those collections to the bundle descriptor. 
for _, pid := range stg.internalCols { - lpUnknownCoders(comps.GetPcollections()[pid].GetCoderId(), coders, comps.GetCoders()) + col := clonePColToBundle(pid) + if newCID, err := lpUnknownCoders(col.GetCoderId(), coders, comps.GetCoders()); err == nil && col.GetCoderId() != newCID { + col.CoderId = newCID + } else if err != nil { + return fmt.Errorf("buildDescriptor: coder couldn't rewrite coder %q for internal pcollection %q: %w", col.GetCoderId(), pid, err) + } } // Add coders for all windowing strategies. // TODO: filter PCollections, filter windowing strategies by Pcollections instead. @@ -514,7 +539,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng Id: stg.ID, Transforms: transforms, WindowingStrategies: comps.GetWindowingStrategies(), - Pcollections: comps.GetPcollections(), + Pcollections: pcollections, Coders: coders, StateApiServiceDescriptor: &pipepb.ApiServiceDescriptor{ Url: wk.Endpoint(), @@ -537,8 +562,8 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, em *eng } // handleSideInput returns a closure that will look up the data for a side input appropriate for the given watermark. -func handleSideInput(link engine.LinkID, comps *pipepb.Components, coders map[string]*pipepb.Coder, em *engine.ElementManager) (func(b *worker.B, watermark mtime.Time), error) { - t := comps.GetTransforms()[link.Transform] +func handleSideInput(link engine.LinkID, comps *pipepb.Components, transforms map[string]*pipepb.PTransform, pcols map[string]*pipepb.PCollection, coders map[string]*pipepb.Coder, em *engine.ElementManager) (func(b *worker.B, watermark mtime.Time), error) { + t := transforms[link.Transform] sis, err := getSideInputs(t) if err != nil { return nil, err @@ -551,7 +576,7 @@ func handleSideInput(link engine.LinkID, comps *pipepb.Components, coders map[st slog.String("local", link.Local), slog.String("global", link.Global)) - col := comps.GetPcollections()[link.Global] + col := pcols[link.Global] // The returned coders are unused here, but they add the side input coders // to the stage components for use SDK side. @@ -575,7 +600,7 @@ func handleSideInput(link engine.LinkID, comps *pipepb.Components, coders map[st slog.String("sourceTransform", t.GetUniqueName()), slog.String("local", link.Local), slog.String("global", link.Global)) - col := comps.GetPcollections()[link.Global] + col := pcols[link.Global] kvc := comps.GetCoders()[col.GetCoderId()] if kvc.GetSpec().GetUrn() != urns.CoderKV { @@ -614,7 +639,7 @@ func handleSideInput(link engine.LinkID, comps *pipepb.Components, coders map[st }] = windowed }, nil default: - return nil, fmt.Errorf("local input %v (global %v) uses accesspattern %v", link.Local, link.Global, si.GetAccessPattern().GetUrn()) + return nil, fmt.Errorf("local input %v (global %v) uses accesspattern %v", link.Local, link.Global, prototext.Format(si.GetAccessPattern())) } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go index d25c173e8c2f..f9ec03793488 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go @@ -452,7 +452,7 @@ func (wk *W) State(state fnpb.BeamFnState_StateServer) error { // TODO: move data handling to be pcollection based. 
key := req.GetStateKey() - slog.Debug("StateRequest_Get", prototext.Format(req), "bundle", b) + slog.Debug("StateRequest_Get", "request", prototext.Format(req), "bundle", b) var data [][]byte switch key.GetType().(type) { case *fnpb.StateKey_IterableSideInput_: diff --git a/sdks/go/pkg/beam/runners/universal/runnerlib/job.go b/sdks/go/pkg/beam/runners/universal/runnerlib/job.go index 4e50661b3db8..7d6a3027e47e 100644 --- a/sdks/go/pkg/beam/runners/universal/runnerlib/job.go +++ b/sdks/go/pkg/beam/runners/universal/runnerlib/job.go @@ -28,7 +28,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/log" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" ) // JobOptions capture the various options for submitting jobs @@ -152,7 +151,7 @@ func WaitForCompletion(ctx context.Context, client jobpb.JobServiceClient, jobID } default: - return errors.Errorf("unexpected job update: %v", proto.MarshalTextString(msg)) + return errors.Errorf("unexpected job update: %v", msg.String()) } } } diff --git a/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go b/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go index d5cc6aa7327a..85d6fdc7e2ca 100644 --- a/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go +++ b/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go @@ -29,8 +29,8 @@ import ( jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/proto" "google.golang.org/grpc" + "google.golang.org/protobuf/proto" ) // Stage stages the worker binary and any additional files to the given diff --git a/sdks/go/pkg/beam/runners/universal/universal.go b/sdks/go/pkg/beam/runners/universal/universal.go index 8af9e91e1e15..c63175c58578 100644 --- a/sdks/go/pkg/beam/runners/universal/universal.go +++ b/sdks/go/pkg/beam/runners/universal/universal.go @@ -32,7 +32,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal/extworker" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal/runnerlib" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/vet" - "github.com/golang/protobuf/proto" ) func init() { @@ -93,7 +92,7 @@ func Execute(ctx context.Context, p *beam.Pipeline) (beam.PipelineResult, error) return nil, errors.WithContextf(err, "generating model pipeline") } - log.Info(ctx, proto.MarshalTextString(pipeline)) + log.Info(ctx, pipeline.String()) opt := &runnerlib.JobOptions{ Name: jobopts.GetJobName(), diff --git a/sdks/go/pkg/beam/transforms/xlang/schema/external.go b/sdks/go/pkg/beam/transforms/xlang/schema/external.go index 75be90cbe7b3..55a858b9cf9e 100644 --- a/sdks/go/pkg/beam/transforms/xlang/schema/external.go +++ b/sdks/go/pkg/beam/transforms/xlang/schema/external.go @@ -20,7 +20,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/xlangx" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) const schemaTransformURN = "beam:expansion:payload:schematransform:v1" diff --git a/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml 
b/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml index 3c4cfdfbc6f5..5cee5d2f33e2 100644 --- a/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml +++ b/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml @@ -119,6 +119,14 @@ page at http://checkstyle.sourceforge.net/config.html --> + + + + + + + + diff --git a/sdks/java/container/boot.go b/sdks/java/container/boot.go index 14e2e4311b45..c23e50dcf1b0 100644 --- a/sdks/java/container/boot.go +++ b/sdks/java/container/boot.go @@ -35,7 +35,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/syscallx" - "github.com/golang/protobuf/proto" ) var ( @@ -126,12 +125,12 @@ func main() { if err := tools.MakePipelineOptionsFileAndEnvVar(options); err != nil { logger.Fatalf(ctx, "Failed to load pipeline options to worker: %v", err) } - os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint})) - os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *controlEndpoint})) + os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint}).String()) + os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *controlEndpoint}).String()) os.Setenv("RUNNER_CAPABILITIES", strings.Join(info.GetRunnerCapabilities(), " ")) if info.GetStatusEndpoint() != nil { - os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(info.GetStatusEndpoint())) + os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", info.GetStatusEndpoint().String()) } const jarsDir = "/opt/apache/beam/jars" diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java index 7890a9f74b94..6166a562bf2d 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java @@ -17,17 +17,17 @@ */ package org.apache.beam.sdk.metrics; +import java.util.HashSet; +import java.util.Set; + /** * Standard collection of metrics used to record source and sinks information for lineage tracking. */ public class Lineage { - public static final String LINEAGE_NAMESPACE = "lineage"; - public static final String SOURCE_METRIC_NAME = "sources"; - public static final String SINK_METRIC_NAME = "sinks"; - - private static final StringSet SOURCES = Metrics.stringSet(LINEAGE_NAMESPACE, SOURCE_METRIC_NAME); - private static final StringSet SINKS = Metrics.stringSet(LINEAGE_NAMESPACE, SINK_METRIC_NAME); + private static final StringSet SOURCES = + Metrics.stringSet(LINEAGE_NAMESPACE, Type.SOURCE.toString()); + private static final StringSet SINKS = Metrics.stringSet(LINEAGE_NAMESPACE, Type.SINK.toString()); /** {@link StringSet} representing sources and optionally side inputs. */ public static StringSet getSources() { @@ -38,4 +38,35 @@ public static StringSet getSources() { public static StringSet getSinks() { return SINKS; } + + /** Query {@link StringSet} metrics from {@link MetricResults}. 
*/ + public static Set query(MetricResults results, Type type) { + MetricsFilter filter = + MetricsFilter.builder() + .addNameFilter(MetricNameFilter.named(LINEAGE_NAMESPACE, type.toString())) + .build(); + Set result = new HashSet<>(); + for (MetricResult metrics : results.queryMetrics(filter).getStringSets()) { + result.addAll(metrics.getCommitted().getStringSet()); + result.addAll(metrics.getAttempted().getStringSet()); + } + return result; + } + + /** Lineage metrics resource types. */ + public enum Type { + SOURCE("sources"), + SINK("sinks"); + + private final String name; + + Type(String name) { + this.name = name; + } + + @Override + public String toString() { + return name; + } + } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Metrics.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Metrics.java index 916e18647c34..a963015e98a7 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Metrics.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Metrics.java @@ -93,22 +93,23 @@ public static Gauge gauge(Class namespace, String name) { return new DelegatingGauge(MetricName.named(namespace, name)); } - /** - * Create a metric that can have its new value set, and is aggregated by taking the last reported - * value. - */ + /** Create a metric that accumulates and reports set of unique string values. */ public static StringSet stringSet(String namespace, String name) { return new DelegatingStringSet(MetricName.named(namespace, name)); } - /** - * Create a metric that can have its new value set, and is aggregated by taking the last reported - * value. - */ + /** Create a metric that accumulates and reports set of unique string values. */ public static StringSet stringSet(Class namespace, String name) { return new DelegatingStringSet(MetricName.named(namespace, name)); } + /* + * A dedicated namespace for client throttling time. User DoFn can increment this metrics and then + * runner will put back pressure on scaling decision, if supported. + */ + public static final String THROTTLE_TIME_NAMESPACE = "beam-throttling-metrics"; + public static final String THROTTLE_TIME_COUNTER_NAME = "throttling-msecs"; + /** * Implementation of {@link Distribution} that delegates to the instance for the current context. */ diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java index 7f8f2a436433..3421bb4afc85 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java @@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.util.StringUtils; import org.checkerframework.checker.nullness.qual.NonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; @@ -134,10 +135,14 @@ public void close() throws IOException { if (container == null && REPORTED_MISSING_CONTAINER.compareAndSet(false, true)) { if (isMetricsSupported()) { LOG.error( - "Unable to update metrics on the current thread. " - + "Most likely caused by using metrics outside the managed work-execution thread."); + "Unable to update metrics on the current thread. 
Most likely caused by using metrics " + + "outside the managed work-execution thread:\n {}", + StringUtils.arrayToNewlines(Thread.currentThread().getStackTrace(), 10)); } else { - LOG.warn("Reporting metrics are not supported in the current execution environment."); + // rate limiting this log as it can be emitted each time metrics incremented + LOG.warn( + "Reporting metrics are not supported in the current execution environment:\n {}", + StringUtils.arrayToNewlines(Thread.currentThread().getStackTrace(), 10)); } } return container; diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AutoValueSchema.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AutoValueSchema.java index abd9bc46bd46..5ccfe39b92af 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AutoValueSchema.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AutoValueSchema.java @@ -39,20 +39,20 @@ "nullness", // TODO(https://github.com/apache/beam/issues/20497) "rawtypes" }) -public class AutoValueSchema extends GetterBasedSchemaProvider { +public class AutoValueSchema extends GetterBasedSchemaProviderV2 { /** {@link FieldValueTypeSupplier} that's based on AutoValue getters. */ @VisibleForTesting public static class AbstractGetterTypeSupplier implements FieldValueTypeSupplier { public static final AbstractGetterTypeSupplier INSTANCE = new AbstractGetterTypeSupplier(); @Override - public List get(Class clazz) { + public List get(TypeDescriptor typeDescriptor) { // If the generated class is passed in, we want to look at the base class to find the getters. - Class targetClass = AutoValueUtils.getBaseAutoValueClass(clazz); + TypeDescriptor targetTypeDescriptor = AutoValueUtils.getBaseAutoValueClass(typeDescriptor); List methods = - ReflectUtils.getMethods(targetClass).stream() + ReflectUtils.getMethods(targetTypeDescriptor.getRawType()).stream() .filter(ReflectUtils::isGetter) // All AutoValue getters are marked abstract. .filter(m -> Modifier.isAbstract(m.getModifiers())) @@ -89,9 +89,10 @@ private static void validateFieldNumbers(List types) } @Override - public List fieldValueGetters(Class targetClass, Schema schema) { + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { return JavaBeanUtils.getGetters( - targetClass, + targetTypeDescriptor, schema, AbstractGetterTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); @@ -99,17 +100,19 @@ public List fieldValueGetters(Class targetClass, Schema sch @Override public List fieldValueTypeInformations( - Class targetClass, Schema schema) { - return JavaBeanUtils.getFieldTypes(targetClass, schema, AbstractGetterTypeSupplier.INSTANCE); + TypeDescriptor targetTypeDescriptor, Schema schema) { + return JavaBeanUtils.getFieldTypes( + targetTypeDescriptor, schema, AbstractGetterTypeSupplier.INSTANCE); } @Override - public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { // If a static method is marked with @SchemaCreate, use that. 
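As a concrete sketch of the new metrics surface above (illustrative only, not part of the patch; ThrottleAwareFn and callRateLimitedService are hypothetical names), a DoFn can report throttled time under the new namespace constants, and lineage recorded by IOs can be queried back from a finished pipeline:

import java.util.Set;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Lineage;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;

class ThrottleAwareFn extends DoFn<String, String> {
  // Counter in the dedicated throttling namespace; a runner that recognizes it
  // may feed the reported milliseconds into its autoscaling decisions.
  private final Counter throttledMsecs =
      Metrics.counter(Metrics.THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME);

  @ProcessElement
  public void processElement(@Element String element, OutputReceiver<String> out) {
    long waitedMillis = callRateLimitedService(element); // hypothetical external call
    throttledMsecs.inc(waitedMillis);
    out.output(element);
  }

  private long callRateLimitedService(String element) {
    return 0L; // placeholder: time spent blocked on a rate-limited backend
  }

  // After the pipeline finishes, lineage recorded via Lineage.getSources()/getSinks()
  // can be read back with the new query helper.
  static void printLineage(PipelineResult result) {
    Set<String> sources = Lineage.query(result.metrics(), Lineage.Type.SOURCE);
    Set<String> sinks = Lineage.query(result.metrics(), Lineage.Type.SINK);
    System.out.println("sources=" + sources + " sinks=" + sinks);
  }
}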
- Method annotated = ReflectUtils.getAnnotatedCreateMethod(targetClass); + Method annotated = ReflectUtils.getAnnotatedCreateMethod(targetTypeDescriptor.getRawType()); if (annotated != null) { return JavaBeanUtils.getStaticCreator( - targetClass, + targetTypeDescriptor, annotated, schema, AbstractGetterTypeSupplier.INSTANCE, @@ -119,7 +122,8 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche // Try to find a generated builder class. If one exists, use that to generate a // SchemaTypeCreator for creating AutoValue objects. SchemaUserTypeCreator creatorFactory = - AutoValueUtils.getBuilderCreator(targetClass, schema, AbstractGetterTypeSupplier.INSTANCE); + AutoValueUtils.getBuilderCreator( + targetTypeDescriptor.getRawType(), schema, AbstractGetterTypeSupplier.INSTANCE); if (creatorFactory != null) { return creatorFactory; } @@ -128,9 +132,10 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche // class. Use that for creating AutoValue objects. creatorFactory = AutoValueUtils.getConstructorCreator( - targetClass, schema, AbstractGetterTypeSupplier.INSTANCE); + targetTypeDescriptor, schema, AbstractGetterTypeSupplier.INSTANCE); if (creatorFactory == null) { - throw new RuntimeException("Could not find a way to create AutoValue class " + targetClass); + throw new RuntimeException( + "Could not find a way to create AutoValue class " + targetTypeDescriptor); } return creatorFactory; @@ -139,6 +144,6 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche @Override public @Nullable Schema schemaFor(TypeDescriptor typeDescriptor) { return JavaBeanUtils.schemaFromJavaBeanClass( - typeDescriptor.getRawType(), AbstractGetterTypeSupplier.INSTANCE); + typeDescriptor, AbstractGetterTypeSupplier.INSTANCE); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/CachingFactory.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/CachingFactory.java index 2c140bd1dfef..8725833bc1da 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/CachingFactory.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/CachingFactory.java @@ -19,6 +19,7 @@ import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; +import org.apache.beam.sdk.values.TypeDescriptor; import org.checkerframework.checker.nullness.qual.Nullable; /** @@ -36,7 +37,7 @@ "rawtypes" }) public class CachingFactory implements Factory { - private transient @Nullable ConcurrentHashMap cache = null; + private transient @Nullable ConcurrentHashMap, CreatedT> cache = null; private final Factory innerFactory; @@ -45,16 +46,16 @@ public CachingFactory(Factory innerFactory) { } @Override - public CreatedT create(Class clazz, Schema schema) { + public CreatedT create(TypeDescriptor typeDescriptor, Schema schema) { if (cache == null) { cache = new ConcurrentHashMap<>(); } - CreatedT cached = cache.get(clazz); + CreatedT cached = cache.get(typeDescriptor); if (cached != null) { return cached; } - cached = innerFactory.create(clazz, schema); - cache.put(clazz, cached); + cached = innerFactory.create(typeDescriptor, schema); + cache.put(typeDescriptor, cached); return cached; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Factory.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Factory.java index f9da36b97c77..f302f20cfb64 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Factory.java +++ 
b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/Factory.java @@ -19,9 +19,10 @@ import java.io.Serializable; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.values.TypeDescriptor; /** A Factory interface for schema-related objects for a specific Java type. */ @Internal public interface Factory extends Serializable { - T create(Class clazz, Schema schema); + T create(TypeDescriptor typeDescriptor, Schema schema); } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/FromRowUsingCreator.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/FromRowUsingCreator.java index 53c098599c36..b839a19a8177 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/FromRowUsingCreator.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/FromRowUsingCreator.java @@ -33,6 +33,7 @@ import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.RowWithGetters; +import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Collections2; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -47,23 +48,28 @@ "rawtypes" }) class FromRowUsingCreator implements SerializableFunction, Function { - private final Class clazz; + private final TypeDescriptor typeDescriptor; private final GetterBasedSchemaProvider schemaProvider; private final Factory schemaTypeCreatorFactory; @SuppressFBWarnings("SE_TRANSIENT_FIELD_NOT_RESTORED") private transient @MonotonicNonNull Function[] fieldConverters; - public FromRowUsingCreator(Class clazz, GetterBasedSchemaProvider schemaProvider) { - this(clazz, schemaProvider, new CachingFactory<>(schemaProvider::schemaTypeCreator), null); + public FromRowUsingCreator( + TypeDescriptor typeDescriptor, GetterBasedSchemaProvider schemaProvider) { + this( + typeDescriptor, + schemaProvider, + new CachingFactory<>(schemaProvider::schemaTypeCreator), + null); } private FromRowUsingCreator( - Class clazz, + TypeDescriptor typeDescriptor, GetterBasedSchemaProvider schemaProvider, Factory schemaTypeCreatorFactory, @Nullable Function[] fieldConverters) { - this.clazz = clazz; + this.typeDescriptor = typeDescriptor; this.schemaProvider = schemaProvider; this.schemaTypeCreatorFactory = schemaTypeCreatorFactory; this.fieldConverters = fieldConverters; @@ -77,7 +83,7 @@ public T apply(Row row) { } if (row instanceof RowWithGetters) { Object target = ((RowWithGetters) row).getGetterTarget(); - if (target.getClass().equals(clazz)) { + if (target.getClass().equals(typeDescriptor.getRawType())) { // Efficient path: simply extract the underlying object instead of creating a new one. 
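At the pipeline level, the row-to-user-type path that FromRowUsingCreator implements is usually reached through the Convert transform; a minimal sketch under that assumption (MyPojo and RowConversionSketch are hypothetical names):

import org.apache.beam.sdk.schemas.JavaFieldSchema;
import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
import org.apache.beam.sdk.schemas.transforms.Convert;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

class RowConversionSketch {
  // Hypothetical schema-inferred type; any class with a registered schema provider works.
  @DefaultSchema(JavaFieldSchema.class)
  public static class MyPojo {
    public String name;
    public int count;
  }

  // Convert.fromRows resolves the provider's fromRowFunction, which is backed by
  // FromRowUsingCreator (now constructed from a TypeDescriptor rather than a Class).
  static PCollection<MyPojo> toPojos(PCollection<Row> rows) {
    return rows.apply(Convert.fromRows(MyPojo.class));
  }
}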
return (T) target; } @@ -91,7 +97,8 @@ public T apply(Row row) { for (int i = 0; i < row.getFieldCount(); ++i) { params[i] = fieldConverters[i].apply(row.getValue(i)); } - SchemaUserTypeCreator creator = schemaTypeCreatorFactory.create(clazz, row.getSchema()); + SchemaUserTypeCreator creator = + schemaTypeCreatorFactory.create(typeDescriptor, row.getSchema()); return (T) creator.create(params); } @@ -99,13 +106,15 @@ private synchronized void initFieldConverters(Schema schema) { if (fieldConverters == null) { CachingFactory> typeFactory = new CachingFactory<>(schemaProvider::fieldValueTypeInformations); - fieldConverters = fieldConverters(clazz, schema, typeFactory); + fieldConverters = fieldConverters(typeDescriptor, schema, typeFactory); } } private Function[] fieldConverters( - Class clazz, Schema schema, Factory> typeFactory) { - List typeInfos = typeFactory.create(clazz, schema); + TypeDescriptor typeDescriptor, + Schema schema, + Factory> typeFactory) { + List typeInfos = typeFactory.create(typeDescriptor, schema); checkState( typeInfos.size() == schema.getFieldCount(), "Did not have a matching number of type informations and fields."); @@ -133,10 +142,9 @@ private Function fieldConverter( if (!needsConversion(type)) { return FieldConverter.IDENTITY; } else if (TypeName.ROW.equals(type.getTypeName())) { - Function[] converters = - fieldConverters(typeInfo.getRawType(), type.getRowSchema(), typeFactory); + Function[] converters = fieldConverters(typeInfo.getType(), type.getRowSchema(), typeFactory); return new FromRowUsingCreator( - typeInfo.getRawType(), schemaProvider, schemaTypeCreatorFactory, converters); + typeInfo.getType(), schemaProvider, schemaTypeCreatorFactory, converters); } else if (TypeName.ARRAY.equals(type.getTypeName())) { return new ConvertCollection( fieldConverter(type.getCollectionElementType(), typeInfo.getElementType(), typeFactory)); @@ -271,11 +279,11 @@ public boolean equals(@Nullable Object o) { return false; } FromRowUsingCreator that = (FromRowUsingCreator) o; - return clazz.equals(that.clazz) && schemaProvider.equals(that.schemaProvider); + return typeDescriptor.equals(that.typeDescriptor) && schemaProvider.equals(that.schemaProvider); } @Override public int hashCode() { - return Objects.hash(clazz, schemaProvider); + return Objects.hash(typeDescriptor, schemaProvider); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProvider.java index 2b697bebd815..ce5be71933b8 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProvider.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProvider.java @@ -41,22 +41,77 @@ /** * A {@link SchemaProvider} base class that vends schemas and rows based on {@link * FieldValueGetter}s. + * + * @deprecated new implementations should extend the {@link GetterBasedSchemaProviderV2} class' + * methods which receive {@link TypeDescriptor}s instead of ordinary {@link Class}es as + * arguments, which permits to support generic type signatures during schema inference */ @SuppressWarnings({ "nullness", // TODO(https://github.com/apache/beam/issues/20497) "rawtypes" }) +@Deprecated public abstract class GetterBasedSchemaProvider implements SchemaProvider { - /** Implementing class should override to return FieldValueGetters. */ + + /** + * Implementing class should override to return FieldValueGetters. 
+ * + * @deprecated new implementations should override {@link #fieldValueGetters(TypeDescriptor, + * Schema)} and make this method throw an {@link UnsupportedOperationException} + */ + @Deprecated public abstract List fieldValueGetters(Class targetClass, Schema schema); - /** Implementing class should override to return a list of type-informations. */ + /** + * Delegates to the {@link #fieldValueGetters(Class, Schema)} for backwards compatibility, + * override it if you want to use the richer type signature contained in the {@link + * TypeDescriptor} not subject to the type erasure. + */ + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { + return fieldValueGetters(targetTypeDescriptor.getRawType(), schema); + } + + /** + * Implementing class should override to return a list of type-informations. + * + * @deprecated new implementations should override {@link + * #fieldValueTypeInformations(TypeDescriptor, Schema)} and make this method throw an {@link + * UnsupportedOperationException} + */ + @Deprecated public abstract List fieldValueTypeInformations( Class targetClass, Schema schema); - /** Implementing class should override to return a constructor. */ + /** + * Delegates to the {@link #fieldValueTypeInformations(Class, Schema)} for backwards + * compatibility, override it if you want to use the richer type signature contained in the {@link + * TypeDescriptor} not subject to the type erasure. + */ + public List fieldValueTypeInformations( + TypeDescriptor targetTypeDescriptor, Schema schema) { + return fieldValueTypeInformations(targetTypeDescriptor.getRawType(), schema); + } + + /** + * Implementing class should override to return a constructor. + * + * @deprecated new implementations should override {@link #schemaTypeCreator(TypeDescriptor, + * Schema)} and make this method throw an {@link UnsupportedOperationException} + */ + @Deprecated public abstract SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema); + /** + * Delegates to the {@link #schemaTypeCreator(Class, Schema)} for backwards compatibility, + * override it if you want to use the richer type signature contained in the {@link + * TypeDescriptor} not subject to the type erasure. 
+ */ + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { + return schemaTypeCreator(targetTypeDescriptor.getRawType(), schema); + } + private class ToRowWithValueGetters implements SerializableFunction { private final Schema schema; private final Factory> getterFactory; @@ -113,8 +168,7 @@ public SerializableFunction toRowFunction(TypeDescriptor typeDesc @Override @SuppressWarnings("unchecked") public SerializableFunction fromRowFunction(TypeDescriptor typeDescriptor) { - Class clazz = (Class) typeDescriptor.getType(); - return new FromRowUsingCreator<>(clazz, this); + return new FromRowUsingCreator<>(typeDescriptor, this); } @Override @@ -141,8 +195,8 @@ static Factory> of(Factory> getter } @Override - public List create(Class clazz, Schema schema) { - List getters = gettersFactory.create(clazz, schema); + public List create(TypeDescriptor typeDescriptor, Schema schema) { + List getters = gettersFactory.create(typeDescriptor, schema); List rowGetters = new ArrayList<>(getters.size()); for (int i = 0; i < getters.size(); i++) { rowGetters.add(rowValueGetter(getters.get(i), schema.getField(i).getType())); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProviderV2.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProviderV2.java new file mode 100644 index 000000000000..de31f9947c36 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/GetterBasedSchemaProviderV2.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas; + +import java.util.List; +import org.apache.beam.sdk.values.TypeDescriptor; + +/** + * A newer version of {@link GetterBasedSchemaProvider}, which works with {@link TypeDescriptor}s, + * and which by default delegates the old, {@link Class} based methods, to the new ones. 
+ */ +@SuppressWarnings("rawtypes") +public abstract class GetterBasedSchemaProviderV2 extends GetterBasedSchemaProvider { + @Override + public List fieldValueGetters(Class targetClass, Schema schema) { + return fieldValueGetters(TypeDescriptor.of(targetClass), schema); + } + + @Override + public abstract List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema); + + @Override + public List fieldValueTypeInformations( + Class targetClass, Schema schema) { + return fieldValueTypeInformations(TypeDescriptor.of(targetClass), schema); + } + + @Override + public abstract List fieldValueTypeInformations( + TypeDescriptor targetTypeDescriptor, Schema schema); + + @Override + public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { + return schemaTypeCreator(TypeDescriptor.of(targetClass), schema); + } + + @Override + public abstract SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema); +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaBeanSchema.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaBeanSchema.java index 7024e8be86cf..a9cf01c52057 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaBeanSchema.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaBeanSchema.java @@ -53,16 +53,16 @@ "nullness", // TODO(https://github.com/apache/beam/issues/20497) "rawtypes" }) -public class JavaBeanSchema extends GetterBasedSchemaProvider { +public class JavaBeanSchema extends GetterBasedSchemaProviderV2 { /** {@link FieldValueTypeSupplier} that's based on getter methods. */ @VisibleForTesting public static class GetterTypeSupplier implements FieldValueTypeSupplier { public static final GetterTypeSupplier INSTANCE = new GetterTypeSupplier(); @Override - public List get(Class clazz) { + public List get(TypeDescriptor typeDescriptor) { List methods = - ReflectUtils.getMethods(clazz).stream() + ReflectUtils.getMethods(typeDescriptor.getRawType()).stream() .filter(ReflectUtils::isGetter) .filter(m -> !m.isAnnotationPresent(SchemaIgnore.class)) .collect(Collectors.toList()); @@ -110,8 +110,8 @@ public static class SetterTypeSupplier implements FieldValueTypeSupplier { private static final SetterTypeSupplier INSTANCE = new SetterTypeSupplier(); @Override - public List get(Class clazz) { - return ReflectUtils.getMethods(clazz).stream() + public List get(TypeDescriptor typeDescriptor) { + return ReflectUtils.getMethods(typeDescriptor.getRawType()).stream() .filter(ReflectUtils::isSetter) .filter(m -> !m.isAnnotationPresent(SchemaIgnore.class)) .map(FieldValueTypeInformation::forSetter) @@ -120,19 +120,22 @@ public List get(Class clazz) { if (t.getMethod().getAnnotation(SchemaFieldNumber.class) != null) { throw new RuntimeException( String.format( - "@SchemaFieldNumber can only be used on getters in Java Beans. Found on setter '%s'", + "@SchemaFieldNumber can only be used on getters in Java Beans. Found on" + + " setter '%s'", t.getMethod().getName())); } if (t.getMethod().getAnnotation(SchemaFieldName.class) != null) { throw new RuntimeException( String.format( - "@SchemaFieldName can only be used on getters in Java Beans. Found on setter '%s'", + "@SchemaFieldName can only be used on getters in Java Beans. 
Found on" + + " setter '%s'", t.getMethod().getName())); } if (t.getMethod().getAnnotation(SchemaCaseFormat.class) != null) { throw new RuntimeException( String.format( - "@SchemaCaseFormat can only be used on getters in Java Beans. Found on setter '%s'", + "@SchemaCaseFormat can only be used on getters in Java Beans. Found on" + + " setter '%s'", t.getMethod().getName())); } return t; @@ -154,40 +157,44 @@ public boolean equals(@Nullable Object obj) { @Override public Schema schemaFor(TypeDescriptor typeDescriptor) { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass( - typeDescriptor.getRawType(), GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass(typeDescriptor, GetterTypeSupplier.INSTANCE); // If there are no creator methods, then validate that we have setters for every field. // Otherwise, we will have no way of creating instances of the class. if (ReflectUtils.getAnnotatedCreateMethod(typeDescriptor.getRawType()) == null && ReflectUtils.getAnnotatedConstructor(typeDescriptor.getRawType()) == null) { JavaBeanUtils.validateJavaBean( - GetterTypeSupplier.INSTANCE.get(typeDescriptor.getRawType(), schema), - SetterTypeSupplier.INSTANCE.get(typeDescriptor.getRawType(), schema), + GetterTypeSupplier.INSTANCE.get(typeDescriptor, schema), + SetterTypeSupplier.INSTANCE.get(typeDescriptor, schema), schema); } return schema; } @Override - public List fieldValueGetters(Class targetClass, Schema schema) { + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { return JavaBeanUtils.getGetters( - targetClass, schema, GetterTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); + targetTypeDescriptor, + schema, + GetterTypeSupplier.INSTANCE, + new DefaultTypeConversionsFactory()); } @Override public List fieldValueTypeInformations( - Class targetClass, Schema schema) { - return JavaBeanUtils.getFieldTypes(targetClass, schema, GetterTypeSupplier.INSTANCE); + TypeDescriptor targetTypeDescriptor, Schema schema) { + return JavaBeanUtils.getFieldTypes(targetTypeDescriptor, schema, GetterTypeSupplier.INSTANCE); } @Override - public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { // If a static method is marked with @SchemaCreate, use that. - Method annotated = ReflectUtils.getAnnotatedCreateMethod(targetClass); + Method annotated = ReflectUtils.getAnnotatedCreateMethod(targetTypeDescriptor.getRawType()); if (annotated != null) { return JavaBeanUtils.getStaticCreator( - targetClass, + targetTypeDescriptor, annotated, schema, GetterTypeSupplier.INSTANCE, @@ -195,10 +202,11 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche } // If a Constructor was tagged with @SchemaCreate, invoke that constructor. 
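For reference, a sketch of the kind of type JavaBeanSchema infers over, with field-level annotations on getters only, as the tightened setter error messages above insist (Order and its fields are hypothetical names):

import org.apache.beam.sdk.schemas.JavaBeanSchema;
import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
import org.apache.beam.sdk.schemas.annotations.SchemaFieldName;

@DefaultSchema(JavaBeanSchema.class)
public class Order {
  private String id;
  private double amount;

  public Order() {}

  // Field-level annotations such as @SchemaFieldName belong on getters;
  // placing them on setters triggers the errors shown above.
  @SchemaFieldName("order_id")
  public String getId() { return id; }

  public void setId(String id) { this.id = id; }

  public double getAmount() { return amount; }

  public void setAmount(double amount) { this.amount = amount; }
}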
- Constructor constructor = ReflectUtils.getAnnotatedConstructor(targetClass); + Constructor constructor = + ReflectUtils.getAnnotatedConstructor(targetTypeDescriptor.getRawType()); if (constructor != null) { return JavaBeanUtils.getConstructorCreator( - targetClass, + targetTypeDescriptor, constructor, schema, GetterTypeSupplier.INSTANCE, @@ -208,15 +216,18 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche // Else try to make a setter-based creator Factory setterBasedFactory = new SetterBasedCreatorFactory(new JavaBeanSetterFactory()); - return setterBasedFactory.create(targetClass, schema); + return setterBasedFactory.create(targetTypeDescriptor, schema); } /** A factory for creating {@link FieldValueSetter} objects for a JavaBean object. */ private static class JavaBeanSetterFactory implements Factory> { @Override - public List create(Class targetClass, Schema schema) { + public List create(TypeDescriptor targetTypeDescriptor, Schema schema) { return JavaBeanUtils.getSetters( - targetClass, schema, SetterTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); + targetTypeDescriptor, + schema, + SetterTypeSupplier.INSTANCE, + new DefaultTypeConversionsFactory()); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaFieldSchema.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaFieldSchema.java index 16b96f1c7ae1..21f07c47b47f 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaFieldSchema.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/JavaFieldSchema.java @@ -50,16 +50,16 @@ * on the schema. */ @SuppressWarnings({"nullness", "rawtypes"}) -public class JavaFieldSchema extends GetterBasedSchemaProvider { +public class JavaFieldSchema extends GetterBasedSchemaProviderV2 { /** {@link FieldValueTypeSupplier} that's based on public fields. */ @VisibleForTesting public static class JavaFieldTypeSupplier implements FieldValueTypeSupplier { public static final JavaFieldTypeSupplier INSTANCE = new JavaFieldTypeSupplier(); @Override - public List get(Class clazz) { + public List get(TypeDescriptor typeDescriptor) { List fields = - ReflectUtils.getFields(clazz).stream() + ReflectUtils.getFields(typeDescriptor.getRawType()).stream() .filter(m -> !m.isAnnotationPresent(SchemaIgnore.class)) .collect(Collectors.toList()); List types = Lists.newArrayListWithCapacity(fields.size()); @@ -71,8 +71,8 @@ public List get(Class clazz) { // If there are no creators registered, then make sure none of the schema fields are final, // as we (currently) have no way of creating classes in this case. - if (ReflectUtils.getAnnotatedCreateMethod(clazz) == null - && ReflectUtils.getAnnotatedConstructor(clazz) == null) { + if (ReflectUtils.getAnnotatedCreateMethod(typeDescriptor.getRawType()) == null + && ReflectUtils.getAnnotatedConstructor(typeDescriptor.getRawType()) == null) { Optional finalField = types.stream() .map(FieldValueTypeInformation::getField) @@ -81,7 +81,7 @@ public List get(Class clazz) { if (finalField.isPresent()) { throw new IllegalArgumentException( "Class " - + clazz + + typeDescriptor + " has final fields and no " + "registered creator. 
Cannot use as schema, as we don't know how to create this " + "object automatically"); @@ -111,29 +111,33 @@ private static void validateFieldNumbers(List types) @Override public Schema schemaFor(TypeDescriptor typeDescriptor) { - return POJOUtils.schemaFromPojoClass( - typeDescriptor.getRawType(), JavaFieldTypeSupplier.INSTANCE); + return POJOUtils.schemaFromPojoClass(typeDescriptor, JavaFieldTypeSupplier.INSTANCE); } @Override - public List fieldValueGetters(Class targetClass, Schema schema) { + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { return POJOUtils.getGetters( - targetClass, schema, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); + targetTypeDescriptor, + schema, + JavaFieldTypeSupplier.INSTANCE, + new DefaultTypeConversionsFactory()); } @Override public List fieldValueTypeInformations( - Class targetClass, Schema schema) { - return POJOUtils.getFieldTypes(targetClass, schema, JavaFieldTypeSupplier.INSTANCE); + TypeDescriptor targetTypeDescriptor, Schema schema) { + return POJOUtils.getFieldTypes(targetTypeDescriptor, schema, JavaFieldTypeSupplier.INSTANCE); } @Override - public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { // If a static method is marked with @SchemaCreate, use that. - Method annotated = ReflectUtils.getAnnotatedCreateMethod(targetClass); + Method annotated = ReflectUtils.getAnnotatedCreateMethod(targetTypeDescriptor.getRawType()); if (annotated != null) { return POJOUtils.getStaticCreator( - targetClass, + targetTypeDescriptor, annotated, schema, JavaFieldTypeSupplier.INSTANCE, @@ -141,10 +145,11 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche } // If a Constructor was tagged with @SchemaCreate, invoke that constructor. 
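The "final fields and no registered creator" error above is avoided by registering a creator; a hedged sketch of a POJO (hypothetical Point class) that JavaFieldSchema accepts despite final fields:

import org.apache.beam.sdk.schemas.JavaFieldSchema;
import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
import org.apache.beam.sdk.schemas.annotations.SchemaCreate;

@DefaultSchema(JavaFieldSchema.class)
public class Point {
  public final double x;
  public final double y;

  // With a @SchemaCreate constructor the provider knows how to build instances,
  // so the final-field check above does not reject the class.
  @SchemaCreate
  public Point(double x, double y) {
    this.x = x;
    this.y = y;
  }
}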
- Constructor constructor = ReflectUtils.getAnnotatedConstructor(targetClass); + Constructor constructor = + ReflectUtils.getAnnotatedConstructor(targetTypeDescriptor.getRawType()); if (constructor != null) { return POJOUtils.getConstructorCreator( - targetClass, + targetTypeDescriptor, constructor, schema, JavaFieldTypeSupplier.INSTANCE, @@ -152,6 +157,9 @@ public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema sche } return POJOUtils.getSetFieldCreator( - targetClass, schema, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); + targetTypeDescriptor, + schema, + JavaFieldTypeSupplier.INSTANCE, + new DefaultTypeConversionsFactory()); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SetterBasedCreatorFactory.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SetterBasedCreatorFactory.java index 7663651ae7c9..e7ded3c52af5 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SetterBasedCreatorFactory.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SetterBasedCreatorFactory.java @@ -19,6 +19,7 @@ import java.lang.reflect.InvocationTargetException; import java.util.List; +import org.apache.beam.sdk.values.TypeDescriptor; /** * A {@link Factory} that uses a default constructor and a list of setters to construct a {@link @@ -35,14 +36,14 @@ public SetterBasedCreatorFactory(Factory> setterFactory) } @Override - public SchemaUserTypeCreator create(Class clazz, Schema schema) { - List setters = setterFactory.create(clazz, schema); + public SchemaUserTypeCreator create(TypeDescriptor typeDescriptor, Schema schema) { + List setters = setterFactory.create(typeDescriptor, schema); return new SchemaUserTypeCreator() { @Override public Object create(Object... 
params) { Object object; try { - object = clazz.getDeclaredConstructor().newInstance(); + object = typeDescriptor.getRawType().getDeclaredConstructor().newInstance(); } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/annotations/DefaultSchema.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/annotations/DefaultSchema.java index ddebbeb2bffe..6f3e598f5314 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/annotations/DefaultSchema.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/annotations/DefaultSchema.java @@ -101,7 +101,7 @@ public ProviderAndDescriptor( try { return new ProviderAndDescriptor( providerClass.getDeclaredConstructor().newInstance(), - TypeDescriptor.of(clazz)); + typeDescriptor.getSupertype((Class) clazz)); } catch (NoSuchMethodException | InstantiationException | IllegalAccessException diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java index 2ec0a9a60cd6..54e2a595fa71 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java @@ -299,7 +299,7 @@ private static FunctionAndType createFunctionFromName(String name, String path) private static class EmptyFieldValueTypeSupplier implements org.apache.beam.sdk.schemas.utils.FieldValueTypeSupplier { @Override - public List get(Class clazz) { + public List get(TypeDescriptor typeDescriptor) { return Collections.emptyList(); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java index 25efaeae2a0e..2908171f5c02 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java @@ -19,6 +19,7 @@ import com.google.auto.service.AutoService; import com.google.auto.value.AutoValue; +import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.List; import java.util.Map; @@ -36,7 +37,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.slf4j.Logger; @@ -166,7 +166,7 @@ private static DoFn createDoFn(Level logLevel, String prefix, Schema r return new DoFn() { @ProcessElement public void processElement(@Element Row row, OutputReceiver out) { - String msg = prefix + new String(fn.apply(row), Charsets.UTF_8); + String msg = prefix + new String(fn.apply(row), StandardCharsets.UTF_8); // Looks like this is the best we can do. 
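The change in DefaultSchema above, from TypeDescriptor.of(clazz) to typeDescriptor.getSupertype(clazz), preserves generic type arguments that a raw Class cannot carry. A rough illustration (types chosen arbitrarily, not taken from the patch):

import java.util.List;
import org.apache.beam.sdk.values.TypeDescriptor;

class SupertypeSketch {
  static void demo() {
    TypeDescriptor<List<String>> listOfString = new TypeDescriptor<List<String>>() {};

    // Resolving the supertype keeps the type argument, roughly Iterable<String> ...
    TypeDescriptor<?> withArgs = listOfString.getSupertype(Iterable.class);

    // ... whereas a descriptor built from the raw Class alone is just Iterable.
    TypeDescriptor<?> rawOnly = TypeDescriptor.of(Iterable.class);

    System.out.println(withArgs);
    System.out.println(rawOnly);
  }
}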
// https://stackoverflow.com/questions/2621701/setting-log-level-of-message-at-runtime-in-slf4j switch (logLevel) { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AutoValueUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AutoValueUtils.java index dcbbf70888d3..d7fddd8abfed 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AutoValueUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AutoValueUtils.java @@ -71,18 +71,18 @@ "rawtypes" }) public class AutoValueUtils { - public static Class getBaseAutoValueClass(Class clazz) { + public static TypeDescriptor getBaseAutoValueClass(TypeDescriptor typeDescriptor) { // AutoValue extensions may be nested - while (clazz != null && clazz.getName().contains("AutoValue_")) { - clazz = clazz.getSuperclass(); + while (typeDescriptor != null && typeDescriptor.getRawType().getName().contains("AutoValue_")) { + typeDescriptor = TypeDescriptor.of(typeDescriptor.getRawType().getSuperclass()); } - return clazz; + return typeDescriptor; } - private static Class getAutoValueGenerated(Class clazz) { - String generatedClassName = getAutoValueGeneratedName(clazz.getName()); + private static TypeDescriptor getAutoValueGenerated(TypeDescriptor typeDescriptor) { + String generatedClassName = getAutoValueGeneratedName(typeDescriptor.getRawType().getName()); try { - return Class.forName(generatedClassName); + return TypeDescriptor.of(Class.forName(generatedClassName)); } catch (ClassNotFoundException e) { throw new IllegalStateException("AutoValue generated class not found: " + generatedClassName); } @@ -121,11 +121,14 @@ private static String getAutoValueGeneratedName(String baseClass) { * Try to find an accessible constructor for creating an AutoValue class. Otherwise return null. */ public static @Nullable SchemaUserTypeCreator getConstructorCreator( - Class clazz, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier) { - Class generatedClass = getAutoValueGenerated(clazz); - List schemaTypes = fieldValueTypeSupplier.get(clazz, schema); + TypeDescriptor typeDescriptor, + Schema schema, + FieldValueTypeSupplier fieldValueTypeSupplier) { + TypeDescriptor generatedTypeDescriptor = getAutoValueGenerated(typeDescriptor); + List schemaTypes = + fieldValueTypeSupplier.get(typeDescriptor, schema); Optional> constructor = - Arrays.stream(generatedClass.getDeclaredConstructors()) + Arrays.stream(generatedTypeDescriptor.getRawType().getDeclaredConstructors()) .filter(c -> !Modifier.isPrivate(c.getModifiers())) .filter(c -> matchConstructor(c, schemaTypes)) .findAny(); @@ -133,7 +136,7 @@ private static String getAutoValueGeneratedName(String baseClass) { .map( c -> JavaBeanUtils.getConstructorCreator( - generatedClass, + generatedTypeDescriptor, c, schema, fieldValueTypeSupplier, @@ -201,7 +204,8 @@ private static boolean matchConstructor( List setterMethods = Lists.newArrayList(); // The builder methods to call in order. 
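The AutoValueUtils plumbing above operates on classes of roughly this shape (hypothetical TrafficEvent, mirroring the style of the Beam docs); getConstructorCreator locates the generated AutoValue_ subclass's constructor, and getBuilderCreator the generated builder:

import com.google.auto.value.AutoValue;
import org.apache.beam.sdk.schemas.AutoValueSchema;
import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
import org.apache.beam.sdk.schemas.annotations.SchemaCreate;

@DefaultSchema(AutoValueSchema.class)
@AutoValue
public abstract class TrafficEvent {
  public abstract String getSegmentId();

  public abstract double getSpeed();

  // Optional: a @SchemaCreate factory is checked first and short-circuits the
  // constructor/builder search shown above.
  @SchemaCreate
  public static TrafficEvent create(String segmentId, double speed) {
    // AutoValue_TrafficEvent is generated at compile time by AutoValue.
    return new AutoValue_TrafficEvent(segmentId, speed);
  }
}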
- List schemaTypes = fieldValueTypeSupplier.get(clazz, schema); + List schemaTypes = + fieldValueTypeSupplier.get(TypeDescriptor.of(clazz), schema); for (FieldValueTypeInformation type : schemaTypes) { String autoValueFieldName = ReflectUtils.stripGetterPrefix(type.getMethod().getName()); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/FieldValueTypeSupplier.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/FieldValueTypeSupplier.java index d93456b21949..693997f64aa0 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/FieldValueTypeSupplier.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/FieldValueTypeSupplier.java @@ -21,6 +21,7 @@ import java.util.List; import org.apache.beam.sdk.schemas.FieldValueTypeInformation; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.values.TypeDescriptor; /** * A naming policy for schema fields. This maps a name from the class (field name or getter name) to @@ -28,7 +29,7 @@ */ public interface FieldValueTypeSupplier extends Serializable { /** Return all the FieldValueTypeInformations. */ - List get(Class clazz); + List get(TypeDescriptor typeDescriptor); /** * Return all the FieldValueTypeInformations. @@ -36,7 +37,7 @@ public interface FieldValueTypeSupplier extends Serializable { *
<p>
If the schema parameter is not null, then the returned list must be in the same order as * fields in the schema. */ - default List get(Class clazz, Schema schema) { - return StaticSchemaInference.sortBySchema(get(clazz), schema); + default List get(TypeDescriptor typeDescriptor, Schema schema) { + return StaticSchemaInference.sortBySchema(get(typeDescriptor), schema); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtils.java index 6573f25c66e2..911f79f6eeed 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtils.java @@ -51,8 +51,9 @@ import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.InjectPackageStrategy; import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.StaticFactoryMethodInstruction; import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversionsFactory; -import org.apache.beam.sdk.schemas.utils.ReflectUtils.ClassWithSchema; +import org.apache.beam.sdk.schemas.utils.ReflectUtils.TypeDescriptorWithSchema; import org.apache.beam.sdk.util.common.ReflectHelpers; +import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; /** A set of utilities to generate getter and setter classes for JavaBean objects. */ @@ -63,8 +64,8 @@ public class JavaBeanUtils { /** Create a {@link Schema} for a Java Bean class. */ public static Schema schemaFromJavaBeanClass( - Class clazz, FieldValueTypeSupplier fieldValueTypeSupplier) { - return StaticSchemaInference.schemaFromClass(clazz, fieldValueTypeSupplier); + TypeDescriptor typeDescriptor, FieldValueTypeSupplier fieldValueTypeSupplier) { + return StaticSchemaInference.schemaFromClass(typeDescriptor, fieldValueTypeSupplier); } private static final String CONSTRUCTOR_HELP_STRING = @@ -111,18 +112,21 @@ public static void validateJavaBean( // Static ByteBuddy instance used by all helpers. private static final ByteBuddy BYTE_BUDDY = new ByteBuddy(); - private static final Map> CACHED_FIELD_TYPES = - Maps.newConcurrentMap(); + private static final Map, List> + CACHED_FIELD_TYPES = Maps.newConcurrentMap(); public static List getFieldTypes( - Class clazz, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier) { + TypeDescriptor typeDescriptor, + Schema schema, + FieldValueTypeSupplier fieldValueTypeSupplier) { return CACHED_FIELD_TYPES.computeIfAbsent( - ClassWithSchema.create(clazz, schema), c -> fieldValueTypeSupplier.get(clazz, schema)); + TypeDescriptorWithSchema.create(typeDescriptor, schema), + c -> fieldValueTypeSupplier.get(typeDescriptor, schema)); } // The list of getters for a class is cached, so we only create the classes the first time // getSetters is called. - private static final Map> CACHED_GETTERS = + private static final Map, List> CACHED_GETTERS = Maps.newConcurrentMap(); /** @@ -131,14 +135,15 @@ public static List getFieldTypes( *
<p>
The returned list is ordered by the order of fields in the schema. */ public static List getGetters( - Class clazz, + TypeDescriptor typeDescriptor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_GETTERS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); return types.stream() .map(t -> createGetter(t, typeConversionsFactory)) .collect(Collectors.toList()); @@ -186,7 +191,7 @@ private static DynamicType.Builder implementGetterMethods( // The list of setters for a class is cached, so we only create the classes the first time // getSetters is called. - private static final Map> CACHED_SETTERS = + private static final Map, List> CACHED_SETTERS = Maps.newConcurrentMap(); /** @@ -195,14 +200,15 @@ private static DynamicType.Builder implementGetterMethods( *
<p>
The returned list is ordered by the order of fields in the schema. */ public static List getSetters( - Class clazz, + TypeDescriptor typeDescriptor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_SETTERS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); return types.stream() .map(t -> createSetter(t, typeConversionsFactory)) .collect(Collectors.toList()); @@ -250,21 +256,22 @@ private static DynamicType.Builder implementSetterMethods( // The list of constructors for a class is cached, so we only create the classes the first time // getConstructor is called. - public static final Map CACHED_CREATORS = + public static final Map, SchemaUserTypeCreator> CACHED_CREATORS = Maps.newConcurrentMap(); public static SchemaUserTypeCreator getConstructorCreator( - Class clazz, + TypeDescriptor typeDescriptor, Constructor constructor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_CREATORS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); return createConstructorCreator( - clazz, constructor, schema, types, typeConversionsFactory); + typeDescriptor.getRawType(), constructor, schema, types, typeConversionsFactory); }); } @@ -302,16 +309,18 @@ public static SchemaUserTypeCreator createConstructorCreator( } public static SchemaUserTypeCreator getStaticCreator( - Class clazz, + TypeDescriptor typeDescriptor, Method creator, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_CREATORS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); - return createStaticCreator(clazz, creator, schema, types, typeConversionsFactory); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); + return createStaticCreator( + typeDescriptor.getRawType(), creator, schema, types, typeConversionsFactory); }); } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/POJOUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/POJOUtils.java index 93875a20707f..571b9c690900 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/POJOUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/POJOUtils.java @@ -59,7 +59,7 @@ import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.StaticFactoryMethodInstruction; import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversion; import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversionsFactory; -import org.apache.beam.sdk.schemas.utils.ReflectUtils.ClassWithSchema; +import org.apache.beam.sdk.schemas.utils.ReflectUtils.TypeDescriptorWithSchema; import org.apache.beam.sdk.util.common.ReflectHelpers; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; @@ -71,45 +71,53 @@ "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) }) public class 
POJOUtils { + public static Schema schemaFromPojoClass( - Class clazz, FieldValueTypeSupplier fieldValueTypeSupplier) { - return StaticSchemaInference.schemaFromClass(clazz, fieldValueTypeSupplier); + TypeDescriptor typeDescriptor, FieldValueTypeSupplier fieldValueTypeSupplier) { + return StaticSchemaInference.schemaFromClass(typeDescriptor, fieldValueTypeSupplier); } // Static ByteBuddy instance used by all helpers. private static final ByteBuddy BYTE_BUDDY = new ByteBuddy(); - private static final Map> CACHED_FIELD_TYPES = - Maps.newConcurrentMap(); + private static final Map, List> + CACHED_FIELD_TYPES = Maps.newConcurrentMap(); public static List getFieldTypes( - Class clazz, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier) { + TypeDescriptor typeDescriptor, + Schema schema, + FieldValueTypeSupplier fieldValueTypeSupplier) { return CACHED_FIELD_TYPES.computeIfAbsent( - ClassWithSchema.create(clazz, schema), c -> fieldValueTypeSupplier.get(clazz, schema)); + TypeDescriptorWithSchema.create(typeDescriptor, schema), + c -> fieldValueTypeSupplier.get(typeDescriptor, schema)); } // The list of getters for a class is cached, so we only create the classes the first time // getSetters is called. - private static final Map> CACHED_GETTERS = + private static final Map> CACHED_GETTERS = Maps.newConcurrentMap(); public static List getGetters( - Class clazz, + TypeDescriptor typeDescriptor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { // Return the getters ordered by their position in the schema. return CACHED_GETTERS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); List getters = types.stream() .map(t -> createGetter(t, typeConversionsFactory)) .collect(Collectors.toList()); if (getters.size() != schema.getFieldCount()) { throw new RuntimeException( - "Was not able to generate getters for schema: " + schema + " class: " + clazz); + "Was not able to generate getters for schema: " + + schema + + " class: " + + typeDescriptor); } return getters; }); @@ -117,19 +125,21 @@ public static List getGetters( // The list of constructors for a class is cached, so we only create the classes the first time // getConstructor is called. 
- public static final Map CACHED_CREATORS = + public static final Map CACHED_CREATORS = Maps.newConcurrentMap(); public static SchemaUserTypeCreator getSetFieldCreator( - Class clazz, + TypeDescriptor typeDescriptor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_CREATORS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); - return createSetFieldCreator(clazz, schema, types, typeConversionsFactory); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); + return createSetFieldCreator( + typeDescriptor.getRawType(), schema, types, typeConversionsFactory); }); } @@ -171,17 +181,18 @@ private static SchemaUserTypeCreator createSetFieldCreator( } public static SchemaUserTypeCreator getConstructorCreator( - Class clazz, + TypeDescriptor typeDescriptor, Constructor constructor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_CREATORS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); return createConstructorCreator( - clazz, constructor, schema, types, typeConversionsFactory); + typeDescriptor.getRawType(), constructor, schema, types, typeConversionsFactory); }); } @@ -220,16 +231,18 @@ public static SchemaUserTypeCreator createConstructorCreator( } public static SchemaUserTypeCreator getStaticCreator( - Class clazz, + TypeDescriptor typeDescriptor, Method creator, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { return CACHED_CREATORS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); - return createStaticCreator(clazz, creator, schema, types, typeConversionsFactory); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); + return createStaticCreator( + typeDescriptor.getRawType(), creator, schema, types, typeConversionsFactory); }); } @@ -324,19 +337,20 @@ private static DynamicType.Builder implementGetterMethods( // The list of setters for a class is cached, so we only create the classes the first time // getSetters is called. - private static final Map> CACHED_SETTERS = + private static final Map> CACHED_SETTERS = Maps.newConcurrentMap(); public static List getSetters( - Class clazz, + TypeDescriptor typeDescriptor, Schema schema, FieldValueTypeSupplier fieldValueTypeSupplier, TypeConversionsFactory typeConversionsFactory) { // Return the setters, ordered by their position in the schema. 
return CACHED_SETTERS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), + TypeDescriptorWithSchema.create(typeDescriptor, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(typeDescriptor, schema); return types.stream() .map(t -> createSetter(t, typeConversionsFactory)) .collect(Collectors.toList()); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/ReflectUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/ReflectUtils.java index f3888a5ed443..4349a04c28ad 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/ReflectUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/ReflectUtils.java @@ -63,6 +63,19 @@ public static ClassWithSchema create(Class clazz, Schema schema) { } } + /** Represents a type descriptor and a schema. */ + @AutoValue + public abstract static class TypeDescriptorWithSchema { + public abstract TypeDescriptor getTypeDescriptor(); + + public abstract Schema getSchema(); + + public static TypeDescriptorWithSchema create( + TypeDescriptor typeDescriptor, Schema schema) { + return new AutoValue_ReflectUtils_TypeDescriptorWithSchema<>(typeDescriptor, schema); + } + } + private static final Map, List> DECLARED_METHODS = Maps.newConcurrentMap(); private static final Map, Method> ANNOTATED_CONSTRUCTORS = Maps.newConcurrentMap(); private static final Map, List> DECLARED_FIELDS = Maps.newConcurrentMap(); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/StaticSchemaInference.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/StaticSchemaInference.java index 72d79adb8288..196ee6f86593 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/StaticSchemaInference.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/StaticSchemaInference.java @@ -85,25 +85,26 @@ enum MethodType { * public getter methods, or special annotations on the class. */ public static Schema schemaFromClass( - Class clazz, FieldValueTypeSupplier fieldValueTypeSupplier) { - return schemaFromClass(clazz, fieldValueTypeSupplier, new HashMap()); + TypeDescriptor typeDescriptor, FieldValueTypeSupplier fieldValueTypeSupplier) { + return schemaFromClass(typeDescriptor, fieldValueTypeSupplier, new HashMap<>()); } private static Schema schemaFromClass( - Class clazz, + TypeDescriptor typeDescriptor, FieldValueTypeSupplier fieldValueTypeSupplier, - Map alreadyVisitedSchemas) { - if (alreadyVisitedSchemas.containsKey(clazz)) { - Schema existingSchema = alreadyVisitedSchemas.get(clazz); + Map, Schema> alreadyVisitedSchemas) { + if (alreadyVisitedSchemas.containsKey(typeDescriptor)) { + Schema existingSchema = alreadyVisitedSchemas.get(typeDescriptor); if (existingSchema == null) { throw new IllegalArgumentException( - "Cannot infer schema with a circular reference. Class: " + clazz.getTypeName()); + "Cannot infer schema with a circular reference. 
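Keying the caches above by TypeDescriptorWithSchema rather than ClassWithSchema is what lets two parameterizations of the same raw class get separate entries; a small illustration (types chosen arbitrarily, not from the patch):

import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.TypeDescriptor;

class CacheKeySketch {
  static void demo() {
    TypeDescriptor<KV<String, Long>> a = new TypeDescriptor<KV<String, Long>>() {};
    TypeDescriptor<KV<String, Integer>> b = new TypeDescriptor<KV<String, Integer>>() {};

    System.out.println(a.getRawType() == b.getRawType()); // true: both erase to KV.class
    System.out.println(a.equals(b)); // false: the type arguments are preserved
  }
}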
Class: " + + typeDescriptor.getRawType().getTypeName()); } return existingSchema; } - alreadyVisitedSchemas.put(clazz, null); + alreadyVisitedSchemas.put(typeDescriptor, null); Schema.Builder builder = Schema.builder(); - for (FieldValueTypeInformation type : fieldValueTypeSupplier.get(clazz)) { + for (FieldValueTypeInformation type : fieldValueTypeSupplier.get(typeDescriptor)) { Schema.FieldType fieldType = fieldFromType(type.getType(), fieldValueTypeSupplier, alreadyVisitedSchemas); Schema.Field f = @@ -116,21 +117,21 @@ private static Schema schemaFromClass( builder.addFields(f); } Schema generatedSchema = builder.build(); - alreadyVisitedSchemas.replace(clazz, generatedSchema); + alreadyVisitedSchemas.replace(typeDescriptor, generatedSchema); return generatedSchema; } /** Map a Java field type to a Beam Schema FieldType. */ public static Schema.FieldType fieldFromType( TypeDescriptor type, FieldValueTypeSupplier fieldValueTypeSupplier) { - return fieldFromType(type, fieldValueTypeSupplier, new HashMap()); + return fieldFromType(type, fieldValueTypeSupplier, new HashMap<>()); } // TODO(https://github.com/apache/beam/issues/21567): support type inference for logical types private static Schema.FieldType fieldFromType( TypeDescriptor type, FieldValueTypeSupplier fieldValueTypeSupplier, - Map alreadyVisitedSchemas) { + Map, Schema> alreadyVisitedSchemas) { FieldType primitiveType = PRIMITIVE_TYPES.get(type.getRawType()); if (primitiveType != null) { return primitiveType; @@ -198,8 +199,7 @@ private static Schema.FieldType fieldFromType( throw new RuntimeException("Cannot infer schema from unparameterized collection."); } } else { - return FieldType.row( - schemaFromClass(type.getRawType(), fieldValueTypeSupplier, alreadyVisitedSchemas)); + return FieldType.row(schemaFromClass(type, fieldValueTypeSupplier, alreadyVisitedSchemas)); } } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/PAssert.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/PAssert.java index aa7b2630cce2..7a102747b9f7 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/PAssert.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/PAssert.java @@ -31,6 +31,7 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.NoSuchElementException; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.Pipeline.PipelineVisitor; import org.apache.beam.sdk.PipelineRunner; @@ -1610,8 +1611,18 @@ private SingletonCheckerDoFn( @ProcessElement public void processElement(ProcessContext c) { - ActualT actualContents = Iterables.getOnlyElement(c.element()); - c.output(doChecks(site, actualContents, checkerFn)); + try { + ActualT actualContents = Iterables.getOnlyElement(c.element()); + c.output(doChecks(site, actualContents, checkerFn)); + } catch (NoSuchElementException e) { + c.output( + SuccessOrFailure.failure( + site, + new IllegalArgumentException( + "expected singleton PCollection but was: empty PCollection", e))); + } catch (IllegalArgumentException e) { + c.output(SuccessOrFailure.failure(site, e)); + } } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java index fd49078350c4..558f912a6b1f 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java +++ 
b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -34,7 +35,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.util.Preconditions; import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -202,11 +202,11 @@ public abstract static class Builder { public Builder addExceptionStackTrace(Exception exception) throws IOException { ByteArrayOutputStream stream = new ByteArrayOutputStream(); - PrintStream printStream = new PrintStream(stream, false, Charsets.UTF_8.name()); + PrintStream printStream = new PrintStream(stream, false, StandardCharsets.UTF_8.name()); exception.printStackTrace(printStream); printStream.close(); - this.setExceptionStacktrace(new String(stream.toByteArray(), Charsets.UTF_8)); + this.setExceptionStacktrace(new String(stream.toByteArray(), StandardCharsets.UTF_8)); return this; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java index 7b9ac7e15c2e..e318e82513ca 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java @@ -22,6 +22,7 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +import java.nio.charset.StandardCharsets; import java.util.concurrent.ExecutionException; import net.bytebuddy.ByteBuddy; import net.bytebuddy.description.modifier.FieldManifestation; @@ -43,7 +44,6 @@ import org.apache.beam.sdk.transforms.DoFn.TimerId; import org.apache.beam.sdk.transforms.reflect.ByteBuddyDoFnInvokerFactory.DoFnMethodWithExtraParametersDelegation; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.CharMatcher; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheBuilder; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheLoader; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.LoadingCache; @@ -191,7 +191,7 @@ public Constructor load(final OnTimerMethodSpecifier onTimerMethodSpecifier) "%s$%s$%s", OnTimerInvoker.class.getSimpleName(), CharMatcher.javaLetterOrDigit().retainFrom(timerId), - BaseEncoding.base64().omitPadding().encode(timerId.getBytes(Charsets.UTF_8))); + BaseEncoding.base64().omitPadding().encode(timerId.getBytes(StandardCharsets.UTF_8))); DynamicType.Builder builder = new ByteBuddy() @@ -241,7 +241,7 @@ public Constructor load(final OnTimerMethodSpecifier onTimerMethodSpecifier) "%s$%s$%s", OnTimerInvoker.class.getSimpleName(), CharMatcher.javaLetterOrDigit().retainFrom(timerId), - BaseEncoding.base64().omitPadding().encode(timerId.getBytes(Charsets.UTF_8))); + BaseEncoding.base64().omitPadding().encode(timerId.getBytes(StandardCharsets.UTF_8))); DynamicType.Builder builder = new 
ByteBuddy() diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java index 2f034626acd7..527a699568f4 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java @@ -19,6 +19,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.function.Function; @@ -28,7 +29,6 @@ import org.apache.beam.model.pipeline.v1.RunnerApi.StandardResourceHints; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ProtocolMessageEnum; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -179,7 +179,7 @@ public ResourceHint mergeWithOuter(ResourceHint outer) { @Override public byte[] toBytes() { - return String.valueOf(value).getBytes(Charsets.US_ASCII); + return String.valueOf(value).getBytes(StandardCharsets.US_ASCII); } } @@ -196,7 +196,7 @@ public static String parse(String s) { @Override public byte[] toBytes() { - return value.getBytes(Charsets.US_ASCII); + return value.getBytes(StandardCharsets.US_ASCII); } @Override @@ -254,7 +254,7 @@ public ResourceHint mergeWithOuter(ResourceHint outer) { @Override public byte[] toBytes() { - return String.valueOf(value).getBytes(Charsets.US_ASCII); + return String.valueOf(value).getBytes(StandardCharsets.US_ASCII); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/StringUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/StringUtils.java index 13105fb6c02c..ccd58857da04 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/StringUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/StringUtils.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.beam.sdk.annotations.Internal; +import org.checkerframework.checker.nullness.qual.Nullable; /** Utilities for working with JSON and other human-readable string formats. */ @Internal @@ -143,4 +144,38 @@ public static int getLevenshteinDistance(final String s, final String t) { return v1[t.length()]; } + + /** + * Convert Array to new lined String. Truncate to first {@code maxLine} elements. + * + *
<p>
Useful to truncate stacktrace and for logging. + */ + public static String arrayToNewlines(Object[] array, int maxLine) { + int n = (maxLine > 0 && array.length > maxLine) ? maxLine : array.length; + StringBuilder b = new StringBuilder(); + for (int i = 0; i < n; i++) { + b.append(array[i]); + b.append("\n"); + } + if (array.length > maxLine) { + b.append("...\n"); + } + return b.toString(); + } + + /** + * Truncate String if length greater than maxLen, and append "..." to the end. Handles null. + * + *
<p>
Useful to truncate long logging message. + */ + public static String leftTruncate(@Nullable Object element, int maxLen) { + if (element == null) { + return ""; + } + String s = element.toString(); + if (s.length() > maxLen) { + return s.substring(0, maxLen) + "..."; + } + return s; + } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/RowWithGetters.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/RowWithGetters.java index cb4d83550577..9731507fb0f6 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/RowWithGetters.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/RowWithGetters.java @@ -51,7 +51,7 @@ public class RowWithGetters extends Row { Schema schema, Factory> getterFactory, Object getterTarget) { super(schema); this.getterTarget = getterTarget; - this.getters = getterFactory.create(getterTarget.getClass(), schema); + this.getters = getterFactory.create(TypeDescriptor.of(getterTarget.getClass()), schema); } @Override diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java index bd8fdd84fb09..cb0845796fe9 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java @@ -20,7 +20,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -32,10 +32,10 @@ public final class StructuralByteArrayTest { @Test public void testStructuralByteArray() throws Exception { assertEquals( - new StructuralByteArray("test string".getBytes(Charsets.UTF_8)), - new StructuralByteArray("test string".getBytes(Charsets.UTF_8))); + new StructuralByteArray("test string".getBytes(StandardCharsets.UTF_8)), + new StructuralByteArray("test string".getBytes(StandardCharsets.UTF_8))); assertFalse( - new StructuralByteArray("test string".getBytes(Charsets.UTF_8)) - .equals(new StructuralByteArray("diff string".getBytes(Charsets.UTF_8)))); + new StructuralByteArray("test string".getBytes(StandardCharsets.UTF_8)) + .equals(new StructuralByteArray("diff string".getBytes(StandardCharsets.UTF_8)))); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java index 7fd54039b1dd..c4f83954e66c 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java @@ -17,8 +17,8 @@ */ package org.apache.beam.sdk.io; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.beam.sdk.io.WriteFiles.UNKNOWN_SHARDNUM; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets.UTF_8; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.is; diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java index b87c9caa1244..90b0822d9dca 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java +++ 
b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java @@ -31,6 +31,7 @@ import java.io.OutputStreamWriter; import java.io.Serializable; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.CopyOption; import java.nio.file.Files; import java.nio.file.Path; @@ -69,7 +70,6 @@ import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.sdk.values.TypeDescriptors; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.joda.time.Duration; import org.junit.Rule; import org.junit.Test; @@ -368,10 +368,10 @@ public void testMatchWatchForNewFiles() throws IOException, InterruptedException public void testRead() throws IOException { final String path = tmpFolder.newFile("file").getAbsolutePath(); final String pathGZ = tmpFolder.newFile("file.gz").getAbsolutePath(); - Files.write(new File(path).toPath(), "Hello world".getBytes(Charsets.UTF_8)); + Files.write(new File(path).toPath(), "Hello world".getBytes(StandardCharsets.UTF_8)); try (Writer writer = new OutputStreamWriter( - new GZIPOutputStream(new FileOutputStream(pathGZ)), Charsets.UTF_8)) { + new GZIPOutputStream(new FileOutputStream(pathGZ)), StandardCharsets.UTF_8)) { writer.write("Hello world"); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java index acde8c91431d..a38faf077e07 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java @@ -67,7 +67,6 @@ import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.BaseEncoding; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.ByteStreams; @@ -212,7 +211,7 @@ public void testWriteTwo() throws Exception { @Category(NeedsRunner.class) public void testReadInvalidRecord() throws Exception { expectedException.expectMessage("Not a valid TFRecord. 
Fewer than 12 bytes."); - runTestRead("bar".getBytes(Charsets.UTF_8), new String[0]); + runTestRead("bar".getBytes(StandardCharsets.UTF_8), new String[0]); } @Test @@ -445,14 +444,14 @@ private static Iterable makeLines(int n, int minRecordSize) { static class ByteArrayToString extends DoFn { @ProcessElement public void processElement(ProcessContext c) { - c.output(new String(c.element(), Charsets.UTF_8)); + c.output(new String(c.element(), StandardCharsets.UTF_8)); } } static class StringToByteArray extends DoFn { @ProcessElement public void processElement(ProcessContext c) { - c.output(c.element().getBytes(Charsets.UTF_8)); + c.output(c.element().getBytes(StandardCharsets.UTF_8)); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java index 253308d1b93f..8d9adbefd02b 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java @@ -51,6 +51,7 @@ import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -91,7 +92,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Joiner; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -642,7 +642,7 @@ private void runTestRead(String[] expected) throws Exception { try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -865,7 +865,7 @@ public void testProgressEmptyFile() throws IOException { public void testProgressTextFile() throws IOException { String file = "line1\nline2\nline3"; try (BoundedSource.BoundedReader reader = - prepareSource(file.getBytes(Charsets.UTF_8)) + prepareSource(file.getBytes(StandardCharsets.UTF_8)) .createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); @@ -901,7 +901,7 @@ public void testProgressTextFile() throws IOException { @Test public void testProgressAfterSplitting() throws IOException { String file = "line1\nline2\nline3"; - BoundedSource source = prepareSource(file.getBytes(Charsets.UTF_8)); + BoundedSource source = prepareSource(file.getBytes(StandardCharsets.UTF_8)); BoundedSource remainder; // Create the remainder, verifying properties pre- and post-splitting. 
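Note: the charset changes in the surrounding test hunks all follow one pattern: the vendored Guava Charsets constants are dropped in favor of the JDK's java.nio.charset.StandardCharsets. A minimal sketch of the resulting usage (the class name and literal string here are only an illustration, not part of the patch):

import java.nio.charset.StandardCharsets;

public class CharsetUsageSketch {
  public static void main(String[] args) {
    // Encode and decode with the JDK constant; no vendored Guava import is needed.
    byte[] bytes = "Hello world".getBytes(StandardCharsets.UTF_8);
    System.out.println(new String(bytes, StandardCharsets.UTF_8));
  }
}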
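For reference, a usage sketch of the StringUtils.arrayToNewlines and StringUtils.leftTruncate helpers introduced in the StringUtils.java hunk above; the class name and sample values below are illustrative only, not part of the patch:

import org.apache.beam.sdk.util.StringUtils;

public class StringUtilsUsageSketch {
  public static void main(String[] args) {
    Object[] items = {"first", "second", "third"};

    // Keeps the first two elements, one per line, then appends an "..." marker
    // because the array is longer than maxLine: prints "first\nsecond\n...\n".
    System.out.print(StringUtils.arrayToNewlines(items, 2));

    // Truncates to maxLen characters and appends "...": prints "abc...".
    System.out.println(StringUtils.leftTruncate("abcdef", 3));

    // A null input is handled and yields the empty string.
    System.out.println(StringUtils.leftTruncate(null, 3));
  }
}

The StringUtilsTest hunk further down exercises the same behavior.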
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java index 312605f3fcc5..695ff4474d71 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java @@ -39,6 +39,7 @@ import java.io.OutputStream; import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -73,7 +74,6 @@ import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Functions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Predicate; @@ -492,7 +492,7 @@ private static void assertOutputFiles( List expectedElements = new ArrayList<>(elems.length); for (String elem : elems) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); expectedElements.add(line); } @@ -509,7 +509,7 @@ private static void assertOutputFiles( private static List readLinesFromFile(File f) throws IOException { List currentFile = new ArrayList<>(); - try (BufferedReader reader = Files.newBufferedReader(f.toPath(), Charsets.UTF_8)) { + try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) { while (true) { String line = reader.readLine(); if (line == null) { diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java index 17ca3ba85fd8..e52d4112e11e 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java @@ -20,8 +20,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.Writer; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Files; import org.junit.Assert; import org.junit.Rule; @@ -38,7 +38,7 @@ public class TextRowCountEstimatorTest { @Test public void testNonEmptyFiles() throws Exception { File file1 = temporaryFolder.newFile("file1.txt"); - Writer writer = Files.newWriter(file1, Charsets.UTF_8); + Writer writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 100; i++) { writer.write("123123123\n"); } @@ -47,7 +47,7 @@ public void testNonEmptyFiles() throws Exception { temporaryFolder.newFolder("testfolder"); temporaryFolder.newFolder("testfolder2"); file1 = temporaryFolder.newFile("testfolder/test2.txt"); - writer = Files.newWriter(file1, Charsets.UTF_8); + writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 50; i++) { writer.write("123123123\n"); } @@ -71,7 +71,7 @@ public void testEmptyFolder() throws Exception { @Test public void testEmptyFile() throws Exception { 
File file1 = temporaryFolder.newFile("file1.txt"); - Writer writer = Files.newWriter(file1, Charsets.UTF_8); + Writer writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 100; i++) { writer.write("\n"); } @@ -86,7 +86,7 @@ public void testEmptyFile() throws Exception { @Test(expected = TextRowCountEstimator.NoEstimationException.class) public void lotsOfNewLines() throws Exception { File file1 = temporaryFolder.newFile("file1.txt"); - Writer writer = Files.newWriter(file1, Charsets.UTF_8); + Writer writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 1000; i++) { writer.write("\n"); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java index 0ab8efac7eb1..cc174002bb46 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java @@ -35,6 +35,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.text.DecimalFormat; import java.util.ArrayList; @@ -93,7 +94,6 @@ import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.PDone; import org.apache.beam.sdk.values.ShardedKey; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; @@ -1035,7 +1035,8 @@ static void checkFileContents( List actual = Lists.newArrayList(); for (File outputFile : outputFiles) { List actualShard = Lists.newArrayList(); - try (BufferedReader reader = Files.newBufferedReader(outputFile.toPath(), Charsets.UTF_8)) { + try (BufferedReader reader = + Files.newBufferedReader(outputFile.toPath(), StandardCharsets.UTF_8)) { for (; ; ) { String line = reader.readLine(); if (line == null) { diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricResultsMatchers.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricResultsMatchers.java index c75d872dc38e..9299ae81fa46 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricResultsMatchers.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricResultsMatchers.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.metrics; import java.util.Objects; +import org.checkerframework.checker.nullness.qual.Nullable; import org.hamcrest.Description; import org.hamcrest.Matcher; import org.hamcrest.TypeSafeMatcher; @@ -211,9 +212,9 @@ private static class MatchNameAndKey extends TypeSafeMatcher> private final String namespace; private final String name; - private final String step; + private final @Nullable String step; - MatchNameAndKey(String namespace, String name, String step) { + MatchNameAndKey(String namespace, String name, @Nullable String step) { this.namespace = namespace; this.name = name; this.step = step; @@ -221,7 +222,11 @@ private static class MatchNameAndKey extends TypeSafeMatcher> @Override protected boolean matchesSafely(MetricResult item) { - return MetricFiltering.matches(MetricsFilter.builder().addStep(step).build(), item.getKey()) + MetricsFilter.Builder builder = MetricsFilter.builder(); + if (step != null) { + builder = builder.addStep(step); 
+ } + return MetricFiltering.matches(builder.build(), item.getKey()) && Objects.equals(MetricName.named(namespace, name), item.getName()); } @@ -231,9 +236,10 @@ public void describeTo(Description description) { .appendText("MetricResult{inNamespace=") .appendValue(namespace) .appendText(", name=") - .appendValue(name) - .appendText(", step=") - .appendValue(step); + .appendValue(name); + if (step != null) { + description.appendText(", step=").appendValue(step); + } if (this.getClass() == MatchNameAndKey.class) { description.appendText("}"); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricsTest.java index 79709c89963b..750d43a4f9ae 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricsTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/metrics/MetricsTest.java @@ -20,17 +20,25 @@ import static org.apache.beam.sdk.metrics.MetricResultsMatchers.attemptedMetricsResult; import static org.apache.beam.sdk.metrics.MetricResultsMatchers.distributionMinMax; import static org.apache.beam.sdk.metrics.MetricResultsMatchers.metricsResult; +import static org.apache.beam.sdk.testing.SerializableMatchers.greaterThan; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.anyOf; -import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.hasItem; import static org.junit.Assert.assertNull; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import java.io.IOException; import java.io.Serializable; +import java.util.List; +import java.util.NoSuchElementException; import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.VarIntCoder; +import org.apache.beam.sdk.io.BoundedSource; import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.io.Read; +import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.testing.NeedsRunner; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.UsesAttemptedMetrics; @@ -45,7 +53,9 @@ import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.hamcrest.Matcher; import org.joda.time.Duration; import org.joda.time.Instant; import org.junit.After; @@ -416,17 +426,136 @@ public void testAttemptedStringSetMetrics() { MetricQueryResults metrics = queryTestMetrics(result); assertStringSetMetrics(metrics, false); } + + @Test + @Category({ValidatesRunner.class, UsesAttemptedMetrics.class, UsesCounterMetrics.class}) + public void testBoundedSourceMetricsInSplit() { + pipeline.apply(Read.from(new CountingSourceWithMetrics(0, 10))); + PipelineResult pipelineResult = pipeline.run(); + MetricQueryResults metrics = + pipelineResult + .metrics() + .queryMetrics( + MetricsFilter.builder() + .addNameFilter( + MetricNameFilter.named( + CountingSourceWithMetrics.class, + CountingSourceWithMetrics.SPLIT_NAME)) + .addNameFilter( + MetricNameFilter.named( + CountingSourceWithMetrics.class, + CountingSourceWithMetrics.ADVANCE_NAME)) + .build()); + assertThat( + metrics.getCounters(), + hasItem( + attemptedMetricsResult( + CountingSourceWithMetrics.class.getName(), + 
CountingSourceWithMetrics.ADVANCE_NAME, + null, // step name varies depending on the runner + 10L))); + assertThat( + metrics.getCounters(), + hasItem( + metricsResult( + CountingSourceWithMetrics.class.getName(), + CountingSourceWithMetrics.SPLIT_NAME, + null, // step name varies depending on the runner + greaterThan(0L), + false))); + } + } + + public static class CountingSourceWithMetrics extends BoundedSource { + public static final String SPLIT_NAME = "num-split"; + public static final String ADVANCE_NAME = "num-advance"; + private static Counter splitCounter = + Metrics.counter(CountingSourceWithMetrics.class, SPLIT_NAME); + private static Counter advanceCounter = + Metrics.counter(CountingSourceWithMetrics.class, ADVANCE_NAME); + private final int start; + private final int end; + + @Override + public List> split( + long desiredBundleSizeBytes, PipelineOptions options) { + splitCounter.inc(); + // simply split the current source into two + if (end - start >= 2) { + int mid = (start + end + 1) / 2; + return ImmutableList.of( + new CountingSourceWithMetrics(start, mid), new CountingSourceWithMetrics(mid, end)); + } + return null; + } + + @Override + public long getEstimatedSizeBytes(PipelineOptions options) { + return 0; + } + + @Override + public BoundedReader createReader(PipelineOptions options) { + return new CountingReader(); + } + + public CountingSourceWithMetrics(int start, int end) { + this.start = start; + this.end = end; + } + + @Override + public Coder getOutputCoder() { + return VarIntCoder.of(); + } + + public class CountingReader extends BoundedSource.BoundedReader { + private int current; + + @Override + public boolean start() throws IOException { + return current < end; + } + + @Override + public boolean advance() { + ++current; + advanceCounter.inc(); + return current < end; + } + + @Override + public Integer getCurrent() throws NoSuchElementException { + return current; + } + + @Override + public void close() {} + + @Override + public BoundedSource getCurrentSource() { + return null; + } + + public CountingReader() { + current = start; + } + } + } + + private static Matcher> metricsResultPatchStep( + final String name, final String step, final T value, final boolean isCommitted) { + return anyOf( + metricsResult(NAMESPACE, name, step, value, isCommitted), + // portable runner adds a suffix for metrics initiated outside anonymous pardo + metricsResult(NAMESPACE, name, step + "-ParMultiDo-Anonymous-", value, isCommitted)); } private static void assertCounterMetrics(MetricQueryResults metrics, boolean isCommitted) { + System.out.println(metrics.getCounters()); assertThat( metrics.getCounters(), - anyOf( - // Step names are different for portable and non-portable runners. 
- hasItem(metricsResult(NAMESPACE, "count", "MyStep1", 3L, isCommitted)), - hasItem( - metricsResult( - NAMESPACE, "count", "MyStep1-ParMultiDo-Anonymous-", 3L, isCommitted)))); + hasItem(metricsResultPatchStep("count", "MyStep1", 3L, isCommitted))); assertThat( metrics.getCounters(), @@ -446,27 +575,36 @@ private static void assertGaugeMetrics(MetricQueryResults metrics, boolean isCom } private static void assertStringSetMetrics(MetricQueryResults metrics, boolean isCommitted) { + // TODO(https://github.com/apache/beam/issues/32001) use containsInAnyOrder once portableMetrics + // duplicate metrics issue fixed assertThat( metrics.getStringSets(), - containsInAnyOrder( - metricsResult( - NAMESPACE, + hasItem( + metricsResultPatchStep( "sources", "MyStep1", StringSetResult.create(ImmutableSet.of("gcs")), - isCommitted), + isCommitted))); + assertThat( + metrics.getStringSets(), + hasItem( metricsResult( NAMESPACE, "sinks", "MyStep2", StringSetResult.create(ImmutableSet.of("kafka", "bq")), - isCommitted), - metricsResult( - NAMESPACE, + isCommitted))); + assertThat( + metrics.getStringSets(), + hasItem( + metricsResultPatchStep( "sideinputs", "MyStep1", StringSetResult.create(ImmutableSet.of("bigtable", "spanner")), - isCommitted), + isCommitted))); + assertThat( + metrics.getStringSets(), + hasItem( metricsResult( NAMESPACE, "sideinputs", @@ -478,22 +616,9 @@ private static void assertStringSetMetrics(MetricQueryResults metrics, boolean i private static void assertDistributionMetrics(MetricQueryResults metrics, boolean isCommitted) { assertThat( metrics.getDistributions(), - anyOf( - // Step names are different for portable and non-portable runners. - hasItem( - metricsResult( - NAMESPACE, - "input", - "MyStep1", - DistributionResult.create(26L, 3L, 5L, 13L), - isCommitted)), - hasItem( - metricsResult( - NAMESPACE, - "input", - "MyStep1-ParMultiDo-Anonymous-", - DistributionResult.create(26L, 3L, 5L, 13L), - isCommitted)))); + hasItem( + metricsResultPatchStep( + "input", "MyStep1", DistributionResult.create(26L, 3L, 5L, 13L), isCommitted))); assertThat( metrics.getDistributions(), diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java index 2643fb556ff4..291bb5297880 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java @@ -59,6 +59,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.List; import java.util.Map; @@ -74,7 +75,6 @@ import org.apache.beam.sdk.testing.InterceptingUrlClassLoader; import org.apache.beam.sdk.testing.RestoreSystemProperties; import org.apache.beam.sdk.util.common.ReflectHelpers; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Collections2; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -1727,7 +1727,7 @@ public void testWhenNoHelpIsRequested() { assertFalse( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new 
String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertEquals("", output); } @@ -1739,7 +1739,7 @@ public void testDefaultHelpAsArgument() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("The set of registered options are:")); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("Use --help= for detailed help.")); @@ -1753,7 +1753,7 @@ public void testSpecificHelpAsArgument() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); @@ -1769,7 +1769,7 @@ public void testSpecificHelpAsArgumentWithSimpleClassName() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); @@ -1785,7 +1785,7 @@ public void testSpecificHelpAsArgumentWithClassNameSuffix() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); @@ -1815,7 +1815,7 @@ public void testShortnameSpecificHelpHasMultipleMatches() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("Multiple matches found for NameConflict")); assertThat( output, @@ -1839,7 +1839,7 @@ public void testHelpWithOptionThatOutputsValidEnumTypes() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("")); } @@ -1851,7 +1851,7 @@ public void testHelpWithBadOptionNameAsArgument() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("Unable to find option 
org.apache.beam.sdk.Pipeline")); assertThat(output, containsString("The set of registered options are:")); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); @@ -1865,7 +1865,7 @@ public void testHelpWithHiddenMethodAndInterface() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); // A hidden interface. assertThat( output, not(containsString("org.apache.beam.sdk.options.DataflowPipelineDebugOptions"))); @@ -1877,7 +1877,7 @@ public void testHelpWithHiddenMethodAndInterface() { public void testProgrammaticPrintHelp() { ByteArrayOutputStream baos = new ByteArrayOutputStream(); PipelineOptionsFactory.printHelp(new PrintStream(baos)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("The set of registered options are:")); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); } @@ -1886,7 +1886,7 @@ public void testProgrammaticPrintHelp() { public void testProgrammaticPrintHelpForSpecificType() { ByteArrayOutputStream baos = new ByteArrayOutputStream(); PipelineOptionsFactory.printHelp(new PrintStream(baos), PipelineOptions.class); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java index bd7a0da394ae..3b22addbf545 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertThrows; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.HashMap; @@ -54,7 +55,6 @@ import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Instant; import org.junit.Test; @@ -243,7 +243,8 @@ public static Iterable data() { .setUrn("pythonsdk:value") .setPayload( ByteString.copyFrom( - "some payload describing a python type", Charsets.UTF_8)) + "some payload describing a python type", + StandardCharsets.UTF_8)) .setRepresentation( SchemaApi.FieldType.newBuilder() .setAtomicType(SchemaApi.AtomicType.BYTES)) diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtilsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtilsTest.java index 296d4e9953ce..021e39b84849 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtilsTest.java +++ 
b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/JavaBeanUtilsTest.java @@ -52,6 +52,7 @@ import org.apache.beam.sdk.schemas.utils.TestJavaBeans.PrimitiveArrayBean; import org.apache.beam.sdk.schemas.utils.TestJavaBeans.PrimitiveMapBean; import org.apache.beam.sdk.schemas.utils.TestJavaBeans.SimpleBean; +import org.apache.beam.sdk.values.TypeDescriptor; import org.joda.time.DateTime; import org.junit.Test; @@ -63,7 +64,8 @@ public class JavaBeanUtilsTest { @Test public void testNullable() { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass(NullableBean.class, GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass( + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); assertTrue(schema.getField("str").getType().getNullable()); assertFalse(schema.getField("anInt").getType().getNullable()); } @@ -71,14 +73,16 @@ public void testNullable() { @Test public void testSimpleBean() { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass(SimpleBean.class, GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass( + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(SIMPLE_BEAN_SCHEMA, schema); } @Test public void testNestedBean() { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass(NestedBean.class, GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass( + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_BEAN_SCHEMA, schema); } @@ -86,14 +90,15 @@ public void testNestedBean() { public void testPrimitiveArray() { Schema schema = JavaBeanUtils.schemaFromJavaBeanClass( - PrimitiveArrayBean.class, GetterTypeSupplier.INSTANCE); + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(PRIMITIVE_ARRAY_BEAN_SCHEMA, schema); } @Test public void testNestedArray() { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass(NestedArrayBean.class, GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass( + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_ARRAY_BEAN_SCHEMA, schema); } @@ -101,21 +106,23 @@ public void testNestedArray() { public void testNestedCollection() { Schema schema = JavaBeanUtils.schemaFromJavaBeanClass( - NestedCollectionBean.class, GetterTypeSupplier.INSTANCE); + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_COLLECTION_BEAN_SCHEMA, schema); } @Test public void testPrimitiveMap() { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass(PrimitiveMapBean.class, GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass( + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(PRIMITIVE_MAP_BEAN_SCHEMA, schema); } @Test public void testNestedMap() { Schema schema = - JavaBeanUtils.schemaFromJavaBeanClass(NestedMapBean.class, GetterTypeSupplier.INSTANCE); + JavaBeanUtils.schemaFromJavaBeanClass( + new TypeDescriptor() {}, GetterTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_MAP_BEAN_SCHEMA, schema); } @@ -137,7 +144,7 @@ public void testGeneratedSimpleGetters() { List getters = JavaBeanUtils.getGetters( - SimpleBean.class, + new TypeDescriptor() {}, SIMPLE_BEAN_SCHEMA, new JavaBeanSchema.GetterTypeSupplier(), new DefaultTypeConversionsFactory()); @@ -169,7 +176,7 @@ public void testGeneratedSimpleSetters() { SimpleBean simpleBean = new SimpleBean(); List setters = JavaBeanUtils.getSetters( - 
SimpleBean.class, + new TypeDescriptor() {}, SIMPLE_BEAN_SCHEMA, new SetterTypeSupplier(), new DefaultTypeConversionsFactory()); @@ -215,7 +222,7 @@ public void testGeneratedSimpleBoxedGetters() { List getters = JavaBeanUtils.getGetters( - BeanWithBoxedFields.class, + new TypeDescriptor() {}, BEAN_WITH_BOXED_FIELDS_SCHEMA, new JavaBeanSchema.GetterTypeSupplier(), new DefaultTypeConversionsFactory()); @@ -231,7 +238,7 @@ public void testGeneratedSimpleBoxedSetters() { BeanWithBoxedFields bean = new BeanWithBoxedFields(); List setters = JavaBeanUtils.getSetters( - BeanWithBoxedFields.class, + new TypeDescriptor() {}, BEAN_WITH_BOXED_FIELDS_SCHEMA, new SetterTypeSupplier(), new DefaultTypeConversionsFactory()); @@ -254,7 +261,7 @@ public void testGeneratedByteBufferSetters() { BeanWithByteArray bean = new BeanWithByteArray(); List setters = JavaBeanUtils.getSetters( - BeanWithByteArray.class, + new TypeDescriptor() {}, BEAN_WITH_BYTE_ARRAY_SCHEMA, new SetterTypeSupplier(), new DefaultTypeConversionsFactory()); diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/POJOUtilsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/POJOUtilsTest.java index cd1cdfeb40d2..723353ed8d15 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/POJOUtilsTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/POJOUtilsTest.java @@ -51,6 +51,7 @@ import org.apache.beam.sdk.schemas.utils.TestPOJOs.PrimitiveArrayPOJO; import org.apache.beam.sdk.schemas.utils.TestPOJOs.PrimitiveMapPOJO; import org.apache.beam.sdk.schemas.utils.TestPOJOs.SimplePOJO; +import org.apache.beam.sdk.values.TypeDescriptor; import org.joda.time.DateTime; import org.joda.time.Instant; import org.junit.Test; @@ -69,20 +70,25 @@ public class POJOUtilsTest { @Test public void testNullables() { Schema schema = - POJOUtils.schemaFromPojoClass(POJOWithNullables.class, JavaFieldTypeSupplier.INSTANCE); + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); assertTrue(schema.getField("str").getType().getNullable()); assertFalse(schema.getField("anInt").getType().getNullable()); } @Test public void testSimplePOJO() { - Schema schema = POJOUtils.schemaFromPojoClass(SimplePOJO.class, JavaFieldTypeSupplier.INSTANCE); + Schema schema = + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); assertEquals(SIMPLE_POJO_SCHEMA, schema); } @Test public void testNestedPOJO() { - Schema schema = POJOUtils.schemaFromPojoClass(NestedPOJO.class, JavaFieldTypeSupplier.INSTANCE); + Schema schema = + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_POJO_SCHEMA, schema); } @@ -90,42 +96,48 @@ public void testNestedPOJO() { public void testNestedPOJOWithSimplePOJO() { Schema schema = POJOUtils.schemaFromPojoClass( - TestPOJOs.NestedPOJOWithSimplePOJO.class, JavaFieldTypeSupplier.INSTANCE); + new TypeDescriptor() {}, + JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_POJO_WITH_SIMPLE_POJO_SCHEMA, schema); } @Test public void testPrimitiveArray() { Schema schema = - POJOUtils.schemaFromPojoClass(PrimitiveArrayPOJO.class, JavaFieldTypeSupplier.INSTANCE); + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(PRIMITIVE_ARRAY_POJO_SCHEMA, schema); } @Test public void testNestedArray() { Schema schema = - 
POJOUtils.schemaFromPojoClass(NestedArrayPOJO.class, JavaFieldTypeSupplier.INSTANCE); + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_ARRAY_POJO_SCHEMA, schema); } @Test public void testNestedCollection() { Schema schema = - POJOUtils.schemaFromPojoClass(NestedCollectionPOJO.class, JavaFieldTypeSupplier.INSTANCE); + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_COLLECTION_POJO_SCHEMA, schema); } @Test public void testPrimitiveMap() { Schema schema = - POJOUtils.schemaFromPojoClass(PrimitiveMapPOJO.class, JavaFieldTypeSupplier.INSTANCE); + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(PRIMITIVE_MAP_POJO_SCHEMA, schema); } @Test public void testNestedMap() { Schema schema = - POJOUtils.schemaFromPojoClass(NestedMapPOJO.class, JavaFieldTypeSupplier.INSTANCE); + POJOUtils.schemaFromPojoClass( + new TypeDescriptor() {}, JavaFieldTypeSupplier.INSTANCE); SchemaTestUtils.assertSchemaEquivalent(NESTED_MAP_POJO_SCHEMA, schema); } @@ -148,7 +160,7 @@ public void testGeneratedSimpleGetters() { List getters = POJOUtils.getGetters( - SimplePOJO.class, + new TypeDescriptor() {}, SIMPLE_POJO_SCHEMA, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); @@ -174,7 +186,7 @@ public void testGeneratedSimpleSetters() { SimplePOJO simplePojo = new SimplePOJO(); List setters = POJOUtils.getSetters( - SimplePOJO.class, + new TypeDescriptor() {}, SIMPLE_POJO_SCHEMA, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); @@ -213,7 +225,7 @@ public void testGeneratedSimpleBoxedGetters() { List getters = POJOUtils.getGetters( - POJOWithBoxedFields.class, + new TypeDescriptor() {}, POJO_WITH_BOXED_FIELDS_SCHEMA, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); @@ -229,7 +241,7 @@ public void testGeneratedSimpleBoxedSetters() { POJOWithBoxedFields pojo = new POJOWithBoxedFields(); List setters = POJOUtils.getSetters( - POJOWithBoxedFields.class, + new TypeDescriptor() {}, POJO_WITH_BOXED_FIELDS_SCHEMA, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); @@ -252,7 +264,7 @@ public void testGeneratedByteBufferSetters() { POJOWithByteArray pojo = new POJOWithByteArray(); List setters = POJOUtils.getSetters( - POJOWithByteArray.class, + new TypeDescriptor() {}, POJO_WITH_BYTE_ARRAY_SCHEMA, JavaFieldTypeSupplier.INSTANCE, new DefaultTypeConversionsFactory()); diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/testing/PAssertTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/testing/PAssertTest.java index dfdb6282b549..a02196bb2c05 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/testing/PAssertTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/testing/PAssertTest.java @@ -37,6 +37,7 @@ import org.apache.beam.sdk.coders.AtomicCoder; import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.coders.SerializableCoder; +import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.coders.VarLongCoder; import org.apache.beam.sdk.io.GenerateSequence; import org.apache.beam.sdk.testing.PAssert.MatcherCheckerFn; @@ -386,6 +387,36 @@ public void testPAssertEqualsSingletonFalse() throws Exception { assertThat(message, containsString("but: was <42>")); } + @Test + @Category({ValidatesRunner.class, UsesFailureMessage.class}) + 
public void testPAssertEqualsSingletonFailsForEmptyPCollection() throws Exception { + PCollection pcollection = pipeline.apply(Create.empty(VarIntCoder.of())); + PAssert.thatSingleton("The value was not equal to 44", pcollection).isEqualTo(44); + + Throwable thrown = runExpectingAssertionFailure(pipeline); + + String message = thrown.getMessage(); + + assertThat(message, containsString("The value was not equal to 44")); + assertThat(message, containsString("expected singleton PCollection")); + assertThat(message, containsString("but was: empty PCollection")); + } + + @Test + @Category({ValidatesRunner.class, UsesFailureMessage.class}) + public void testPAssertEqualsSingletonFailsForNonSingletonPCollection() throws Exception { + PCollection pcollection = pipeline.apply(Create.of(44, 44)); + PAssert.thatSingleton("The value was not equal to 44", pcollection).isEqualTo(44); + + Throwable thrown = runExpectingAssertionFailure(pipeline); + + String message = thrown.getMessage(); + + assertThat(message, containsString("The value was not equal to 44")); + assertThat(message, containsString("expected one element")); + assertThat(message, containsString("but was: <44, 44>")); + } + /** Test that we throw an error for false assertion on singleton. */ @Test @Category({ValidatesRunner.class, UsesFailureMessage.class}) diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java index 5298d29dad10..0c9e0065f5a6 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java @@ -29,12 +29,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; import org.apache.beam.sdk.coders.ByteArrayCoder; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.hamcrest.collection.IsIterableContainingInOrder; import org.junit.Rule; @@ -165,7 +165,7 @@ public void testWritingByteWhenFinishedThrows() throws Exception { public void testWritingBytesWhenFinishedThrows() throws Exception { expectedException.expect(IOException.class); expectedException.expectMessage("Stream has been finished."); - testValues(toBytes("a")).write("b".getBytes(Charsets.UTF_8)); + testValues(toBytes("a")).write("b".getBytes(StandardCharsets.UTF_8)); } @Test @@ -203,7 +203,7 @@ public void testBehaviorWhenBufferPoolEmpty() throws Exception { private List toBytes(String... 
values) { ImmutableList.Builder builder = ImmutableList.builder(); for (String value : values) { - builder.add(value.getBytes(Charsets.UTF_8)); + builder.add(value.getBytes(StandardCharsets.UTF_8)); } return builder.build(); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java index e87f6a2b0d0a..d26794274653 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java @@ -24,7 +24,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -33,7 +33,7 @@ @RunWith(JUnit4.class) public class ExposedByteArrayInputStreamTest { - private static final byte[] TEST_DATA = "Hello World!".getBytes(Charsets.UTF_8); + private static final byte[] TEST_DATA = "Hello World!".getBytes(StandardCharsets.UTF_8); private ByteArrayInputStream stream = new ByteArrayInputStream(TEST_DATA); @@ -74,6 +74,6 @@ public void testReadPartial() throws IOException { public void testReadAllAfterReadPartial() throws IOException { assertNotEquals(-1, exposedStream.read()); byte[] ret = exposedStream.readAll(); - assertArrayEquals("ello World!".getBytes(Charsets.UTF_8), ret); + assertArrayEquals("ello World!".getBytes(StandardCharsets.UTF_8), ret); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java index 7e1b213c85b2..a4a105a89ddc 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java @@ -25,7 +25,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -34,7 +34,7 @@ @RunWith(JUnit4.class) public class ExposedByteArrayOutputStreamTest { - private static final byte[] TEST_DATA = "Hello World!".getBytes(Charsets.UTF_8); + private static final byte[] TEST_DATA = "Hello World!".getBytes(StandardCharsets.UTF_8); private ExposedByteArrayOutputStream exposedStream = new ExposedByteArrayOutputStream(); private ByteArrayOutputStream stream = new ByteArrayOutputStream(); diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java index e15bd42dc3ce..1f3ec0f427b4 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java @@ -24,12 +24,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.util.List; import org.apache.beam.sdk.coders.AtomicCoder; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.CoderException; import 
org.apache.beam.sdk.testing.InterceptingUrlClassLoader; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.junit.Rule; import org.junit.Test; @@ -93,7 +93,7 @@ public void testDeserializationError() { expectedException.expect(IllegalArgumentException.class); expectedException.expectMessage("unable to deserialize a bogus string"); SerializableUtils.deserializeFromByteArray( - "this isn't legal".getBytes(Charsets.UTF_8), "a bogus string"); + "this isn't legal".getBytes(StandardCharsets.UTF_8), "a bogus string"); } /** A class that is not serializable by Java. */ diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/StringUtilsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/StringUtilsTest.java index 9e9686ca2011..e8b0e7ecd470 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/StringUtilsTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/StringUtilsTest.java @@ -17,9 +17,13 @@ */ package org.apache.beam.sdk.util; +import static org.apache.commons.lang3.StringUtils.countMatches; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import java.util.UUID; +import java.util.stream.IntStream; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -54,4 +58,23 @@ public void testLevenshteinDistance() { assertEquals(1, StringUtils.getLevenshteinDistance("abc", "ab1c")); // insertion assertEquals(1, StringUtils.getLevenshteinDistance("abc", "a1c")); // modification } + + @Test + public void testArrayToNewlines() { + Object[] uuids = IntStream.range(1, 10).mapToObj(unused -> UUID.randomUUID()).toArray(); + + String r1 = StringUtils.arrayToNewlines(uuids, 6); + assertTrue(r1.endsWith("...\n")); + assertEquals(7, countMatches(r1, "\n")); + String r2 = StringUtils.arrayToNewlines(uuids, 15); + String r3 = StringUtils.arrayToNewlines(uuids, 10); + assertEquals(r3, r2); + } + + @Test + public void testLeftTruncate() { + assertEquals("", StringUtils.leftTruncate(null, 3)); + assertEquals("", StringUtils.leftTruncate("", 3)); + assertEquals("abc...", StringUtils.leftTruncate("abcd", 3)); + } } diff --git a/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java b/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java index 3bd87c2ae5c7..1c8d515d5c85 100644 --- a/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java +++ b/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java @@ -33,6 +33,7 @@ import com.google.auto.value.AutoValue; import java.io.IOException; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; @@ -60,7 +61,6 @@ import org.apache.beam.sdk.util.ByteStringOutputStream; import org.apache.beam.sdk.util.construction.PipelineTranslation; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -84,7 +84,7 @@ public class ExpansionServiceTest { private ExpansionService expansionService = new ExpansionService(); public static final List BYTE_LIST = ImmutableList.of("testing", "compound", "coders").stream() - .map(str -> str.getBytes(Charsets.UTF_8)) + .map(str -> str.getBytes(StandardCharsets.UTF_8)) .collect(Collectors.toList()); public static final Map BYTE_KV_LIST = ImmutableList.of("testing", "compound", "coders").stream() diff --git a/sdks/java/extensions/arrow/src/main/java/org/apache/beam/sdk/extensions/arrow/ArrowConversion.java b/sdks/java/extensions/arrow/src/main/java/org/apache/beam/sdk/extensions/arrow/ArrowConversion.java index 88e6fefcf9d3..4b6538157fd0 100644 --- a/sdks/java/extensions/arrow/src/main/java/org/apache/beam/sdk/extensions/arrow/ArrowConversion.java +++ b/sdks/java/extensions/arrow/src/main/java/org/apache/beam/sdk/extensions/arrow/ArrowConversion.java @@ -46,6 +46,7 @@ import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; @@ -291,7 +292,7 @@ private FieldVectorListValueGetterFactory(List fieldVectors) { } @Override - public List create(Class clazz, Schema schema) { + public List create(TypeDescriptor typeDescriptor, Schema schema) { return this.fieldVectors.stream() .map( (fieldVector) -> { diff --git a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java index 67f386411d81..e56b95d7f8a6 100644 --- a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java +++ b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.extensions.avro; +import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.List; import org.apache.avro.Schema; @@ -24,7 +25,6 @@ import org.apache.beam.sdk.extensions.avro.coders.AvroGenericCoder; import org.apache.beam.sdk.util.construction.CoderTranslation.TranslationContext; import org.apache.beam.sdk.util.construction.CoderTranslator; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; /** Coder translator for AvroGenericCoder. 
*/ public class AvroGenericCoderTranslator implements CoderTranslator { @@ -35,13 +35,13 @@ public List> getComponents(AvroGenericCoder from) { @Override public byte[] getPayload(AvroGenericCoder from) { - return from.getSchema().toString().getBytes(Charsets.UTF_8); + return from.getSchema().toString().getBytes(StandardCharsets.UTF_8); } @Override public AvroGenericCoder fromComponents( List> components, byte[] payload, TranslationContext context) { - Schema schema = new Schema.Parser().parse(new String(payload, Charsets.UTF_8)); + Schema schema = new Schema.Parser().parse(new String(payload, StandardCharsets.UTF_8)); return AvroGenericCoder.of(schema); } } diff --git a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/AvroRecordSchema.java b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/AvroRecordSchema.java index e3bf24621cd5..e75647a2ccfa 100644 --- a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/AvroRecordSchema.java +++ b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/AvroRecordSchema.java @@ -21,7 +21,7 @@ import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; import org.apache.beam.sdk.schemas.FieldValueGetter; import org.apache.beam.sdk.schemas.FieldValueTypeInformation; -import org.apache.beam.sdk.schemas.GetterBasedSchemaProvider; +import org.apache.beam.sdk.schemas.GetterBasedSchemaProviderV2; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaProvider; import org.apache.beam.sdk.schemas.SchemaUserTypeCreator; @@ -37,25 +37,27 @@ @SuppressWarnings({ "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) }) -public class AvroRecordSchema extends GetterBasedSchemaProvider { +public class AvroRecordSchema extends GetterBasedSchemaProviderV2 { @Override public Schema schemaFor(TypeDescriptor typeDescriptor) { return AvroUtils.toBeamSchema(typeDescriptor.getRawType()); } @Override - public List fieldValueGetters(Class targetClass, Schema schema) { - return AvroUtils.getGetters(targetClass, schema); + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { + return AvroUtils.getGetters(targetTypeDescriptor, schema); } @Override public List fieldValueTypeInformations( - Class targetClass, Schema schema) { - return AvroUtils.getFieldTypes(targetClass, schema); + TypeDescriptor targetTypeDescriptor, Schema schema) { + return AvroUtils.getFieldTypes(targetTypeDescriptor, schema); } @Override - public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { - return AvroUtils.getCreator(targetClass, schema); + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { + return AvroUtils.getCreator(targetTypeDescriptor, schema); } } diff --git a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/utils/AvroUtils.java b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/utils/AvroUtils.java index 7622132c7e27..1b1c45969307 100644 --- a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/utils/AvroUtils.java +++ b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/schemas/utils/AvroUtils.java @@ -808,14 +808,14 @@ public static SchemaCoder schemaCoder(AvroCoder avroCoder) { private static final class AvroSpecificRecordFieldValueTypeSupplier implements FieldValueTypeSupplier 
{ @Override - public List get(Class clazz) { + public List get(TypeDescriptor typeDescriptor) { throw new RuntimeException("Unexpected call."); } @Override - public List get(Class clazz, Schema schema) { + public List get(TypeDescriptor typeDescriptor, Schema schema) { Map mapping = getMapping(schema); - List methods = ReflectUtils.getMethods(clazz); + List methods = ReflectUtils.getMethods(typeDescriptor.getRawType()); List types = Lists.newArrayList(); for (int i = 0; i < methods.size(); ++i) { Method method = methods.get(i); @@ -864,8 +864,9 @@ private Map getMapping(Schema schema) { private static final class AvroPojoFieldValueTypeSupplier implements FieldValueTypeSupplier { @Override - public List get(Class clazz) { - List classFields = ReflectUtils.getFields(clazz); + public List get(TypeDescriptor typeDescriptor) { + List classFields = + ReflectUtils.getFields(typeDescriptor.getRawType()); Map types = Maps.newHashMap(); for (int i = 0; i < classFields.size(); ++i) { java.lang.reflect.Field f = classFields.get(i); @@ -883,36 +884,46 @@ public List get(Class clazz) { } /** Get field types for an AVRO-generated SpecificRecord or a POJO. */ - public static List getFieldTypes(Class clazz, Schema schema) { - if (TypeDescriptor.of(clazz).isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { + public static List getFieldTypes( + TypeDescriptor typeDescriptor, Schema schema) { + if (typeDescriptor.isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { return JavaBeanUtils.getFieldTypes( - clazz, schema, new AvroSpecificRecordFieldValueTypeSupplier()); + typeDescriptor, schema, new AvroSpecificRecordFieldValueTypeSupplier()); } else { - return POJOUtils.getFieldTypes(clazz, schema, new AvroPojoFieldValueTypeSupplier()); + return POJOUtils.getFieldTypes(typeDescriptor, schema, new AvroPojoFieldValueTypeSupplier()); } } /** Get generated getters for an AVRO-generated SpecificRecord or a POJO. */ - public static List getGetters(Class clazz, Schema schema) { - if (TypeDescriptor.of(clazz).isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { + public static List getGetters( + TypeDescriptor typeDescriptor, Schema schema) { + if (typeDescriptor.isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { return JavaBeanUtils.getGetters( - clazz, + typeDescriptor, schema, new AvroSpecificRecordFieldValueTypeSupplier(), new AvroTypeConversionFactory()); } else { return POJOUtils.getGetters( - clazz, schema, new AvroPojoFieldValueTypeSupplier(), new AvroTypeConversionFactory()); + typeDescriptor, + schema, + new AvroPojoFieldValueTypeSupplier(), + new AvroTypeConversionFactory()); } } /** Get an object creator for an AVRO-generated SpecificRecord. 
*/ - public static SchemaUserTypeCreator getCreator(Class clazz, Schema schema) { - if (TypeDescriptor.of(clazz).isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { - return AvroByteBuddyUtils.getCreator((Class) clazz, schema); + public static SchemaUserTypeCreator getCreator( + TypeDescriptor typeDescriptor, Schema schema) { + if (typeDescriptor.isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { + return AvroByteBuddyUtils.getCreator( + (Class) typeDescriptor.getRawType(), schema); } else { return POJOUtils.getSetFieldCreator( - clazz, schema, new AvroPojoFieldValueTypeSupplier(), new AvroTypeConversionFactory()); + typeDescriptor, + schema, + new AvroPojoFieldValueTypeSupplier(), + new AvroTypeConversionFactory()); } } diff --git a/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java b/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java index 30a1a7787252..2a0bc36f6e9e 100644 --- a/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java +++ b/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java @@ -36,6 +36,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -100,7 +101,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TimestampedValue; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -1436,7 +1436,7 @@ public void testMetadata() throws Exception { "longKey", 100L, "bytesKey", - "bytesValue".getBytes(Charsets.UTF_8)))); + "bytesValue".getBytes(StandardCharsets.UTF_8)))); writePipeline.run(); try (DataFileStream dataFileStream = @@ -1444,7 +1444,7 @@ public void testMetadata() throws Exception { assertEquals("stringValue", dataFileStream.getMetaString("stringKey")); assertEquals(100L, dataFileStream.getMetaLong("longKey")); assertArrayEquals( - "bytesValue".getBytes(Charsets.UTF_8), dataFileStream.getMeta("bytesKey")); + "bytesValue".getBytes(StandardCharsets.UTF_8), dataFileStream.getMeta("bytesKey")); } } diff --git a/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/RetryHttpRequestInitializer.java b/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/RetryHttpRequestInitializer.java index d053a5f4bf80..b48dc6368050 100644 --- a/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/RetryHttpRequestInitializer.java +++ b/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/RetryHttpRequestInitializer.java @@ -75,7 +75,7 @@ private static class LoggingHttpBackOffHandler private final Set ignoredResponseCodes; // aggregate the total time spent in exponential backoff private final Counter throttlingMsecs = - Metrics.counter(LoggingHttpBackOffHandler.class, "throttling-msecs"); + Metrics.counter(LoggingHttpBackOffHandler.class, Metrics.THROTTLE_TIME_COUNTER_NAME); private int 
ioExceptionRetries; private int unsuccessfulResponseRetries; private @Nullable CustomHttpErrors customHttpErrors; diff --git a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteBuddyUtils.java b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteBuddyUtils.java index bb2e267bae23..d159e9de44a8 100644 --- a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteBuddyUtils.java +++ b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteBuddyUtils.java @@ -476,7 +476,8 @@ public static List getGetters( return CACHED_GETTERS.computeIfAbsent( ClassWithSchema.create(clazz, schema), c -> { - List types = fieldValueTypeSupplier.get(clazz, schema); + List types = + fieldValueTypeSupplier.get(TypeDescriptor.of(clazz), schema); return types.stream() .map( t -> @@ -965,7 +966,7 @@ private static FieldValueGetter createGetter( // Create a map of case enum value to getter. This must be sorted, so store in a TreeMap. TreeMap> oneOfGetters = Maps.newTreeMap(); Map oneOfFieldTypes = - fieldValueTypeSupplier.get(clazz, oneOfType.getOneOfSchema()).stream() + fieldValueTypeSupplier.get(TypeDescriptor.of(clazz), oneOfType.getOneOfSchema()).stream() .collect(Collectors.toMap(FieldValueTypeInformation::getName, f -> f)); for (Field oneOfField : oneOfType.getOneOfSchema().getFields()) { int protoFieldIndex = getFieldNumber(oneOfField); diff --git a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoMessageSchema.java b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoMessageSchema.java index 1b3d42e35536..faf3ad407af5 100644 --- a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoMessageSchema.java +++ b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoMessageSchema.java @@ -28,7 +28,7 @@ import org.apache.beam.sdk.extensions.protobuf.ProtoByteBuddyUtils.ProtoTypeConversionsFactory; import org.apache.beam.sdk.schemas.FieldValueGetter; import org.apache.beam.sdk.schemas.FieldValueTypeInformation; -import org.apache.beam.sdk.schemas.GetterBasedSchemaProvider; +import org.apache.beam.sdk.schemas.GetterBasedSchemaProviderV2; import org.apache.beam.sdk.schemas.RowMessages; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; @@ -49,17 +49,17 @@ "nullness", // TODO(https://github.com/apache/beam/issues/20497) "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) }) -public class ProtoMessageSchema extends GetterBasedSchemaProvider { +public class ProtoMessageSchema extends GetterBasedSchemaProviderV2 { private static final class ProtoClassFieldValueTypeSupplier implements FieldValueTypeSupplier { @Override - public List get(Class clazz) { + public List get(TypeDescriptor typeDescriptor) { throw new RuntimeException("Unexpected call."); } @Override - public List get(Class clazz, Schema schema) { - Multimap methods = ReflectUtils.getMethodsMap(clazz); + public List get(TypeDescriptor typeDescriptor, Schema schema) { + Multimap methods = ReflectUtils.getMethodsMap(typeDescriptor.getRawType()); List types = Lists.newArrayListWithCapacity(schema.getFieldCount()); for (int i = 0; i < schema.getFieldCount(); ++i) { @@ -96,9 +96,10 @@ public List get(Class clazz, Schema schema) { } @Override - public List fieldValueGetters(Class targetClass, Schema 
schema) { + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { return ProtoByteBuddyUtils.getGetters( - targetClass, + targetTypeDescriptor.getRawType(), schema, new ProtoClassFieldValueTypeSupplier(), new ProtoTypeConversionsFactory()); @@ -106,17 +107,19 @@ public List fieldValueGetters(Class targetClass, Schema sch @Override public List fieldValueTypeInformations( - Class targetClass, Schema schema) { - return JavaBeanUtils.getFieldTypes(targetClass, schema, new ProtoClassFieldValueTypeSupplier()); + TypeDescriptor targetTypeDescriptor, Schema schema) { + return JavaBeanUtils.getFieldTypes( + targetTypeDescriptor, schema, new ProtoClassFieldValueTypeSupplier()); } @Override - public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { SchemaUserTypeCreator creator = ProtoByteBuddyUtils.getBuilderCreator( - targetClass, schema, new ProtoClassFieldValueTypeSupplier()); + targetTypeDescriptor.getRawType(), schema, new ProtoClassFieldValueTypeSupplier()); if (creator == null) { - throw new RuntimeException("Cannot create creator for " + targetClass); + throw new RuntimeException("Cannot create creator for " + targetTypeDescriptor); } return creator; } diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java index e4e5f3533445..c23a771f3cc8 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -61,7 +62,6 @@ import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -499,7 +499,8 @@ public OutputT expand(InputT input) { requirementsFile.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(requirementsFile.getAbsolutePath()), + StandardCharsets.UTF_8)) { for (String pkg : extraPackages) { fout.write(pkg); fout.write('\n'); diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java index 4392f23c4636..ab4d02ec838d 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java @@ -24,13 +24,13 @@ import java.io.InputStreamReader; import 
java.net.ServerSocket; import java.net.Socket; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeoutException; import org.apache.beam.sdk.util.ReleaseInfo; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.ByteStreams; import org.slf4j.Logger; @@ -106,7 +106,8 @@ public AutoCloseable start() throws IOException, InterruptedException { new ProcessBuilder(bootstrapCommand).redirectError(ProcessBuilder.Redirect.INHERIT).start(); bootstrap.getOutputStream().close(); BufferedReader reader = - new BufferedReader(new InputStreamReader(bootstrap.getInputStream(), Charsets.UTF_8)); + new BufferedReader( + new InputStreamReader(bootstrap.getInputStream(), StandardCharsets.UTF_8)); String lastLine = reader.readLine(); String lastNonEmptyLine = lastLine; while (lastLine != null) { diff --git a/sdks/java/extensions/sql/jdbc/build.gradle b/sdks/java/extensions/sql/jdbc/build.gradle index 41fddce7116a..c5d462e0f5ca 100644 --- a/sdks/java/extensions/sql/jdbc/build.gradle +++ b/sdks/java/extensions/sql/jdbc/build.gradle @@ -35,11 +35,11 @@ dependencies { implementation "jline:jline:2.14.6" permitUnusedDeclared "jline:jline:2.14.6" // BEAM-11761 implementation "sqlline:sqlline:1.4.0" - implementation library.java.vendored_guava_32_1_2_jre implementation library.java.vendored_calcite_1_28_0 permitUnusedDeclared library.java.vendored_calcite_1_28_0 testImplementation project(path: ":sdks:java:io:google-cloud-platform", configuration: "testRuntimeMigration") testImplementation library.java.junit + testImplementation library.java.vendored_guava_32_1_2_jre // Depending on outputs so integrationTest can run with only test dependencies. // This enables us to test the JDBC jar being loaded on a custom classloader. 
integrationTest sourceSets.test.output diff --git a/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java b/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java index ac049608ebcb..8c87343cd7c1 100644 --- a/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java +++ b/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java @@ -23,10 +23,10 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.checkerframework.checker.nullness.qual.Nullable; import sqlline.SqlLine; import sqlline.SqlLine.Status; @@ -68,11 +68,11 @@ static Status runSqlLine( SqlLine sqlLine = new SqlLine(); if (outputStream != null) { - sqlLine.setOutputStream(new PrintStream(outputStream, false, Charsets.UTF_8.name())); + sqlLine.setOutputStream(new PrintStream(outputStream, false, StandardCharsets.UTF_8.name())); } if (errorStream != null) { - sqlLine.setErrorStream(new PrintStream(errorStream, false, Charsets.UTF_8.name())); + sqlLine.setErrorStream(new PrintStream(errorStream, false, StandardCharsets.UTF_8.name())); } return sqlLine.begin(modifiedArgs, inputStream, true); diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java index e5a46f877001..e34106db1d93 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java @@ -21,6 +21,7 @@ import static org.hamcrest.Matchers.containsInAnyOrder; import java.io.File; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import org.apache.beam.sdk.extensions.sql.SqlTransform; import org.apache.beam.sdk.schemas.Schema; @@ -33,7 +34,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TypeDescriptors; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -81,7 +81,7 @@ protected void after() {} public void testLegacyDefaultCsv() throws Exception { Files.write( tempFolder.newFile("test.csv").toPath(), - "hello,13\n\ngoodbye,42\n".getBytes(Charsets.UTF_8)); + "hello,13\n\ngoodbye,42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = String.format( @@ -105,7 +105,7 @@ public void testLegacyDefaultCsv() throws Exception { public void testLegacyTdfCsv() throws Exception { Files.write( tempFolder.newFile("test.csv").toPath(), - "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8)); + "hello\t13\n\ngoodbye\t42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -138,7 +138,7 @@ public void testLegacyTdfCsv() throws Exception { public void testExplicitCsv() throws Exception { Files.write( tempFolder.newFile("test.csv").toPath(), - "hello,13\n\ngoodbye,42\n".getBytes(Charsets.UTF_8)); + 
"hello,13\n\ngoodbye,42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -165,7 +165,8 @@ public void testExplicitCsv() throws Exception { @Test public void testExplicitCsvExcel() throws Exception { Files.write( - tempFolder.newFile("test.csv").toPath(), "hello\n\ngoodbye\n".getBytes(Charsets.UTF_8)); + tempFolder.newFile("test.csv").toPath(), + "hello\n\ngoodbye\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -191,7 +192,8 @@ public void testExplicitCsvExcel() throws Exception { public void testLines() throws Exception { // Data that looks like CSV but isn't parsed as it Files.write( - tempFolder.newFile("test.csv").toPath(), "hello,13\ngoodbye,42\n".getBytes(Charsets.UTF_8)); + tempFolder.newFile("test.csv").toPath(), + "hello,13\ngoodbye,42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -210,7 +212,8 @@ public void testLines() throws Exception { @Test public void testJson() throws Exception { - Files.write(tempFolder.newFile("test.json").toPath(), JSON_TEXT.getBytes(Charsets.UTF_8)); + Files.write( + tempFolder.newFile("test.json").toPath(), JSON_TEXT.getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -229,7 +232,8 @@ public void testJson() throws Exception { public void testInvalidJson() throws Exception { File deadLetterFile = new File(tempFolder.getRoot(), "dead-letter-file"); Files.write( - tempFolder.newFile("test.json").toPath(), INVALID_JSON_TEXT.getBytes(Charsets.UTF_8)); + tempFolder.newFile("test.json").toPath(), + INVALID_JSON_TEXT.getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/state/FnApiStateAccessor.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/state/FnApiStateAccessor.java index 93f89301d158..5b304890b354 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/state/FnApiStateAccessor.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/state/FnApiStateAccessor.java @@ -813,14 +813,8 @@ public WatermarkHoldState bindWatermark( private Cache getCacheFor(StateKey stateKey) { switch (stateKey.getTypeCase()) { case BAG_USER_STATE: - for (CacheToken token : cacheTokens.get()) { - if (!token.hasUserState()) { - continue; - } - return Caches.subCache(processWideCache, token, stateKey); - } - break; case MULTIMAP_KEYS_USER_STATE: + case ORDERED_LIST_USER_STATE: for (CacheToken token : cacheTokens.get()) { if (!token.hasUserState()) { continue; diff --git a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java index 7cc3562dac8a..482e3aa9cc92 100644 --- a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java +++ b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java @@ -95,14 +95,14 @@ public Region create(PipelineOptions options) { * *

{@code --awsCredentialsProvider={
    *   "@type": "StaticCredentialsProvider",
-   *   "awsAccessKeyId": "key_id_value",
-   *   "awsSecretKey": "secret_value"
+   *   "accessKeyId": "key_id_value",
+   *   "secretAccessKey": "secret_value"
    * }
    *
    * --awsCredentialsProvider={
    *   "@type": "StaticCredentialsProvider",
-   *   "awsAccessKeyId": "key_id_value",
-   *   "awsSecretKey": "secret_value",
+   *   "accessKeyId": "key_id_value",
+   *   "secretAccessKey": "secret_value",
    *   "sessionToken": "token_value"
    * }}
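For reference, a minimal Java sketch of the equivalent programmatic configuration for the JSON option shown above, assuming the AWS SDK v2 credentials classes and the AwsOptions#setAwsCredentialsProvider setter that this option backs; the wrapper class name and credential values are illustrative only:

    import org.apache.beam.sdk.io.aws2.options.AwsOptions;
    import org.apache.beam.sdk.options.PipelineOptionsFactory;
    import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
    import software.amazon.awssdk.auth.credentials.AwsSessionCredentials;
    import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;

    public class AwsCredentialsOptionsSketch {
      public static void main(String[] args) {
        AwsOptions options = PipelineOptionsFactory.fromArgs(args).as(AwsOptions.class);

        // Equivalent of the first JSON snippet above: static credentials without a session token.
        options.setAwsCredentialsProvider(
            StaticCredentialsProvider.create(
                AwsBasicCredentials.create("key_id_value", "secret_value")));

        // Equivalent of the second JSON snippet above: static session credentials with a token.
        options.setAwsCredentialsProvider(
            StaticCredentialsProvider.create(
                AwsSessionCredentials.create("key_id_value", "secret_value", "token_value")));
      }
    }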
* diff --git a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsSchemaProvider.java b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsSchemaProvider.java index 65812d72df1d..acdfcfc1ad09 100644 --- a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsSchemaProvider.java +++ b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsSchemaProvider.java @@ -38,7 +38,7 @@ import org.apache.beam.sdk.schemas.Factory; import org.apache.beam.sdk.schemas.FieldValueGetter; import org.apache.beam.sdk.schemas.FieldValueTypeInformation; -import org.apache.beam.sdk.schemas.GetterBasedSchemaProvider; +import org.apache.beam.sdk.schemas.GetterBasedSchemaProviderV2; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaUserTypeCreator; import org.apache.beam.sdk.transforms.SerializableFunction; @@ -60,7 +60,7 @@ * software.amazon.awssdk.services.dynamodb.model.AttributeValue DynamoDB AttributeValue} ({@link * org.apache.beam.sdk.io.aws2.dynamodb.AttributeValueCoder coder}). */ -public class AwsSchemaProvider extends GetterBasedSchemaProvider { +public class AwsSchemaProvider extends GetterBasedSchemaProviderV2 { /** Byte-code generated {@link SdkBuilder} factories. */ @SuppressWarnings("rawtypes") // Crashes checker otherwise private static final Map FACTORIES = Maps.newConcurrentMap(); @@ -75,9 +75,11 @@ public class AwsSchemaProvider extends GetterBasedSchemaProvider { @SuppressWarnings("rawtypes") @Override - public List fieldValueGetters(Class clazz, Schema schema) { + public List fieldValueGetters( + TypeDescriptor targetTypeDescriptor, Schema schema) { ConverterFactory fromAws = ConverterFactory.fromAws(); - Map> sdkFields = sdkFieldsByName((Class) clazz); + Map> sdkFields = + sdkFieldsByName((Class) targetTypeDescriptor.getRawType()); List getters = new ArrayList<>(schema.getFieldCount()); for (String field : schema.getFieldNames()) { SdkField sdkField = checkStateNotNull(sdkFields.get(field), "Unknown field"); @@ -91,7 +93,7 @@ public List fieldValueGetters(Class clazz, Schema schema) { @Override public SerializableFunction fromRowFunction(TypeDescriptor type) { checkState(SdkPojo.class.isAssignableFrom(type.getRawType()), "Unsupported type %s", type); - return FromRowFactory.create(type.getRawType()); + return FromRowFactory.create(type); } private static class FromRowWithBuilder @@ -114,7 +116,7 @@ public T apply(Row row) { } } SdkBuilder builder = sdkBuilder(cls); - List setters = factory.create(cls, row.getSchema()); + List setters = factory.create(TypeDescriptor.of(cls), row.getSchema()); for (SdkBuilderSetter set : setters) { if (!row.getSchema().hasField(set.name())) { continue; @@ -150,14 +152,19 @@ private static class FromRowFactory implements Factory(new SettersFactory()); @SuppressWarnings("nullness") // schema nullable for this factory - static SerializableFunction create(Class clazz) { - checkState(SdkPojo.class.isAssignableFrom(clazz), "Unsupported clazz %s", clazz); - return (SerializableFunction) new FromRowFactory().cachingFactory.create(clazz, null); + static SerializableFunction create(TypeDescriptor typeDescriptor) { + checkState( + SdkPojo.class.isAssignableFrom(typeDescriptor.getRawType()), + "Unsupported clazz %s", + typeDescriptor); + return (SerializableFunction) + new FromRowFactory().cachingFactory.create(typeDescriptor, null); } @Override - public SerializableFunction create(Class clazz, Schema 
ignored) { - return new FromRowWithBuilder<>((Class) clazz, settersFactory); + public SerializableFunction create(TypeDescriptor typeDescriptor, Schema ignored) { + return new FromRowWithBuilder<>( + (Class) typeDescriptor.getRawType(), settersFactory); } private class SettersFactory implements Factory> { @@ -168,8 +175,9 @@ private SettersFactory() { } @Override - public List create(Class clazz, Schema schema) { - Map> fields = sdkFieldsByName((Class) clazz); + public List create(TypeDescriptor typeDescriptor, Schema schema) { + Map> fields = + sdkFieldsByName((Class) typeDescriptor.getRawType()); checkForUnknownFields(schema, fields); List setters = new ArrayList<>(schema.getFieldCount()); @@ -192,12 +200,14 @@ private void checkForUnknownFields(Schema schema, Map> field } @Override - public List fieldValueTypeInformations(Class cls, Schema schema) { + public List fieldValueTypeInformations( + TypeDescriptor targetTypeDescriptor, Schema schema) { throw new UnsupportedOperationException("FieldValueTypeInformation not available"); } @Override - public SchemaUserTypeCreator schemaTypeCreator(Class cls, Schema schema) { + public SchemaUserTypeCreator schemaTypeCreator( + TypeDescriptor targetTypeDescriptor, Schema schema) { throw new UnsupportedOperationException("SchemaUserTypeCreator not available"); } diff --git a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsTypes.java b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsTypes.java index f5647c040526..a0fc0c8e91cd 100644 --- a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsTypes.java +++ b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/schemas/AwsTypes.java @@ -38,6 +38,7 @@ import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Ascii; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; @@ -210,7 +211,8 @@ private static class ToAws extends ConverterFactory { @Override @SuppressWarnings("nullness") // schema nullable for this factory protected SerializableFunction pojoTypeConverter(SdkField field) { - return fromRowFactory.create(targetClassOf(field.constructor().get()), null); + return fromRowFactory.create( + TypeDescriptor.of(targetClassOf(field.constructor().get())), null); } } diff --git a/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java b/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java index c8c49a656e3b..09a6ced44d37 100644 --- a/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java +++ b/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java @@ -21,12 +21,12 @@ import com.clickhouse.client.ClickHousePipedOutputStream; import com.clickhouse.client.data.BinaryStreamUtils; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.List; import org.apache.beam.sdk.io.clickhouse.TableSchema.ColumnType; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.RowWithStorage; -import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.joda.time.Days; import org.joda.time.Instant; @@ -60,7 +60,7 @@ static void writeValue(ClickHouseOutputStream stream, ColumnType columnType, Obj byte[] bytes; if (value instanceof String) { - bytes = ((String) value).getBytes(Charsets.UTF_8); + bytes = ((String) value).getBytes(StandardCharsets.UTF_8); } else { bytes = ((byte[]) value); } diff --git a/sdks/java/io/common/src/main/java/org/apache/beam/sdk/io/common/SchemaAwareJavaBeans.java b/sdks/java/io/common/src/main/java/org/apache/beam/sdk/io/common/SchemaAwareJavaBeans.java index b97d4ab8c5f8..76535c3e17f6 100644 --- a/sdks/java/io/common/src/main/java/org/apache/beam/sdk/io/common/SchemaAwareJavaBeans.java +++ b/sdks/java/io/common/src/main/java/org/apache/beam/sdk/io/common/SchemaAwareJavaBeans.java @@ -137,7 +137,7 @@ public static DoublyNestedDataTypes doublyNestedDataTypes( .build(); } - private static final TypeDescriptor + public static final TypeDescriptor ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR = TypeDescriptor.of(AllPrimitiveDataTypes.class); /** The schema for {@link AllPrimitiveDataTypes}. */ @@ -160,7 +160,7 @@ public static SerializableFunction allPrimitiveDataT return DEFAULT_SCHEMA_PROVIDER.fromRowFunction(ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR); } - private static final TypeDescriptor + public static final TypeDescriptor NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR = TypeDescriptor.of(NullableAllPrimitiveDataTypes.class); @@ -187,7 +187,7 @@ public static SerializableFunction allPrimitiveDataT NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR); } - private static final TypeDescriptor TIME_CONTAINING_TYPE_DESCRIPTOR = + public static final TypeDescriptor TIME_CONTAINING_TYPE_DESCRIPTOR = TypeDescriptor.of(TimeContaining.class); /** The schema for {@link TimeContaining}. */ @@ -250,7 +250,7 @@ public static SerializableFunction byteSequenceTypeFromRo return DEFAULT_SCHEMA_PROVIDER.fromRowFunction(BYTE_SEQUENCE_TYPE_TYPE_DESCRIPTOR); } - private static final TypeDescriptor + public static final TypeDescriptor ARRAY_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR = TypeDescriptor.of(ArrayPrimitiveDataTypes.class); /** The schema for {@link ArrayPrimitiveDataTypes}. 
*/ diff --git a/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java b/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java index 48904cae430f..2cc89a2a0dc1 100644 --- a/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java +++ b/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java @@ -44,6 +44,7 @@ import java.io.OutputStream; import java.io.PrintStream; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -87,7 +88,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Joiner; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -524,7 +524,7 @@ public String createFileFromList(List input) throws Exception { try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : input) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -548,7 +548,7 @@ public void multipleFilesTest() throws Exception { for (int lineNum = 0; lineNum < numLines; ++lineNum) { String elem = filename + " " + lineNum; byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -588,7 +588,7 @@ public void testWithHintMatchesManyFiles() throws IOException { for (int lineNum = 0; lineNum < 10 + num; ++lineNum) { String elem = filename + " " + lineNum; byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -817,7 +817,7 @@ private void runTestRead(String[] expected) throws Exception { try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -840,7 +840,7 @@ private void runTestReadLineNumsAndFileName(String[] expected) throws Exception int lineNum = 0; for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); actualExpected.add(lineNum + " " + filePath + " " + line); lineNum++; @@ -1121,7 +1121,7 @@ public void testProgressEmptyFile() throws IOException { public void testProgressTextFile() throws IOException { String file = "line1\nline2\nline3"; try (BoundedSource.BoundedReader reader = - prepareSource(file.getBytes(Charsets.UTF_8)) + 
prepareSource(file.getBytes(StandardCharsets.UTF_8)) .createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); @@ -1157,7 +1157,7 @@ public void testProgressTextFile() throws IOException { @Test public void testProgressAfterSplitting() throws IOException { String file = "line1\nline2\nline3"; - BoundedSource source = prepareSource(file.getBytes(Charsets.UTF_8)); + BoundedSource source = prepareSource(file.getBytes(StandardCharsets.UTF_8)); BoundedSource remainder; // Create the remainder, verifying properties pre- and post-splitting. diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java index 5bed0186e0d6..fc2b68c0a893 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.csv; import static java.util.Objects.requireNonNull; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import static org.apache.beam.sdk.values.TypeDescriptors.rows; import static org.apache.beam.sdk.values.TypeDescriptors.strings; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; @@ -35,8 +36,13 @@ import org.apache.beam.sdk.io.WriteFiles; import org.apache.beam.sdk.io.WriteFilesResult; import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.JavaBeanSchema; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.schemas.SchemaProvider; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.SerializableFunction; @@ -44,6 +50,7 @@ import org.apache.beam.sdk.transforms.display.HasDisplayData; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.commons.csv.CSVFormat; @@ -55,6 +62,72 @@ *

Reading from CSV files is not yet implemented. Please see https://github.com/apache/beam/issues/24552. * + *

Valid CSVFormat Configuration

+ * + *

A {@code + * CSVFormat} must meet the following conditions to be considered valid when reading CSV (a matching CSVFormat is sketched after this list): + * + *
+ *   A non-null, non-empty header whose column names are all non-blank.
+ *   allowMissingColumnNames disabled.
+ *   ignoreHeaderCase disabled.
+ *   allowDuplicateHeaderNames disabled.
+ *   skipHeaderRecord disabled, since the header is already accounted for during parsing.
+ * + *
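A minimal sketch of a CSVFormat that satisfies these conditions, mirroring the checks implemented in CsvIOParseHelpers.validateCsvFormat later in this diff; the explicit with* calls simply spell out settings that CSVFormat.DEFAULT already uses, and the column names are illustrative:

    import org.apache.commons.csv.CSVFormat;

    // Accepted by CsvIO.parse / CsvIO.parseRows: header is set, nothing in the header is
    // skipped, ignored, missing, or duplicated.
    CSVFormat valid =
        CSVFormat.DEFAULT
            .withHeader("someString", "someInteger") // non-empty header, non-blank column names
            .withAllowMissingColumnNames(false)
            .withIgnoreHeaderCase(false)
            .withAllowDuplicateHeaderNames(false)
            .withSkipHeaderRecord(false);            // the header record is consumed by CsvIO itself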

Ignored CSVFormat parameters

+ * + *

The following {@code CSVFormat} parameters are either not relevant for parsing CSV or are + * validated satisfactorily by the Apache Commons CSV + * library. + * + *

+ * *

Writing CSV files

* *

To write a {@link PCollection} to one or more CSV files, use {@link CsvIO.Write}, using {@link @@ -274,6 +347,161 @@ public static Write writeRows(String to, CSVFormat csvFormat) { .build(); } + /** + * Instantiates a {@link CsvIOParse} for parsing CSV string records into custom {@link + * Schema}-mapped {@code Class}es from the records' assumed CsvFormat. + * See the Beam + * Programming Guide on how to configure your custom {@code Class} for Beam to infer its + * {@link Schema} using a {@link SchemaProvider} annotation such as {@link AutoValueSchema} or + * {@link JavaBeanSchema}. + * + *

Example usage

+ * + * The example below illustrates parsing CsvFormat#DEFAULT + * formatted CSV string records, read from {@link TextIO.Read}, into an {@link AutoValueSchema} + * annotated AutoValue data + * class {@link PCollection}. + * + *
{@code
+   * // SomeDataClass is a data class configured for Beam to automatically infer its Schema.
+   * @DefaultSchema(AutoValueSchema.class)
+   * @AutoValue
+   * abstract class SomeDataClass {
+   *
+   *    abstract String getSomeString();
+   *    abstract Integer getSomeInteger();
+   *
+   *    @AutoValue.Builder
+   *    abstract static class Builder {
+   *      abstract Builder setSomeString(String value);
+   *      abstract Builder setSomeInteger(Integer value);
+   *
+   *      abstract SomeDataClass build();
+   *    }
+   * }
+   *
+   * // Pipeline example reads CSV string records from Google Cloud storage and writes to BigQuery.
+   * Pipeline pipeline = Pipeline.create();
+   *
+   * // Read CSV records from Google Cloud storage using TextIO.
+   * PCollection<String> csvRecords = pipeline
+   *  .apply(TextIO.read().from("gs://bucket/folder/*.csv"));
+   *
+   * // Apply the CSV records PCollection to the CsvIOParse transform instantiated using CsvIO.parse.
+   * CsvIOParseResult<SomeDataClass> result = csvRecords.apply(CsvIO.parse(
+   *      SomeDataClass.class,
+   *      CSVFormat.DEFAULT.withHeader("someString", "someInteger")
+   * ));
+   *
+   * // Acquire any processing errors to either write to logs or apply to a downstream dead letter queue such as BigQuery.
+   * result.getErrors().apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_errors")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   *
+   * // Acquire the successful PCollection output.
+   * PCollection<SomeDataClass> output = result.getOutput();
+   *
+   * // Do something with the output such as write to BigQuery.
+   * output.apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_output")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   * }
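A small sketch continuing the example above (illustrative only, not an API of CsvIO itself): collecting just the human-readable error messages rather than writing the full errors to BigQuery, using the getMessage() accessor on CsvIOParseError:

    // Continues the example above; MapElements and TypeDescriptors are from the Beam Java SDK.
    PCollection<String> errorMessages =
        result.getErrors()
            .apply(MapElements.into(TypeDescriptors.strings())
                .via(CsvIOParseError::getMessage));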
+ */ + public static CsvIOParse parse(Class klass, CSVFormat csvFormat) { + CsvIOParseHelpers.validateCsvFormat(csvFormat); + SchemaProvider provider = new DefaultSchema.DefaultSchemaProvider(); + TypeDescriptor type = TypeDescriptor.of(klass); + Schema schema = + checkStateNotNull( + provider.schemaFor(type), + "Illegal %s: Schema could not be generated from given %s class", + Schema.class, + klass); + CsvIOParseHelpers.validateCsvFormatWithSchema(csvFormat, schema); + SerializableFunction fromRowFn = + checkStateNotNull( + provider.fromRowFunction(type), + "FromRowFn could not be generated from the given %s class", + klass); + SerializableFunction toRowFn = + checkStateNotNull( + provider.toRowFunction(type), + "ToRowFn could not be generated from the given %s class", + klass); + SchemaCoder coder = SchemaCoder.of(schema, type, toRowFn, fromRowFn); + CsvIOParseConfiguration.Builder builder = CsvIOParseConfiguration.builder(); + builder.setCsvFormat(csvFormat).setSchema(schema).setCoder(coder).setFromRowFn(fromRowFn); + return CsvIOParse.builder().setConfigBuilder(builder).build(); + } + + /** + * Instantiates a {@link CsvIOParse} for parsing CSV string records into {@link Row}s from the + * records' assumed CsvFormat + * and expected {@link Schema}. + * + *

Example usage

+ * + * The example below illustrates parsing CsvFormat#DEFAULT + * formatted CSV string records, read from {@link TextIO.Read}, into a {@link Row} {@link + * PCollection}. + * + *
{@code
+   * // Define the expected Schema.
+   * Schema schema = Schema.of(
+   *  Schema.Field.of("someString", FieldType.STRING),
+   *  Schema.Field.of("someInteger", FieldType.INT32)
+   * );
+   *
+   * // Pipeline example reads CSV string records from Google Cloud storage and writes to BigQuery.
+   * Pipeline pipeline = Pipeline.create();
+   *
+   * // Read CSV records from Google Cloud storage using TextIO.
+   * PCollection<String> csvRecords = pipeline
+   *  .apply(TextIO.read().from("gs://bucket/folder/*.csv"));
+   *
+   * // Apply the CSV records PCollection to the CsvIOParse transform instantiated using CsvIO.parseRows.
+   * CsvIOParseResult<Row> result = csvRecords.apply(CsvIO.parseRows(
+   *      schema,
+   *      CSVFormat.DEFAULT.withHeader("someString", "someInteger")
+   * ));
+   *
+   * // Acquire any processing errors to either write to logs or apply to a downstream dead letter queue such as BigQuery.
+   * result.getErrors().apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_errors")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   *
+   * // Acquire the successful PCollection output.
+   * PCollection<Row> output = result.getOutput();
+   *
+   * // Do something with the output such as write to BigQuery.
+   * output.apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_output")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   * }
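Similarly, a short illustrative sketch of reading a typed field back out of the Row output of parseRows, assuming the schema declared in the example above:

    // Continues the example above; each output Row carries the declared schema, so fields are read by name.
    PCollection<String> someStrings =
        output.apply(MapElements.into(TypeDescriptors.strings())
            .via(row -> row.getString("someString")));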
+ */ + public static CsvIOParse parseRows(Schema schema, CSVFormat csvFormat) { + CsvIOParseHelpers.validateCsvFormat(csvFormat); + CsvIOParseHelpers.validateCsvFormatWithSchema(csvFormat, schema); + RowCoder coder = RowCoder.of(schema); + CsvIOParseConfiguration.Builder builder = CsvIOParseConfiguration.builder(); + builder.setCsvFormat(csvFormat).setSchema(schema).setCoder(coder).setFromRowFn(row -> row); + return CsvIOParse.builder().setConfigBuilder(builder).build(); + } + /** {@link PTransform} for writing CSV files. */ @AutoValue public abstract static class Write extends PTransform, WriteFilesResult> diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java new file mode 100644 index 000000000000..0a27cdbc57ec --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import com.google.auto.value.AutoValue; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.Flatten; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionList; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.TupleTag; + +/** + * {@link PTransform} for Parsing CSV Record Strings into {@link Schema}-mapped target types. {@link + * CsvIOParse} is not instantiated directly but via {@link CsvIO#parse} or {@link CsvIO#parseRows}. + */ +@AutoValue +public abstract class CsvIOParse extends PTransform, CsvIOParseResult> { + + final TupleTag outputTag = new TupleTag() {}; + final TupleTag errorTag = new TupleTag() {}; + + static CsvIOParse.Builder builder() { + return new AutoValue_CsvIOParse.Builder<>(); + } + + // TODO(https://github.com/apache/beam/issues/31875): Implement in future PR. + public CsvIOParse withCustomRecordParsing( + Map> customProcessingMap) { + return this; + } + + /** Contains all configuration parameters for {@link CsvIOParse}. 
*/ + abstract CsvIOParseConfiguration.Builder getConfigBuilder(); + + @AutoValue.Builder + abstract static class Builder { + abstract Builder setConfigBuilder(CsvIOParseConfiguration.Builder configBuilder); + + abstract CsvIOParse build(); + } + + @Override + public CsvIOParseResult expand(PCollection input) { + CsvIOParseConfiguration configuration = getConfigBuilder().build(); + + CsvIOStringToCsvRecord stringToCsvRecord = + new CsvIOStringToCsvRecord(configuration.getCsvFormat()); + CsvIOParseResult> stringToCsvRecordResult = input.apply(stringToCsvRecord); + PCollection> stringToRecordOutput = stringToCsvRecordResult.getOutput(); + PCollection stringToRecordErrors = stringToCsvRecordResult.getErrors(); + + CsvIORecordToObjects recordToObjects = new CsvIORecordToObjects(configuration); + CsvIOParseResult recordToObjectsResult = stringToRecordOutput.apply(recordToObjects); + PCollection output = recordToObjectsResult.getOutput(); + PCollection recordToObjectsErrors = recordToObjectsResult.getErrors(); + + PCollectionList errorList = + PCollectionList.of(stringToRecordErrors).and(recordToObjectsErrors); + PCollection errors = errorList.apply(Flatten.pCollections()); + + PCollectionTuple result = PCollectionTuple.of(outputTag, output).and(errorTag, errors); + return CsvIOParseResult.of(outputTag, configuration.getCoder(), errorTag, result); + } +} diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java index 22f06edc8322..dd9ef5b34868 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java @@ -18,26 +18,25 @@ package org.apache.beam.sdk.io.csv; import com.google.auto.value.AutoValue; +import java.io.Serializable; import java.util.HashMap; import java.util.Map; import java.util.Optional; +import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.Row; import org.apache.commons.csv.CSVFormat; /** Stores parameters needed for CSV record parsing. */ @AutoValue -abstract class CsvIOParseConfiguration { +abstract class CsvIOParseConfiguration implements Serializable { - static Builder builder() { - return new AutoValue_CsvIOParseConfiguration.Builder(); + static Builder builder() { + return new AutoValue_CsvIOParseConfiguration.Builder<>(); } - /** - * The expected CSVFormat - * of the parsed CSV record. - */ + /** The expected {@link CSVFormat} of the parsed CSV record. */ abstract CSVFormat getCsvFormat(); /** The expected {@link Schema} of the target type. */ @@ -46,20 +45,30 @@ static Builder builder() { /** A map of the {@link Schema.Field#getName()} to the custom CSV processing lambda. */ abstract Map> getCustomProcessingMap(); + /** The expected {@link Coder} of the target type. */ + abstract Coder getCoder(); + + /** A {@link SerializableFunction} that converts from Row to the target type. 
*/ + abstract SerializableFunction getFromRowFn(); + @AutoValue.Builder - abstract static class Builder { - abstract Builder setCsvFormat(CSVFormat csvFormat); + abstract static class Builder implements Serializable { + abstract Builder setCsvFormat(CSVFormat csvFormat); - abstract Builder setSchema(Schema schema); + abstract Builder setSchema(Schema schema); - abstract Builder setCustomProcessingMap( + abstract Builder setCustomProcessingMap( Map> customProcessingMap); + abstract Builder setCoder(Coder coder); + + abstract Builder setFromRowFn(SerializableFunction fromRowFn); + abstract Optional>> getCustomProcessingMap(); - abstract CsvIOParseConfiguration autoBuild(); + abstract CsvIOParseConfiguration autoBuild(); - final CsvIOParseConfiguration build() { + final CsvIOParseConfiguration build() { if (!getCustomProcessingMap().isPresent()) { setCustomProcessingMap(new HashMap<>()); } diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseError.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseError.java index ad7d05912faa..7a2be9786d78 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseError.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseError.java @@ -17,9 +17,18 @@ */ package org.apache.beam.sdk.io.csv; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.schemas.SchemaProvider; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; @@ -36,6 +45,21 @@ static Builder builder() { return new AutoValue_CsvIOParseError.Builder(); } + private static final SchemaProvider SCHEMA_PROVIDER = new AutoValueSchema(); + + private static final TypeDescriptor TYPE = + TypeDescriptor.of(CsvIOParseError.class); + + private static final Schema SCHEMA = checkStateNotNull(SCHEMA_PROVIDER.schemaFor(TYPE)); + + private static final SerializableFunction TO_ROW_FN = + checkStateNotNull(SCHEMA_PROVIDER.toRowFunction(TYPE)); + + private static final SerializableFunction FROM_ROW_FN = + checkStateNotNull(SCHEMA_PROVIDER.fromRowFunction(TYPE)); + + static final Coder CODER = SchemaCoder.of(SCHEMA, TYPE, TO_ROW_FN, FROM_ROW_FN); + /** The caught {@link Exception#getMessage()}. 
*/ public abstract String getMessage(); diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpers.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpers.java index df99807cfeaf..15a398d3c557 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpers.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpers.java @@ -17,33 +17,108 @@ */ package org.apache.beam.sdk.io.csv; +import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; + import java.math.BigDecimal; -import java.time.Instant; -import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.commons.csv.CSVFormat; +import org.joda.time.Instant; /** A utility class containing shared methods for parsing CSV records. */ final class CsvIOParseHelpers { - /** Validate the {@link CSVFormat} for CSV record parsing requirements. */ - // TODO(https://github.com/apache/beam/issues/31712): implement method. - static void validate(CSVFormat format) {} + /** + * Validate the {@link CSVFormat} for CSV record parsing requirements. See the public-facing + * "Reading CSV Files" section of the {@link CsvIO} documentation for information regarding which + * {@link CSVFormat} parameters are checked during validation. + */ + static void validateCsvFormat(CSVFormat format) { + String[] header = + checkArgumentNotNull(format.getHeader(), "Illegal %s: header is required", CSVFormat.class); + + checkArgument(header.length > 0, "Illegal %s: header cannot be empty", CSVFormat.class); + + checkArgument( + !format.getAllowMissingColumnNames(), + "Illegal %s: cannot allow missing column names", + CSVFormat.class); + + checkArgument( + !format.getIgnoreHeaderCase(), "Illegal %s: cannot ignore header case", CSVFormat.class); + + checkArgument( + !format.getAllowDuplicateHeaderNames(), + "Illegal %s: cannot allow duplicate header names", + CSVFormat.class); + + for (String columnName : header) { + checkArgument( + !Strings.isNullOrEmpty(columnName), + "Illegal %s: column name is required", + CSVFormat.class); + } + checkArgument( + !format.getSkipHeaderRecord(), + "Illegal %s: cannot skip header record because the header is already accounted for", + CSVFormat.class); + } /** * Validate the {@link CSVFormat} in relation to the {@link Schema} for CSV record parsing * requirements. */ - // TODO(https://github.com/apache/beam/issues/31716): implement method. - static void validate(CSVFormat format, Schema schema) {} + static void validateCsvFormatWithSchema(CSVFormat format, Schema schema) { + List header = Arrays.asList(format.getHeader()); + for (Schema.Field field : schema.getFields()) { + String fieldName = field.getName(); + if (!field.getType().getNullable()) { + checkArgument( + header.contains(fieldName), + "Illegal %s: required %s field '%s' not found in header", + CSVFormat.class, + Schema.class.getTypeName(), + fieldName); + } + } + } /** * Build a {@link List} of {@link Schema.Field}s corresponding to the expected position of each * field within the CSV record. */ - // TODO(https://github.com/apache/beam/issues/31718): implement method. 
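// Illustrative behavior of the new implementation below (field names here are hypothetical):
// given a header of ["name", "age"] and a schema with a required "age" field plus a nullable
// "nickname" field, the returned map is {1 -> age}; the nullable "nickname" field is absent
// from the header, resolves to index -1, and is skipped, while a required field missing from
// the header throws IllegalArgumentException.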
- static List mapFieldPositions(CSVFormat format, Schema schema) { - return new ArrayList<>(); + static Map mapFieldPositions(CSVFormat format, Schema schema) { + List header = Arrays.asList(format.getHeader()); + Map indexToFieldMap = new HashMap<>(); + for (Schema.Field field : schema.getFields()) { + int index = getIndex(header, field); + if (index >= 0) { + indexToFieldMap.put(index, field); + } + } + return indexToFieldMap; + } + + /** + * Attains expected index from {@link CSVFormat's} header matching a given {@link Schema.Field}. + */ + private static int getIndex(List header, Schema.Field field) { + String fieldName = field.getName(); + boolean presentInHeader = header.contains(fieldName); + boolean isNullable = field.getType().getNullable(); + if (presentInHeader) { + return header.indexOf(fieldName); + } + if (isNullable) { + return -1; + } + + throw new IllegalArgumentException( + String.format("header does not contain required %s field: %s", Schema.class, fieldName)); } /** diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseKV.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseKV.java new file mode 100644 index 000000000000..6ddafdccd9fa --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseKV.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.commons.csv.CSVRecord; + +/** + * A {@link PTransform} that takes an input {@link PCollection>} and outputs a + * {@link PCollection} of custom type. + */ +// TODO(https://github.com/apache/beam/issues/31873): implement class after all dependencies are +// completed. +class CsvIOParseKV + extends PTransform>>, CsvIOParseResult> { + + private final Coder outputCoder; + + private CsvIOParseKV(Coder outputCoder) { + this.outputCoder = outputCoder; + } + + // TODO(https://github.com/apache/beam/issues/31873): implement method. + @Override + public CsvIOParseResult expand(PCollection>> input) { + return CsvIOParseResult.empty(input.getPipeline(), outputCoder); + } +} diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseResult.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseResult.java new file mode 100644 index 000000000000..77264fccd2c9 --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseResult.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import java.util.Map; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; + +/** + * The {@link T} and {@link CsvIOParseError} {@link PCollection} results of parsing CSV records. Use + * {@link #getOutput()} and {@link #getErrors()} to apply these results in a pipeline. + */ +public class CsvIOParseResult implements POutput { + + static CsvIOParseResult of( + TupleTag outputTag, + Coder outputCoder, + TupleTag errorTag, + PCollectionTuple pct) { + return new CsvIOParseResult<>(outputTag, outputCoder, errorTag, pct); + } + + static CsvIOParseResult empty(Pipeline pipeline, Coder outputCoder) { + return new CsvIOParseResult<>( + new TupleTag() {}, + outputCoder, + new TupleTag() {}, + PCollectionTuple.empty(pipeline)); + } + + private final Pipeline pipeline; + private final TupleTag outputTag; + private final PCollection output; + private final TupleTag errorTag; + private final PCollection errors; + + private CsvIOParseResult( + TupleTag outputTag, + Coder outputCoder, + TupleTag errorTag, + PCollectionTuple pct) { + this.outputTag = outputTag; + this.errorTag = errorTag; + this.pipeline = pct.getPipeline(); + this.output = pct.get(outputTag).setCoder(outputCoder); + this.errors = pct.get(errorTag).setCoder(CsvIOParseError.CODER); + } + + /** The {@link T} {@link PCollection} as a result of successfully parsing CSV records. */ + public PCollection getOutput() { + return output; + } + + /** + * The {@link CsvIOParseError} {@link PCollection} as a result of errors associated with parsing + * CSV records. + */ + public PCollection getErrors() { + return errors; + } + + @Override + public Pipeline getPipeline() { + return pipeline; + } + + @Override + public Map, PValue> expand() { + return ImmutableMap.of( + outputTag, output, + errorTag, errors); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} +} diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOReadFiles.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOReadFiles.java index 3e0b36b85c2e..b28072091326 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOReadFiles.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOReadFiles.java @@ -31,9 +31,9 @@ // dependencies are completed. 
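/*
 * A minimal sketch of consuming a CsvIOParseResult in a pipeline (names such as Foo,
 * parseTransform, ProcessFooFn, and WriteParseErrorsFn are illustrative assumptions, not part
 * of this change):
 *
 *   CsvIOParseResult<Foo> result = lines.apply(parseTransform);
 *   result.getOutput().apply("ProcessRows", ParDo.of(new ProcessFooFn()));
 *   result.getErrors().apply("DeadLetter", ParDo.of(new WriteParseErrorsFn()));
 *
 * Keeping successes and CsvIOParseError records in separate PCollections lets malformed CSV
 * lines be dead-lettered without failing the whole pipeline.
 */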
class CsvIOReadFiles extends PTransform, PCollection> { /** Stores required parameters for parsing. */ - private final CsvIOParseConfiguration.Builder configBuilder; + private final CsvIOParseConfiguration.Builder configBuilder; - CsvIOReadFiles(CsvIOParseConfiguration.Builder configBuilder) { + CsvIOReadFiles(CsvIOParseConfiguration.Builder configBuilder) { this.configBuilder = configBuilder; } diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIORecordToObjects.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIORecordToObjects.java new file mode 100644 index 000000000000..97bceb479345 --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIORecordToObjects.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Throwables; +import org.joda.time.Instant; + +/** + * {@link CsvIORecordToObjects} is a class that takes an input of {@link PCollection>} + * and outputs custom type {@link PCollection}. + */ +class CsvIORecordToObjects extends PTransform>, CsvIOParseResult> { + + /** The expected {@link Schema} of the target type. */ + private final Schema schema; + + /** A map of the {@link Schema.Field#getName()} to the custom CSV processing lambda. */ + private final Map> customProcessingMap; + + /** A {@link Map} of {@link Schema.Field}s to their expected positions within the CSV record. */ + private final Map indexToFieldMap; + + private final TupleTag outputTag = new TupleTag() {}; + + private final TupleTag errorTag = new TupleTag() {}; + + /** + * A {@link SerializableFunction} that converts from {@link Row} to {@link Schema} mapped custom + * type. + */ + private final SerializableFunction fromRowFn; + + /** The expected coder of target type. 
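 * In the accompanying tests this is either a {@code RowCoder} of the schema (when the target
 * type is {@code Row}) or a {@code SchemaCoder} built from the target type's to/from-Row
 * functions.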
*/ + private final Coder coder; + + CsvIORecordToObjects(CsvIOParseConfiguration configuration) { + this.schema = configuration.getSchema(); + this.customProcessingMap = configuration.getCustomProcessingMap(); + this.indexToFieldMap = + CsvIOParseHelpers.mapFieldPositions(configuration.getCsvFormat(), schema); + this.fromRowFn = configuration.getFromRowFn(); + this.coder = configuration.getCoder(); + } + + @Override + public CsvIOParseResult expand(PCollection> input) { + PCollectionTuple pct = + input.apply( + RecordToObjectsFn.class.getSimpleName(), + ParDo.of(new RecordToObjectsFn()).withOutputTags(outputTag, TupleTagList.of(errorTag))); + + return CsvIOParseResult.of(outputTag, coder, errorTag, pct); + } + + private class RecordToObjectsFn extends DoFn, T> { + @ProcessElement + public void process(@Element List record, MultiOutputReceiver receiver) { + Map fieldNamesToValues = new HashMap<>(); + try { + for (Map.Entry entry : indexToFieldMap.entrySet()) { + Schema.Field field = entry.getValue(); + int index = entry.getKey(); + String cell = record.get(index); + Object value = parseCell(cell, field); + fieldNamesToValues.put(field.getName(), value); + } + Row row = Row.withSchema(schema).withFieldValues(fieldNamesToValues).build(); + receiver.get(outputTag).output(fromRowFn.apply(row)); + } catch (RuntimeException e) { + receiver + .get(errorTag) + .output( + CsvIOParseError.builder() + .setCsvRecord(record.toString()) + .setMessage(Optional.ofNullable(e.getMessage()).orElse("")) + .setStackTrace(Throwables.getStackTraceAsString(e)) + .setObservedTimestamp(Instant.now()) + .build()); + } + } + } + + /** Parses cell to emit the value, as well as potential errors with filename. */ + Object parseCell(String cell, Schema.Field field) { + if (cell == null) { + if (!field.getType().getNullable()) { + throw new IllegalArgumentException( + "Required org.apache.beam.sdk.schemas.Schema field " + + field.getName() + + " has null value"); + } + return cell; + } + if (customProcessingMap.containsKey(field.getName())) { + return customProcessingMap.get(field.getName()).apply(cell); + } + return CsvIOParseHelpers.parseCell(cell, field); + } +} diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java new file mode 100644 index 000000000000..7fe0f5090d67 --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.csv; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import org.apache.beam.sdk.coders.ListCoder; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Throwables; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.joda.time.Instant; + +/** + * {@link CsvIOStringToCsvRecord} is a class that takes a {@link PCollection} input and + * outputs a {@link PCollection} with potential {@link PCollection} for + * targeted error detection. + */ +final class CsvIOStringToCsvRecord + extends PTransform, CsvIOParseResult>> { + + private final CSVFormat csvFormat; + + private final TupleTag> outputTag = new TupleTag>() {}; + + private final TupleTag errorTag = new TupleTag() {}; + + CsvIOStringToCsvRecord(CSVFormat csvFormat) { + this.csvFormat = csvFormat; + } + + /** + * Creates {@link PCollection} from {@link PCollection} for future processing + * to Row or custom type. + */ + @Override + public CsvIOParseResult> expand(PCollection input) { + PCollectionTuple pct = + input.apply( + ProcessLineToRecordFn.class.getSimpleName(), + ParDo.of(new ProcessLineToRecordFn()) + .withOutputTags(outputTag, TupleTagList.of(errorTag))); + + return CsvIOParseResult.of( + outputTag, ListCoder.of(NullableCoder.of(StringUtf8Coder.of())), errorTag, pct); + } + + /** Processes each line in order to convert it to a {@link CSVRecord}. */ + private class ProcessLineToRecordFn extends DoFn> { + private final String headerLine = headerLine(csvFormat); + + @ProcessElement + public void process(@Element String line, MultiOutputReceiver receiver) { + if (headerLine.equals(line)) { + return; + } + try (CSVParser csvParser = CSVParser.parse(line, csvFormat)) { + for (CSVRecord record : csvParser.getRecords()) { + receiver.get(outputTag).output(csvRecordtoList(record)); + } + } catch (RuntimeException | IOException e) { + receiver + .get(errorTag) + .output( + CsvIOParseError.builder() + .setCsvRecord(line) + .setMessage(Optional.ofNullable(e.getMessage()).orElse("")) + .setObservedTimestamp(Instant.now()) + .setStackTrace(Throwables.getStackTraceAsString(e)) + .build()); + } + } + } + + /** Creates a {@link List} containing {@link CSVRecord} values. */ + private static List csvRecordtoList(CSVRecord record) { + List cells = new ArrayList<>(); + for (String cell : record) { + cells.add(cell); + } + return cells; + } + + /** Returns a formatted line of the CSVFormat header. 
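 * For example, a format with header {@code {"a_string", "an_integer", "a_double"}} and the
 * default comma delimiter yields {@code "a_string,an_integer,a_double"}.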
*/ + static String headerLine(CSVFormat csvFormat) { + return String.join(String.valueOf(csvFormat.getDelimiter()), csvFormat.getHeader()); + } +} diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpersTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpersTest.java index d6129055ae31..5276fa008c7c 100644 --- a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpersTest.java +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseHelpersTest.java @@ -21,10 +21,12 @@ import static org.junit.Assert.assertThrows; import java.math.BigDecimal; -import java.time.DateTimeException; -import java.time.Instant; +import java.util.Map; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.commons.collections.keyvalue.DefaultMapEntry; +import org.apache.commons.csv.CSVFormat; +import org.joda.time.Instant; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -33,6 +35,226 @@ @RunWith(JUnit4.class) public class CsvIOParseHelpersTest { + /** Tests for {@link CsvIOParseHelpers#validateCsvFormat(CSVFormat)}. */ + @Test + public void givenCSVFormatWithHeader_validates() { + CSVFormat format = csvFormatWithHeader(); + CsvIOParseHelpers.validateCsvFormat(format); + } + + @Test + public void givenCSVFormatWithNullHeader_throwsException() { + CSVFormat format = csvFormat(); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals("Illegal class org.apache.commons.csv.CSVFormat: header is required", gotMessage); + } + + @Test + public void givenCSVFormatWithEmptyHeader_throwsException() { + CSVFormat format = csvFormat().withHeader(); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: header cannot be empty", gotMessage); + } + + @Test + public void givenCSVFormatWithHeaderContainingEmptyString_throwsException() { + CSVFormat format = csvFormat().withHeader("", "bar"); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: column name is required", gotMessage); + } + + @Test + public void givenCSVFormatWithHeaderContainingNull_throwsException() { + CSVFormat format = csvFormat().withHeader(null, "bar"); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: column name is required", gotMessage); + } + + @Test + public void givenCSVFormatThatAllowsMissingColumnNames_throwsException() { + CSVFormat format = csvFormatWithHeader().withAllowMissingColumnNames(true); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: cannot allow missing column names", + gotMessage); + } + + @Test + public void givenCSVFormatThatIgnoresHeaderCase_throwsException() { + CSVFormat format = csvFormatWithHeader().withIgnoreHeaderCase(true); + String gotMessage = + assertThrows( + 
IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: cannot ignore header case", gotMessage); + } + + @Test + public void givenCSVFormatThatAllowsDuplicateHeaderNames_throwsException() { + CSVFormat format = csvFormatWithHeader().withAllowDuplicateHeaderNames(true); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: cannot allow duplicate header names", + gotMessage); + } + + @Test + public void givenCSVFormatThatSkipsHeaderRecord_throwsException() { + CSVFormat format = csvFormatWithHeader().withSkipHeaderRecord(true); + String gotMessage = + assertThrows( + IllegalArgumentException.class, () -> CsvIOParseHelpers.validateCsvFormat(format)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: cannot skip header record because the header is already accounted for", + gotMessage); + } + + /** End of tests for {@link CsvIOParseHelpers#validateCsvFormat(CSVFormat)}. */ + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** Tests for {@link CsvIOParseHelpers#validateCsvFormatWithSchema(CSVFormat, Schema)}. */ + @Test + public void givenNullableSchemaFieldNotPresentInHeader_validates() { + CSVFormat format = csvFormat().withHeader("foo", "bar"); + Schema schema = + Schema.of( + Schema.Field.of("foo", Schema.FieldType.STRING), + Schema.Field.of("bar", Schema.FieldType.STRING), + Schema.Field.nullable("baz", Schema.FieldType.STRING)); + CsvIOParseHelpers.validateCsvFormatWithSchema(format, schema); + } + + @Test + public void givenRequiredSchemaFieldNotPresentInHeader_throwsException() { + CSVFormat format = csvFormat().withHeader("foo", "bar"); + Schema schema = + Schema.of( + Schema.Field.of("foo", Schema.FieldType.STRING), + Schema.Field.of("bar", Schema.FieldType.STRING), + Schema.Field.of("baz", Schema.FieldType.STRING)); + String gotMessage = + assertThrows( + IllegalArgumentException.class, + () -> CsvIOParseHelpers.validateCsvFormatWithSchema(format, schema)) + .getMessage(); + assertEquals( + "Illegal class org.apache.commons.csv.CSVFormat: required org.apache.beam.sdk.schemas.Schema field 'baz' not found in header", + gotMessage); + } + + /** End of tests for {@link CsvIOParseHelpers#validateCsvFormatWithSchema(CSVFormat, Schema)}. */ + ////////////////////////////////////////////////////////////////////////////////////////////// + /** Tests for {@link CsvIOParseHelpers#mapFieldPositions(CSVFormat, Schema)}. 
*/ + @Test + public void testHeaderWithComments() { + String[] comments = {"first line", "second line", "third line"}; + Schema schema = + Schema.builder().addStringField("a_string").addStringField("another_string").build(); + ImmutableMap want = + ImmutableMap.of(0, schema.getField("a_string"), 1, schema.getField("another_string")); + Map got = + CsvIOParseHelpers.mapFieldPositions( + csvFormat() + .withHeader("a_string", "another_string") + .withHeaderComments((Object) comments), + schema); + assertEquals(want, got); + } + + @Test + public void givenMatchingHeaderAndSchemaField_mapsPositions() { + Schema schema = + Schema.builder() + .addStringField("a_string") + .addDoubleField("a_double") + .addInt32Field("an_integer") + .build(); + ImmutableMap want = + ImmutableMap.of( + 0, + schema.getField("a_string"), + 1, + schema.getField("an_integer"), + 2, + schema.getField("a_double")); + Map got = + CsvIOParseHelpers.mapFieldPositions( + csvFormat().withHeader("a_string", "an_integer", "a_double"), schema); + assertEquals(want, got); + } + + @Test + public void givenSchemaContainsNullableFieldTypes() { + Schema schema = + Schema.builder() + .addNullableStringField("a_string") + .addDoubleField("a_double") + .addInt32Field("an_integer") + .addDateTimeField("a_datetime") + .addNullableStringField("another_string") + .build(); + ImmutableMap want = + ImmutableMap.of( + 0, + schema.getField("an_integer"), + 1, + schema.getField("a_double"), + 2, + schema.getField("a_datetime")); + Map got = + CsvIOParseHelpers.mapFieldPositions( + csvFormat().withHeader("an_integer", "a_double", "a_datetime"), schema); + assertEquals(want, got); + } + + @Test + public void givenNonNullableHeaderAndSchemaFieldMismatch_throws() { + Schema schema = + Schema.builder() + .addStringField("another_string") + .addInt32Field("an_integer") + .addStringField("a_string") + .build(); + IllegalArgumentException e = + assertThrows( + IllegalArgumentException.class, + () -> + CsvIOParseHelpers.mapFieldPositions( + csvFormat().withHeader("an_integer", "a_string"), schema)); + assertEquals( + "header does not contain required class org.apache.beam.sdk.schemas.Schema field: " + + schema.getField("another_string").getName(), + e.getMessage()); + } + + /** End of tests for {@link CsvIOParseHelpers#mapFieldPositions(CSVFormat, Schema)} */ + + //////////////////////////////////////////////////////////////////////////////////////////// + + /** Tests for {@link CsvIOParseHelpers#parseCell(String, Schema.Field)}. 
*/ @Test public void ignoresCaseFormat() { String allCapsBool = "TRUE"; @@ -159,20 +381,20 @@ public void givenFloatWithSurroundingSpaces_parses() { } @Test - public void givenDatetimeWithSurroundingSpaces() throws DateTimeException { + public void givenDatetimeWithSurroundingSpaces() { Instant datetime = Instant.parse("1234-01-23T10:00:05.000Z"); DefaultMapEntry cellToExpectedValue = new DefaultMapEntry(" 1234-01-23T10:00:05.000Z ", datetime); Schema schema = Schema.builder().addDateTimeField("a_datetime").addStringField("a_string").build(); - DateTimeException e = + IllegalArgumentException e = assertThrows( - DateTimeException.class, + IllegalArgumentException.class, () -> CsvIOParseHelpers.parseCell( cellToExpectedValue.getKey().toString(), schema.getField("a_datetime"))); assertEquals( - "Text " + "' 1234-01-23T10:00:05.000Z '" + " could not be parsed at index 0", + "Invalid format: \" 1234-01-23T10:00:05.000Z \" field a_datetime was received -- type mismatch", e.getMessage()); } @@ -370,4 +592,17 @@ public void givenCellUnsupportedType_throws() { + ", consider using withCustomRecordParsing", e.getMessage()); } + + /** End of tests for {@link CsvIOParseHelpers#parseCell(String, Schema.Field)}. */ + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** Return a {@link CSVFormat} with a header and with no duplicate header names allowed. */ + private static CSVFormat csvFormatWithHeader() { + return csvFormat().withHeader("foo", "bar"); + } + + /** Return a {@link CSVFormat} with no header and with no duplicate header names allowed. */ + private static CSVFormat csvFormat() { + return CSVFormat.DEFAULT.withAllowDuplicateHeaderNames(false); + } } diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseKVTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseKVTest.java new file mode 100644 index 000000000000..c20a29174503 --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseKVTest.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import org.apache.beam.sdk.util.SerializableUtils; +import org.junit.Test; + +/** Contains tests for {@link CsvIOParseKV}. 
*/ +public class CsvIOParseKVTest { + @Test + public void isSerializable() { + SerializableUtils.ensureSerializable(CsvIOParseKV.class); + } +} diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java new file mode 100644 index 000000000000..05d6982004f4 --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypes; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesFromRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesToRowFn; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.RowCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NullableAllPrimitiveDataTypes; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.apache.commons.csv.CSVFormat; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CsvIOParseTest { + + private static final String[] HEADER = + new String[] {"aBoolean", "aDouble", "aFloat", "anInteger", "aLong", "aString"}; + private static final Coder + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER = + SchemaCoder.of( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR, + nullableAllPrimitiveDataTypesToRowFn(), + nullableAllPrimitiveDataTypesFromRowFn()); + private static final SerializableFunction ROW_ROW_SERIALIZABLE_FUNCTION = row -> row; + @Rule public final TestPipeline pipeline = 
TestPipeline.create(); + + @Test + public void isSerializable() throws Exception { + SerializableUtils.ensureSerializable(CsvIOParse.class); + } + + @Test + public void parseRows() { + PCollection records = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "🏵,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", null) + .withFieldValue("aDouble", 6.0) + .withFieldValue("aFloat", 7.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 9L) + .withFieldValue("aString", "bar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", false) + .withFieldValue("aDouble", 12.0) + .withFieldValue("aFloat", 14.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 24L) + .withFieldValue("aString", "foo\nbar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo,bar") + .build()); + + CsvIOParseResult result = + records.apply( + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + csvFormat(), + emptyCustomProcessingMap(), + ROW_ROW_SERIALIZABLE_FUNCTION, + RowCoder.of(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA))); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void parsePOJOs() { + PCollection records = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "🏵,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo"), + nullableAllPrimitiveDataTypes(null, 6.0d, 7.0f, 8, 9L, "bar"), + nullableAllPrimitiveDataTypes(false, 12.0d, 14.0f, 8, 24L, "foo\nbar"), + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo,bar")); + + CsvIOParseResult result = + records.apply( + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + csvFormat(), + emptyCustomProcessingMap(), + nullableAllPrimitiveDataTypesFromRowFn(), + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER)); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + private static CSVFormat csvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader(HEADER) + .withCommentMarker('#') + .withNullString("🏵") + .withEscape('$'); + } + + private static PCollection csvRecords(Pipeline pipeline, String... 
lines) { + return pipeline.apply( + Create.of(Arrays.asList(lines)).withCoder(NullableCoder.of(StringUtf8Coder.of()))); + } + + private static CsvIOParse underTest( + Schema schema, + CSVFormat csvFormat, + Map> customProcessingMap, + SerializableFunction fromRowFn, + Coder coder) { + CsvIOParseConfiguration.Builder configBuilder = + CsvIOParseConfiguration.builder() + .setSchema(schema) + .setCsvFormat(csvFormat) + .setCustomProcessingMap(customProcessingMap) + .setFromRowFn(fromRowFn) + .setCoder(coder); + return CsvIOParse.builder().setConfigBuilder(configBuilder).build(); + } + + private static Map> emptyCustomProcessingMap() { + return new HashMap<>(); + } +} diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIORecordToObjectsTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIORecordToObjectsTest.java new file mode 100644 index 000000000000..9ccb5d0c7bc2 --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIORecordToObjectsTest.java @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.csv; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.ALL_PRIMITIVE_DATA_TYPES_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.AllPrimitiveDataTypes; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NullableAllPrimitiveDataTypes; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.TIME_CONTAINING_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.TIME_CONTAINING_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.TimeContaining; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.allPrimitiveDataTypes; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.allPrimitiveDataTypesFromRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.allPrimitiveDataTypesToRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypes; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesFromRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesToRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.timeContaining; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.timeContainingFromRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.timeContainingToRowFn; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.ListCoder; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.RowCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.commons.csv.CSVFormat; +import org.joda.time.Instant; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link CsvIORecordToObjects}. 
*/ +@RunWith(JUnit4.class) +public class CsvIORecordToObjectsTest { + + @Rule public final TestPipeline pipeline = TestPipeline.create(); + private static final SerializableFunction ROW_ROW_SERIALIZABLE_FUNCTION = row -> row; + private static final RowCoder ALL_PRIMITIVE_DATA_TYPES_ROW_CODER = + RowCoder.of(ALL_PRIMITIVE_DATA_TYPES_SCHEMA); + private static final Coder NULLABLE_ALL_PRIMITIVE_DATA_TYPES_ROW_CODER = + NullableCoder.of(RowCoder.of(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA)); + private static final Coder + ALL_PRIMITIVE_DATA_TYPES_CODER = + SchemaCoder.of( + ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR, + allPrimitiveDataTypesToRowFn(), + allPrimitiveDataTypesFromRowFn()); + private static final Coder + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER = + SchemaCoder.of( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR, + nullableAllPrimitiveDataTypesToRowFn(), + nullableAllPrimitiveDataTypesFromRowFn()); + private static final Coder TIME_CONTAINING_ROW_CODER = RowCoder.of(TIME_CONTAINING_SCHEMA); + private static final Coder TIME_CONTAINING_POJO_CODER = + SchemaCoder.of( + TIME_CONTAINING_SCHEMA, + TIME_CONTAINING_TYPE_DESCRIPTOR, + timeContainingToRowFn(), + timeContainingFromRowFn()); + + @Test + public void isSerializable() { + SerializableUtils.ensureSerializable(CsvIORecordToObjects.class); + } + + @Test + public void parsesToRows() { + PCollection> input = + csvRecords(pipeline, "true", "1.0", "2.0", "3.0", "4", "5", "foo"); + Row want = + Row.withSchema(ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValues( + ImmutableMap.of( + "aBoolean", + true, + "aDecimal", + BigDecimal.valueOf(1.0), + "aDouble", + 2.0, + "aFloat", + 3.0f, + "anInteger", + 4, + "aLong", + 5L, + "aString", + "foo")) + .build(); + CsvIORecordToObjects underTest = + underTest( + ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + allPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + ROW_ROW_SERIALIZABLE_FUNCTION, + ALL_PRIMITIVE_DATA_TYPES_ROW_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void parsesToPojos() { + PCollection> input = + csvRecords(pipeline, "true", "1.0", "2.0", "3.0", "4", "5", "foo"); + SchemaAwareJavaBeans.AllPrimitiveDataTypes want = + allPrimitiveDataTypes(true, BigDecimal.valueOf(1.0), 2.0d, 3.0f, 4, 5L, "foo"); + CsvIORecordToObjects underTest = + underTest( + ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + allPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + allPrimitiveDataTypesFromRowFn(), + ALL_PRIMITIVE_DATA_TYPES_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenNullableField_containsNullCell_parsesToRows() { + PCollection> input = csvRecords(pipeline, "true", "1.0", "2.0", "3", "4", null); + Row want = + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", null) + .build(); + + CsvIORecordToObjects underTest = + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + nullableAllPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + ROW_ROW_SERIALIZABLE_FUNCTION, + 
NULLABLE_ALL_PRIMITIVE_DATA_TYPES_ROW_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenNullableField_containsNullCell_parsesToPojos() { + PCollection> input = csvRecords(pipeline, "true", "1.0", "2.0", "3", "4", null); + SchemaAwareJavaBeans.NullableAllPrimitiveDataTypes want = + nullableAllPrimitiveDataTypes(true, 1.0, 2.0f, 3, 4L, null); + + CsvIORecordToObjects underTest = + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + nullableAllPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + nullableAllPrimitiveDataTypesFromRowFn(), + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenNoNullableField_containsNullCell_throws() { + PCollection> input = + csvRecords(pipeline, "true", "1.0", "2.0", "3.0", "4", "5", null); + pipeline.apply( + "Null Cell with No Nullable Fields", + Create.of( + Collections.singletonList( + Arrays.asList("true", "1.0", "2.0", "3.0", "4", "5", null))) + .withCoder(ListCoder.of(NullableCoder.of(StringUtf8Coder.of())))); + CsvIORecordToObjects underTest = + underTest( + ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + allPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + allPrimitiveDataTypesFromRowFn(), + ALL_PRIMITIVE_DATA_TYPES_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).empty(); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + pipeline.run(); + } + + @Test + public void givenAllNullableFields_emptyRecord_parsesToRows() { + PCollection> input = emptyCsvRecords(pipeline); + CsvIORecordToObjects underTest = + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + nullableAllPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + ROW_ROW_SERIALIZABLE_FUNCTION, + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_ROW_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).empty(); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenAllNullableFields_emptyRecord_parsesToPojos() { + PCollection> input = emptyCsvRecords(pipeline); + CsvIORecordToObjects underTest = + underTest( + ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + allPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + allPrimitiveDataTypesFromRowFn(), + ALL_PRIMITIVE_DATA_TYPES_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).empty(); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenFieldHasCustomProcessing_parsesToRows() { + PCollection> input = + csvRecords( + pipeline, + "2024-07-25T11:25:14.000Z", + "2024-07-25T11:26:01.000Z,2024-07-25T11:26:22.000Z,2024-07-25T11:26:38.000Z"); + Row want = + Row.withSchema(TIME_CONTAINING_SCHEMA) + .withFieldValue("instant", Instant.parse("2024-07-25T11:25:14.000Z")) + .withFieldValue( + "instantList", + Arrays.asList( + Instant.parse("2024-07-25T11:26:01.000Z"), + Instant.parse("2024-07-25T11:26:22.000Z"), + Instant.parse("2024-07-25T11:26:38.000Z"))) + .build(); + CsvIORecordToObjects underTest = + underTest( + TIME_CONTAINING_SCHEMA, + timeContainingCsvFormat(), + timeContainingCustomProcessingMap(), + ROW_ROW_SERIALIZABLE_FUNCTION, + 
TIME_CONTAINING_ROW_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenFieldHasCustomProcessing_parsesToPojos() { + PCollection> input = + csvRecords( + pipeline, + "2024-07-25T11:25:14.000Z", + "2024-07-25T11:26:01.000Z,2024-07-25T11:26:22.000Z,2024-07-25T11:26:38.000Z"); + TimeContaining want = + timeContaining( + Instant.parse("2024-07-25T11:25:14.000Z"), + Arrays.asList( + Instant.parse("2024-07-25T11:26:01.000Z"), + Instant.parse("2024-07-25T11:26:22.000Z"), + Instant.parse("2024-07-25T11:26:38.000Z"))); + CsvIORecordToObjects underTest = + underTest( + TIME_CONTAINING_SCHEMA, + timeContainingCsvFormat(), + timeContainingCustomProcessingMap(), + timeContainingFromRowFn(), + TIME_CONTAINING_POJO_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + pipeline.run(); + } + + @Test + public void givenInvalidCell_throws() { + PCollection> input = + csvRecords(pipeline, "true", "invalid cell for Decimal", "2.0", "3.0", "4", "5", "foo"); + CsvIORecordToObjects underTest = + underTest( + ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + allPrimitiveDataTypesCsvFormat(), + emptyCustomProcessingMap(), + allPrimitiveDataTypesFromRowFn(), + ALL_PRIMITIVE_DATA_TYPES_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).empty(); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + + pipeline.run(); + } + + @Test + public void givenInvalidCustomProcessing_throws() { + PCollection> input = + csvRecords( + pipeline, + "2024-07-25T11:25:14.000Z", + "2024-15-25T11:26:01.000Z,2024-24-25T11:26:22.000Z,2024-96-25T11:26:38.000Z"); + CsvIORecordToObjects underTest = + underTest( + TIME_CONTAINING_SCHEMA, + timeContainingCsvFormat(), + timeContainingCustomProcessingMap(), + timeContainingFromRowFn(), + TIME_CONTAINING_POJO_CODER); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).empty(); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + + pipeline.run(); + } + + private static PCollection> csvRecords(Pipeline pipeline, String... 
cells) { + return pipeline.apply( + Create.of(Collections.singletonList(Arrays.asList(cells))) + .withCoder(ListCoder.of(NullableCoder.of(StringUtf8Coder.of())))); + } + + private static PCollection> emptyCsvRecords(Pipeline pipeline) { + return pipeline.apply(Create.empty(ListCoder.of(StringUtf8Coder.of()))); + } + + private static CsvIORecordToObjects underTest( + Schema schema, + CSVFormat csvFormat, + Map> customProcessingMap, + SerializableFunction fromRowFn, + Coder coder) { + CsvIOParseConfiguration configuration = + CsvIOParseConfiguration.builder() + .setSchema(schema) + .setCsvFormat(csvFormat) + .setCustomProcessingMap(customProcessingMap) + .setFromRowFn(fromRowFn) + .setCoder(coder) + .build(); + return new CsvIORecordToObjects<>(configuration); + } + + private static Map> emptyCustomProcessingMap() { + return new HashMap<>(); + } + + private static Map> + timeContainingCustomProcessingMap() { + Map> customProcessingMap = new HashMap<>(); + customProcessingMap.put( + "instantList", + input -> { + List cells = Arrays.asList(input.split(",")); + List output = new ArrayList<>(); + cells.forEach(cell -> output.add(Instant.parse(cell))); + return output; + }); + return customProcessingMap; + } + + private static CSVFormat allPrimitiveDataTypesCsvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader("aBoolean", "aDecimal", "aDouble", "aFloat", "anInteger", "aLong", "aString"); + } + + private static CSVFormat nullableAllPrimitiveDataTypesCsvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader("aBoolean", "aDouble", "aFloat", "anInteger", "aLong", "aString") + .withNullString("null"); + } + + private static CSVFormat timeContainingCsvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader("instant", "instantList"); + } +} diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java new file mode 100644 index 000000000000..1618962ef394 --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java @@ -0,0 +1,565 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.csv; + +import static org.apache.beam.sdk.io.csv.CsvIOStringToCsvRecord.headerLine; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.QuoteMode; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link CsvIOStringToCsvRecord}. */ +@RunWith(JUnit4.class) +public class CsvIOStringToCsvRecordTest { + @Rule public final TestPipeline pipeline = TestPipeline.create(); + + private static final String[] header = {"a_string", "an_integer", "a_double"}; + + @Test + public void givenCommentMarker_skipsLine() { + CSVFormat csvFormat = csvFormat().withCommentMarker('#'); + PCollection input = + pipeline.apply( + Create.of(headerLine(csvFormat), "#should skip me", "a,1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNoCommentMarker_doesntSkipLine() { + CSVFormat csvFormat = csvFormat(); + PCollection input = + pipeline.apply( + Create.of(headerLine(csvFormat), "#comment", "a,1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Collections.singletonList("#comment"), + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenCustomDelimiter_splitsCells() { + CSVFormat csvFormat = csvFormat().withDelimiter(';'); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a;1;1.1", "b;2;2.2", "c;3;3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenEscapeCharacter_includeInCell() { + CSVFormat csvFormat = csvFormat().withEscape('$'); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a$,b,1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a,b", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenHeaderComment_isNoop() { + CSVFormat csvFormat = csvFormat().withHeaderComments("abc", "def", "xyz"); + PCollection input = + 
pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenIgnoreEmptyLines_shouldSkip() { + CSVFormat csvFormat = csvFormat().withIgnoreEmptyLines(true); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1", "", "b,2,2.2", "", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNoIgnoreEmptyLines_isNoop() { + CSVFormat csvFormat = csvFormat().withIgnoreEmptyLines(false); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1", "", "b,2,2.2", "", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenIgnoreSurroundingSpaces_removesSpaces() { + CSVFormat csvFormat = csvFormat().withIgnoreSurroundingSpaces(true); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + " a ,1,1.1", + "b, 2 ,2.2", + "c,3, 3.3 ")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNotIgnoreSurroundingSpaces_keepsSpaces() { + CSVFormat csvFormat = csvFormat().withIgnoreSurroundingSpaces(false); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + " a ,1,1.1", + "b, 2 ,2.2", + "c,3, 3.3 ")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList(" a ", "1", "1.1"), + Arrays.asList("b", " 2 ", "2.2"), + Arrays.asList("c", "3", " 3.3 "))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNullString_parsesNullCells() { + CSVFormat csvFormat = csvFormat().withNullString("🐼"); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,🐼", "b,🐼,2.2", "🐼,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", null), + Arrays.asList("b", null, "2.2"), + Arrays.asList(null, "3", "3.3"))); + 
PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNoNullString_isNoop() { + CSVFormat csvFormat = csvFormat(); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,🐼", "b,🐼,2.2", "🐼,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "🐼"), + Arrays.asList("b", "🐼", "2.2"), + Arrays.asList("🐼", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenCustomQuoteCharacter_includesSpecialCharacters() { + CSVFormat csvFormat = csvFormat().withQuote(':'); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), ":a,:,1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a,", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenQuoteModeAll_isNoop() { + CSVFormat csvFormat = csvFormat().withQuoteMode(QuoteMode.ALL); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + "\"a\",\"1\",\"1.1\"", + "\"b\",\"2\",\"2.2\"", + "\"c\",\"3\",\"3.3\"")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenQuoteModeAllNonNull_isNoop() { + CSVFormat csvFormat = csvFormat().withNullString("N/A").withQuoteMode(QuoteMode.ALL_NON_NULL); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + "\"a\",\"1\",N/A", + "\"b\",\"2\",\"2.2\"", + "\"c\",\"3\",\"3.3\"")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", null), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenQuoteModeMinimal_isNoop() { + CSVFormat csvFormat = csvFormat().withQuoteMode(QuoteMode.MINIMAL); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "\"a,\",1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a,", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenQuoteModeNonNumeric_isNoop() { + CSVFormat csvFormat = csvFormat().withQuoteMode(QuoteMode.NON_NUMERIC); + PCollection input = + pipeline.apply( + Create.of(headerLine(csvFormat), "\"a\",1,1.1", "\"b\",2,2.2", "\"c\",3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + 
CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenQuoteModeNone_isNoop() { + CSVFormat csvFormat = csvFormat().withEscape('$').withQuoteMode(QuoteMode.NONE); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1", "b,2,2.2", "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenCustomRecordSeparator_isNoop() { + CSVFormat csvFormat = csvFormat().withRecordSeparator("😆"); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1😆b,2,2.2😆c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Collections.singletonList( + Arrays.asList("a", "1", "1.1😆b", "2", "2.2😆c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenSystemRecordSeparator_isNoop() { + CSVFormat csvFormat = csvFormat().withSystemRecordSeparator(); + String systemRecordSeparator = csvFormat.getRecordSeparator(); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + "a,1,1.1" + systemRecordSeparator + "b,2,2.2" + systemRecordSeparator + "c,3,3.3")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenTrailingDelimiter_skipsEndingDelimiter() { + CSVFormat csvFormat = csvFormat().withTrailingDelimiter(true); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1,", "b,2,2.2,", "c,3,3.3,")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNoTrailingDelimiter_includesEndingCell() { + CSVFormat csvFormat = csvFormat().withTrailingDelimiter(false); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,1,1.1,", "b,2,2.2,", "c,3,3.3,")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1", ""), + Arrays.asList("b", "2", "2.2", ""), + Arrays.asList("c", "3", "3.3", ""))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenTrim_removesSpaces() { + 
CSVFormat csvFormat = csvFormat().withTrim(true); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + " a ,1,1.1", + "b, 2 ,2.2", + "c,3, 3.3 ")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a", "1", "1.1"), + Arrays.asList("b", "2", "2.2"), + Arrays.asList("c", "3", "3.3"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenNoTrim_keepsSpaces() { + CSVFormat csvFormat = csvFormat().withTrim(false); + PCollection input = + pipeline.apply( + Create.of( + headerLine(csvFormat), + " a ,1,1.1", + "b, 2 ,2.2", + "c,3, 3.3 ")); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList(" a ", "1", "1.1"), + Arrays.asList("b", " 2 ", "2.2"), + Arrays.asList("c", "3", " 3.3 "))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void testSingleLineCsvRecord() { + String csvRecord = "a,1"; + PCollection input = pipeline.apply(Create.of(csvRecord)); + + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat()); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder(Collections.singletonList(Arrays.asList("a", "1"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void testMultiLineCsvRecord() { + String csvRecords = + "\"a\r\n1\",\"a\r\n2\"" + "\n" + "\"b\r\n1\",\"b\r\n2\"" + "\n" + "\"c\r\n1\",\"c\r\n2\""; + PCollection input = pipeline.apply(Create.of(csvRecords)); + + CsvIOStringToCsvRecord underTest = + new CsvIOStringToCsvRecord(csvFormat().withRecordSeparator('\n')); + CsvIOParseResult> result = input.apply(underTest); + PAssert.that(result.getOutput()) + .containsInAnyOrder( + Arrays.asList( + Arrays.asList("a\r\n1", "a\r\n2"), + Arrays.asList("b\r\n1", "b\r\n2"), + Arrays.asList("c\r\n1", "c\r\n2"))); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenInvalidCsvRecord_throws() { + CSVFormat csvFormat = csvFormat().withQuote('"'); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,\"1,1.1", "b,2,2.2", "c,3,3.3")); + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + pipeline.run(); + } + + private static CSVFormat csvFormat() { + return CSVFormat.DEFAULT.withAllowDuplicateHeaderNames(false).withHeader(header); + } +} diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java new file mode 100644 index 000000000000..13e09725e952 --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypes; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.junit.Assert.assertThrows; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NullableAllPrimitiveDataTypes; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.apache.commons.csv.CSVFormat; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CsvIOTest { + private static final String[] HEADER = + new String[] {"aBoolean", "aDouble", "aFloat", "anInteger", "aLong", "aString"}; + + @Test + public void parseRows() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "N/A,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", null) + .withFieldValue("aDouble", 6.0) + .withFieldValue("aFloat", 7.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 9L) + .withFieldValue("aString", "bar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", false) + .withFieldValue("aDouble", 12.0) + .withFieldValue("aFloat", 14.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 24L) + .withFieldValue("aString", "foo\nbar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo,bar") + .build()); + + CsvIOParse underTest = + CsvIO.parseRows(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, csvFormat()); + CsvIOParseResult 
result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void parsesPOJOs() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "N/A,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo"), + nullableAllPrimitiveDataTypes(null, 6.0d, 7.0f, 8, 9L, "bar"), + nullableAllPrimitiveDataTypes(false, 12.0d, 14.0f, 8, 24L, "foo\nbar"), + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo,bar")); + + CsvIOParse underTest = + CsvIO.parse(NullableAllPrimitiveDataTypes.class, csvFormat()); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenInvalidCsvFormat_throws() { + Pipeline pipeline = Pipeline.create(); + CSVFormat csvFormat = + CSVFormat.DEFAULT + .withHeader("a_string", "an_integer", "a_double") + .withAllowDuplicateHeaderNames(true); + Schema schema = + Schema.builder() + .addStringField("a_string") + .addInt32Field("an_integer") + .addDoubleField("a_double") + .build(); + assertThrows(IllegalArgumentException.class, () -> CsvIO.parseRows(schema, csvFormat)); + pipeline.run(); + } + + @Test + public void givenMismatchedCsvFormatAndSchema_throws() { + Pipeline pipeline = Pipeline.create(); + CSVFormat csvFormat = + CSVFormat.DEFAULT + .withHeader("a_string", "an_integer", "a_double") + .withAllowDuplicateHeaderNames(true); + Schema schema = Schema.builder().addStringField("a_string").addDoubleField("a_double").build(); + assertThrows(IllegalArgumentException.class, () -> CsvIO.parseRows(schema, csvFormat)); + pipeline.run(); + } + + @Test + public void givenNullSchema_throws() { + Pipeline pipeline = Pipeline.create(); + assertThrows(NullPointerException.class, () -> CsvIO.parseRows(null, csvFormat())); + pipeline.run(); + } + + @Test + public void givenNonSchemaMappedClass_throws() { + Pipeline pipeline = Pipeline.create(); + CSVFormat csvFormat = + CSVFormat.DEFAULT + .withHeader("a_string", "an_integer", "a_double") + .withAllowDuplicateHeaderNames(false); + assertThrows( + IllegalStateException.class, () -> CsvIO.parse(NonSchemaMappedPojo.class, csvFormat)); + pipeline.run(); + } + + @Test + public void givenStringToRecordError_emits() { + Pipeline pipeline = Pipeline.create(); + PCollection input = pipeline.apply(Create.of("true,\"1.1,3.141592,1,5,foo")); + Schema schema = + Schema.builder() + .addBooleanField("aBoolean") + .addDoubleField("aDouble") + .addFloatField("aFloat") + .addInt32Field("anInteger") + .addInt64Field("aLong") + .addStringField("aString") + .build(); + CsvIOParse underTest = CsvIO.parseRows(schema, csvFormat().withQuote('"')); + CsvIOParseResult result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply("Total Errors", Count.globally())).isEqualTo(1L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIOStringToCsvRecord.class.getName())) + .isEqualTo(1L); + + pipeline.run(); + } + + @Test + public void givenRecordToObjectError_emits() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + pipeline.apply(Create.of("true,1.1,3.141592,this_is_an_error,5,foo")); + 
Schema schema = + Schema.builder() + .addBooleanField("aBoolean") + .addDoubleField("aDouble") + .addFloatField("aFloat") + .addInt32Field("anInteger") + .addInt64Field("aLong") + .addStringField("aString") + .build(); + CsvIOParse underTest = CsvIO.parseRows(schema, csvFormat().withQuote('"')); + CsvIOParseResult result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIORecordToObjects.class.getName())) + .isEqualTo(1L); + pipeline.run(); + } + + @Test + public void givenStringToRecordError_RecordToObjectError_emits() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + pipeline.apply( + Create.of("true,\"1.1,3.141592,1,5,foo", "true,1.1,3.141592,this_is_an_error,5,foo")); + Schema schema = + Schema.builder() + .addBooleanField("aBoolean") + .addDoubleField("aDouble") + .addFloatField("aFloat") + .addInt32Field("anInteger") + .addInt64Field("aLong") + .addStringField("aString") + .build(); + CsvIOParse underTest = CsvIO.parseRows(schema, csvFormat().withQuote('"')); + CsvIOParseResult result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(2L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIOStringToCsvRecord.class.getName())) + .isEqualTo(1L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIORecordToObjects.class.getName())) + .isEqualTo(1L); + + pipeline.run(); + } + + private static PCollection stackTraceContains( + PCollection errors, String match) { + return errors + .apply(match, Filter.by(input -> checkStateNotNull(input).getStackTrace().contains(match))) + .apply(match, Count.globally()); + } + + private static CSVFormat csvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader(HEADER) + .withCommentMarker('#') + .withNullString("N/A") + .withEscape('$'); + } + + private static PCollection csvRecords(Pipeline pipeline, String... 
lines) { + return pipeline.apply( + Create.of(Arrays.asList(lines)).withCoder(NullableCoder.of(StringUtf8Coder.of()))); + } + + private static class NonSchemaMappedPojo implements Serializable { + private final String aString; + private final Integer anInteger; + private final Double aDouble; + + private NonSchemaMappedPojo(String aString, Integer anInteger, Double aDouble) { + this.aString = aString; + this.anInteger = anInteger; + this.aDouble = aDouble; + } + + public String getAString() { + return aString; + } + + public Integer getAnInteger() { + return anInteger; + } + + public Double getADouble() { + return aDouble; + } + } +} diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index e499bae6fc64..23c56f13a94c 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -218,6 +218,10 @@ task integrationTest(type: Test, dependsOn: processTestResources) { useJUnit { excludeCategories "org.apache.beam.sdk.testing.UsesKms" + filter { + // https://github.com/apache/beam/issues/32071 + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableReadIT.testE2EBigtableSegmentRead' + } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java index 5a12e81ea79d..7505f77fb5b4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java @@ -40,8 +40,8 @@ */ @AutoValue abstract class AppendClientInfo { - private final Counter activeConnections = - Metrics.counter(AppendClientInfo.class, "activeConnections"); + private final Counter activeStreamAppendClients = + Metrics.counter(AppendClientInfo.class, "activeStreamAppendClients"); abstract @Nullable BigQueryServices.StreamAppendClient getStreamAppendClient(); @@ -123,7 +123,7 @@ public AppendClientInfo withAppendClient( writeStreamService.getStreamAppendClient( streamName, getDescriptor(), useConnectionPool, missingValueInterpretation); - activeConnections.inc(); + activeStreamAppendClients.inc(); return toBuilder().setStreamName(streamName).setStreamAppendClient(client).build(); } @@ -133,7 +133,7 @@ public void close() { BigQueryServices.StreamAppendClient client = getStreamAppendClient(); if (client != null) { getCloseAppendClient().accept(client); - activeConnections.dec(); + activeStreamAppendClients.dec(); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java index cd1fc6d3842c..ba76f483f774 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java @@ -109,6 +109,28 @@ public interface BigQueryOptions void setNumStorageWriteApiStreamAppendClients(Integer value); + @Description( + "When using the STORAGE_API_AT_LEAST_ONCE write method with multiplexing (ie. useStorageApiConnectionPool=true), " + + "this option sets the minimum number of connections each pool creates before any connections are shared. 
This is " + "on a per worker, per region basis. Note that in practice, the minimum number of connections created is the minimum " + "of this value and (numStorageWriteApiStreamAppendClients x num destinations). BigQuery will create this many " + "connections at first and will only create more connections if the current ones are \"overwhelmed\". Consider " + "increasing this value if you are running into performance issues.") + @Default.Integer(2) + Integer getMinConnectionPoolConnections(); + + void setMinConnectionPoolConnections(Integer value); + + @Description( + "When using the STORAGE_API_AT_LEAST_ONCE write method with multiplexing (ie. useStorageApiConnectionPool=true), " + + "this option sets the maximum number of connections each pool creates. This is on a per worker, per region basis. " + + "If writing to many dynamic destinations (>20) and experiencing performance issues or seeing append operations competing " + + "for streams, consider increasing this value.") + @Default.Integer(20) + Integer getMaxConnectionPoolConnections(); + + void setMaxConnectionPoolConnections(Integer value); + + @Description("The max number of messages inflight that we expect each connection will retain.") + @Default.Long(1000) Long getStorageWriteMaxInflightRequests(); @@ -122,6 +144,11 @@ public interface BigQueryOptions void setStorageWriteMaxInflightBytes(Long value); + @Description( + "Enables multiplexing mode, where multiple tables can share the same connection. Only available when writing with STORAGE_API_AT_LEAST_ONCE" + + " mode. This is recommended if your write operation is creating 20+ connections. When using multiplexing, consider tuning " + + "the number of connections created by the connection pool with minConnectionPoolConnections and maxConnectionPoolConnections. " + + "For more information, see https://cloud.google.com/bigquery/docs/write-api-best-practices#connection_pool_management") @Default.Boolean(false) Boolean getUseStorageApiConnectionPool(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java index 2bdba0b053c8..b87b6a222a4d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java @@ -69,6 +69,7 @@ import com.google.cloud.bigquery.storage.v1.BigQueryReadSettings; import com.google.cloud.bigquery.storage.v1.BigQueryWriteClient; import com.google.cloud.bigquery.storage.v1.BigQueryWriteSettings; +import com.google.cloud.bigquery.storage.v1.ConnectionWorkerPool; import com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest; import com.google.cloud.bigquery.storage.v1.CreateWriteStreamRequest; import com.google.cloud.bigquery.storage.v1.FinalizeWriteStreamRequest; @@ -574,7 +575,7 @@ public static class DatasetServiceImpl implements DatasetService { private final long maxRowBatchSize; // aggregate the total time spent in exponential backoff private final Counter throttlingMsecs = - Metrics.counter(DatasetServiceImpl.class, "throttling-msecs"); + Metrics.counter(DatasetServiceImpl.class, Metrics.THROTTLE_TIME_COUNTER_NAME); private @Nullable BoundedExecutorService executor; @@ -1423,6 +1424,14 @@ public StreamAppendClient getStreamAppendClient( bqIOMetadata.getBeamJobId() == null ?
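// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): how a pipeline author might enable multiplexing
// and tune the new connection pool options described in BigQueryOptions above. The concrete
// values (4 and 32) and the example class name are assumptions for illustration only.
// ---------------------------------------------------------------------------------------------
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

class ConnectionPoolTuningExample {
  public static void main(String[] args) {
    BigQueryOptions options = PipelineOptionsFactory.fromArgs(args).as(BigQueryOptions.class);
    // The min/max pool settings only take effect when multiplexing is enabled.
    options.setUseStorageApiConnectionPool(true);
    // Per worker, per region: start each pool with 4 connections and cap it at 32.
    options.setMinConnectionPoolConnections(4);
    options.setMaxConnectionPoolConnections(32);
    Pipeline pipeline = Pipeline.create(options);
    // ... apply a BigQueryIO.write() using Method.STORAGE_API_AT_LEAST_ONCE here ...
    pipeline.run();
  }
}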
"" : bqIOMetadata.getBeamJobId(), bqIOMetadata.getBeamWorkerId() == null ? "" : bqIOMetadata.getBeamWorkerId()); + ConnectionWorkerPool.setOptions( + ConnectionWorkerPool.Settings.builder() + .setMinConnectionsPerRegion( + options.as(BigQueryOptions.class).getMinConnectionPoolConnections()) + .setMaxConnectionsPerRegion( + options.as(BigQueryOptions.class).getMaxConnectionPoolConnections()) + .build()); + StreamWriter streamWriter = StreamWriter.newBuilder(streamName, newWriteClient) .setExecutorProvider( @@ -1654,7 +1663,7 @@ public void cancel() { static class StorageClientImpl implements StorageClient { public static final Counter THROTTLING_MSECS = - Metrics.counter(StorageClientImpl.class, "throttling-msecs"); + Metrics.counter(StorageClientImpl.class, Metrics.THROTTLE_TIME_COUNTER_NAME); private transient long unreportedDelay = 0L; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java index 38c0c8e43b24..998c82ab8d83 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySourceBase.java @@ -157,9 +157,6 @@ public List> split(long desiredBundleSizeBytes, PipelineOptions LOG.info("Extract job produced {} files", res.extractedFiles.size()); if (res.extractedFiles.size() > 0) { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); - // emit this table ID as a lineage source - Lineage.getSources() - .add(BigQueryHelpers.dataCatalogName(getTableToExtract(bqOptions), bqOptions)); final String extractDestinationDir = resolveTempLocation(bqOptions.getTempLocation(), "BigQueryExtractTemp", stepUuid); // Match all files in the destination directory to stat them in bulk. diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java index 21c1d961e84c..8a902ec6d264 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java @@ -32,6 +32,8 @@ import com.google.cloud.bigquery.storage.v1.WriteStream.Type; import com.google.protobuf.ByteString; import com.google.protobuf.DescriptorProtos; +import com.google.protobuf.Descriptors.Descriptor; +import com.google.protobuf.Descriptors.DescriptorValidationException; import com.google.protobuf.DynamicMessage; import io.grpc.Status; import java.io.IOException; @@ -771,7 +773,7 @@ long flush( invalidateWriteStream(); allowedRetry = 5; } else { - allowedRetry = 10; + allowedRetry = 35; } // Maximum number of times we retry before we fail the work item. 
@@ -834,21 +836,28 @@ long flush( c, BigQuerySinkMetrics.RpcMethod.APPEND_ROWS, shortTableUrn); if (successfulRowsReceiver != null) { - for (int i = 0; i < c.protoRows.getSerializedRowsCount(); ++i) { - ByteString rowBytes = c.protoRows.getSerializedRowsList().get(i); - try { - TableRow row = - TableRowToStorageApiProto.tableRowFromMessage( - DynamicMessage.parseFrom( - TableRowToStorageApiProto.wrapDescriptorProto( - Preconditions.checkStateNotNull(appendClientInfo) - .getDescriptor()), - rowBytes), - true); - org.joda.time.Instant timestamp = c.timestamps.get(i); - successfulRowsReceiver.outputWithTimestamp(row, timestamp); - } catch (Exception e) { - LOG.warn("Failure parsing TableRow", e); + Descriptor descriptor = null; + try { + descriptor = + TableRowToStorageApiProto.wrapDescriptorProto( + Preconditions.checkStateNotNull(appendClientInfo).getDescriptor()); + } catch (DescriptorValidationException e) { + LOG.warn( + "Failure getting proto descriptor. Successful output will not be produced.", + e); + } + if (descriptor != null) { + for (int i = 0; i < c.protoRows.getSerializedRowsCount(); ++i) { + ByteString rowBytes = c.protoRows.getSerializedRowsList().get(i); + try { + TableRow row = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom(descriptor, rowBytes), true); + org.joda.time.Instant timestamp = c.timestamps.get(i); + successfulRowsReceiver.outputWithTimestamp(row, timestamp); + } catch (Exception e) { + LOG.warn("Failure parsing TableRow", e); + } } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java index c1f452ba93f9..fbc17fb59704 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java @@ -1099,7 +1099,7 @@ public static TableRow tableRowFromMessage(Message message, boolean includeCdcCo FieldDescriptor fieldDescriptor = field.getKey(); Object fieldValue = field.getValue(); if (includeCdcColumns || !StorageApiCDC.COLUMNS.contains(fieldDescriptor.getName())) { - tableRow.putIfAbsent( + tableRow.put( fieldDescriptor.getName(), jsonValueFromMessageValue(fieldDescriptor, fieldValue, true)); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java index d25ad7d4871d..6d20109e947b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java @@ -21,13 +21,16 @@ import static org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; import static org.apache.beam.sdk.transforms.errorhandling.BadRecordRouter.BAD_RECORD_TAG; import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects.firstNonNull; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import 
com.google.api.gax.batching.BatchingException; import com.google.api.gax.rpc.ApiException; +import com.google.api.gax.rpc.DeadlineExceededException; import com.google.api.gax.rpc.InvalidArgumentException; import com.google.api.gax.rpc.NotFoundException; +import com.google.api.gax.rpc.ResourceExhaustedException; import com.google.auto.value.AutoValue; import com.google.bigtable.v2.MutateRowResponse; import com.google.bigtable.v2.Mutation; @@ -38,6 +41,7 @@ import com.google.cloud.bigtable.data.v2.models.ChangeStreamRecord; import com.google.cloud.bigtable.data.v2.models.KeyOffset; import com.google.protobuf.ByteString; +import io.grpc.StatusRuntimeException; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -69,6 +73,8 @@ import org.apache.beam.sdk.io.range.ByteKey; import org.apache.beam.sdk.io.range.ByteKeyRange; import org.apache.beam.sdk.io.range.ByteKeyRangeTracker; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.ExperimentalOptions; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.ValueProvider; @@ -82,6 +88,7 @@ import org.apache.beam.sdk.transforms.errorhandling.BadRecordRouter; import org.apache.beam.sdk.transforms.errorhandling.ErrorHandler; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.util.StringUtils; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; @@ -1109,12 +1116,51 @@ public Write withMaxOutstandingBytes(long bytes) { * always enabled on batch writes and limits the number of outstanding requests to the Bigtable * server. * + *

When enabled, will also set default {@link #withThrottlingReportTargetMs} to 1 minute. + * This lets the runner react to increased latency in flush calls caused by flow control. + * *

Does not modify this object. */ public Write withFlowControl(boolean enableFlowControl) { + BigtableWriteOptions options = getBigtableWriteOptions(); + BigtableWriteOptions.Builder builder = options.toBuilder().setFlowControl(enableFlowControl); + if (enableFlowControl) { + builder = builder.setThrottlingReportTargetMs(60_000); + } + return toBuilder().setBigtableWriteOptions(builder.build()).build(); + } + + /** + * Returns a new {@link BigtableIO.Write} with client side latency based throttling enabled. + * + *

Will also set {@link #withThrottlingReportTargetMs} to the same value. + */ + public Write withThrottlingTargetMs(int throttlingTargetMs) { + BigtableWriteOptions options = getBigtableWriteOptions(); + return toBuilder() + .setBigtableWriteOptions( + options + .toBuilder() + .setThrottlingTargetMs(throttlingTargetMs) + .setThrottlingReportTargetMs(throttlingTargetMs) + .build()) + .build(); + } + + /** + * Returns a new {@link BigtableIO.Write} with throttling time reporting enabled. When write + * request latency exceeds the set value, the amount above the target is counted + * as throttling time and reported back to the runner. + + *

If not set, defaults to 3 minutes per completed batch request. Client side flow control + * configurations (e.g. {@link #withFlowControl}, {@link #withThrottlingTargetMs}) will adjust + * the default value accordingly. Set to 0 to disable throttling time reporting. */ + public Write withThrottlingReportTargetMs(int throttlingReportTargetMs) { BigtableWriteOptions options = getBigtableWriteOptions(); return toBuilder() - .setBigtableWriteOptions(options.toBuilder().setFlowControl(enableFlowControl).build()) + .setBigtableWriteOptions( + options.toBuilder().setThrottlingReportTargetMs(throttlingReportTargetMs).build()) .build(); } @@ -1283,7 +1329,15 @@ private static class BigtableWriterFn private final Coder>> inputCoder; private final BadRecordRouter badRecordRouter; + private final Counter throttlingMsecs = + Metrics.counter(Metrics.THROTTLE_TIME_NAMESPACE, Metrics.THROTTLE_TIME_COUNTER_NAME); + + private final int throttleReportThresMsecs; + private transient Set> badRecords = null; + // The callback thread does not support Beam metrics, so record pending metrics and report them later. + private transient long pendingThrottlingMsecs; + private transient boolean reportedLineage; // Assign serviceEntry in startBundle and clear it in tearDown. @Nullable private BigtableServiceEntry serviceEntry; @@ -1301,6 +1355,8 @@ private static class BigtableWriterFn this.badRecordRouter = badRecordRouter; this.failures = new ConcurrentLinkedQueue<>(); this.id = factory.newId(); + // A request that takes longer than this is considered throttled. Disabled if set to 0. + throttleReportThresMsecs = firstNonNull(writeOptions.getThrottlingReportTargetMs(), 180_000); LOG.debug("Created Bigtable Write Fn with writeOptions {} ", writeOptions); } @@ -1322,20 +1378,52 @@ public void startBundle(StartBundleContext c) throws IOException { public void processElement(ProcessContext c, BoundedWindow window) throws Exception { checkForFailures(); KV> record = c.element(); - bigtableWriter.writeRecord(record).whenComplete(handleMutationException(record, window)); + Instant writeStart = Instant.now(); + pendingThrottlingMsecs = 0; + bigtableWriter + .writeRecord(record) + .whenComplete(handleMutationException(record, window, writeStart)); + if (pendingThrottlingMsecs > 0) { + throttlingMsecs.inc(pendingThrottlingMsecs); + } ++recordsWritten; seenWindows.compute(window, (key, count) -> (count != null ? count : 0) + 1); } private BiConsumer handleMutationException( - KV> record, BoundedWindow window) { + KV> record, BoundedWindow window, Instant writeStart) { return (MutateRowResponse result, Throwable exception) -> { if (exception != null) { if (isDataException(exception)) { retryIndividualRecord(record, window); } else { + // Exception due to resource unavailable or rate limited, + // including DEADLINE_EXCEEDED and RESOURCE_EXHAUSTED.
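// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): combining the new throttling knobs documented on
// BigtableIO.Write above. The project/instance/table names are placeholders and the 30s report
// target is an assumption for the example; withThrottlingTargetMs is shown as an alternative.
// ---------------------------------------------------------------------------------------------
static BigtableIO.Write exampleThrottledWrite() {
  return BigtableIO.write()
      .withProjectId("my-project") // placeholder
      .withInstanceId("my-instance") // placeholder
      .withTableId("my-table") // placeholder
      // Batch flow control; this also defaults the throttling report target to 1 minute.
      .withFlowControl(true)
      // Optionally override the report target (milliseconds); 0 disables reporting.
      .withThrottlingReportTargetMs(30_000);
  // Alternatively, latency-based client-side throttling also sets the report target:
  //   BigtableIO.write().withThrottlingTargetMs(5_000);
}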
+ boolean isResourceException = false; + if (exception instanceof StatusRuntimeException) { + StatusRuntimeException se = (StatusRuntimeException) exception; + if (io.grpc.Status.DEADLINE_EXCEEDED.equals(se.getStatus()) + || io.grpc.Status.RESOURCE_EXHAUSTED.equals(se.getStatus())) { + isResourceException = true; + } + } else if (exception instanceof DeadlineExceededException + || exception instanceof ResourceExhaustedException) { + isResourceException = true; + } + if (isResourceException) { + pendingThrottlingMsecs = new Duration(writeStart, Instant.now()).getMillis(); + } failures.add(new BigtableWriteException(record, exception)); } + } else { + // add the excessive amount to throttling metrics if elapsed time > target latency + if (throttleReportThresMsecs > 0) { + long excessTime = + new Duration(writeStart, Instant.now()).getMillis() - throttleReportThresMsecs; + if (excessTime > 0) { + pendingThrottlingMsecs = excessTime; + } + } } }; } @@ -1371,8 +1459,8 @@ private static boolean isDataException(Throwable e) { @FinishBundle public void finishBundle(FinishBundleContext c) throws Exception { try { - if (bigtableWriter != null) { + Instant closeStart = Instant.now(); try { bigtableWriter.close(); } catch (IOException e) { @@ -1381,9 +1469,22 @@ public void finishBundle(FinishBundleContext c) throws Exception { // to the error queue. Bigtable will successfully write other failures in the batch, // so this exception should be ignored if (!(e.getCause() instanceof BatchingException)) { + throttlingMsecs.inc(new Duration(closeStart, Instant.now()).getMillis()); throw e; } } + // add the excessive amount to throttling metrics if elapsed time > target latency + if (throttleReportThresMsecs > 0) { + long excessTime = + new Duration(closeStart, Instant.now()).getMillis() - throttleReportThresMsecs; + if (excessTime > 0) { + throttlingMsecs.inc(excessTime); + } + } + if (!reportedLineage) { + bigtableWriter.reportLineage(); + reportedLineage = true; + } bigtableWriter = null; } @@ -1516,6 +1617,7 @@ public String toString() { private final BigtableConfig config; private final BigtableReadOptions readOptions; private @Nullable Long estimatedSizeBytes; + private transient boolean reportedLineage; private final BigtableServiceFactory.ConfigId configId; @@ -1893,6 +1995,13 @@ public List getRanges() { public ValueProvider getTableId() { return readOptions.getTableId(); } + + void reportLineageOnce(BigtableService.Reader reader) { + if (!reportedLineage) { + reader.reportLineage(); + reportedLineage = true; + } + } } private static class BigtableReader extends BoundedReader { @@ -1923,6 +2032,7 @@ true, makeByteKey(reader.getCurrentRow().getKey()))) || rangeTracker.markDone(); if (hasRecord) { ++recordsReturned; + source.reportLineageOnce(reader); } return hasRecord; } @@ -2015,7 +2125,7 @@ public BigtableWriteException(KV> record, Throwab super( String.format( "Error mutating row %s with mutations %s", - record.getKey().toStringUtf8(), record.getValue()), + record.getKey().toStringUtf8(), StringUtils.leftTruncate(record.getValue(), 100)), cause); this.record = record; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java index 261cc3ac081d..50d8126999c4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java +++ 
b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java @@ -57,6 +57,9 @@ CompletionStage writeRecord(KV * @throws IOException if there is an error closing the writer */ void close() throws IOException; + + /** Report Lineage metrics to runner. */ + default void reportLineage() {} } /** The interface of a class that reads from Cloud Bigtable. */ @@ -77,6 +80,9 @@ interface Reader { Row getCurrentRow() throws NoSuchElementException; void close(); + + /** Report Lineage metrics to runner. */ + default void reportLineage() {} } /** Returns a {@link Reader} that will read from the specified source. */ diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java index 06e0108259d5..6fdf67722bac 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java @@ -24,6 +24,7 @@ import com.google.api.gax.batching.BatchingException; import com.google.api.gax.grpc.GrpcCallContext; import com.google.api.gax.rpc.ApiException; +import com.google.api.gax.rpc.DeadlineExceededException; import com.google.api.gax.rpc.ResponseObserver; import com.google.api.gax.rpc.ServerStream; import com.google.api.gax.rpc.StreamController; @@ -70,6 +71,7 @@ import org.apache.beam.sdk.io.gcp.bigtable.BigtableIO.BigtableSource; import org.apache.beam.sdk.io.range.ByteKeyRange; import org.apache.beam.sdk.metrics.Distribution; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -211,6 +213,11 @@ public void close() { exhausted = true; } } + + @Override + public void reportLineage() { + Lineage.getSources().add(String.format("bigtable:%s.%s.%s", projectId, instanceId, tableId)); + } } @VisibleForTesting @@ -224,6 +231,9 @@ static class BigtableSegmentReaderImpl implements Reader { private final int refillSegmentWaterMark; private final long maxSegmentByteSize; private ServiceCallMetric serviceCallMetric; + private final String projectId; + private final String instanceId; + private final String tableId; private static class UpstreamResults { private final List rows; @@ -307,11 +317,19 @@ static BigtableSegmentReaderImpl create( // Asynchronously refill buffer when there is 10% of the elements are left this.refillSegmentWaterMark = Math.max(1, (int) (request.getRowsLimit() * WATERMARK_PERCENTAGE)); + this.projectId = projectId; + this.instanceId = instanceId; + this.tableId = tableId; } @Override public void close() {} + @Override + public void reportLineage() { + Lineage.getSources().add(String.format("bigtable:%s.%s.%s", projectId, instanceId, tableId)); + } + @Override public boolean start() throws IOException { future = fetchNextSegment(); @@ -435,13 +453,18 @@ private ReadRowsRequest truncateRequest(ReadRowsRequest request, ByteString last int startCmp = StartPoint.extract(rowRange).compareTo(new StartPoint(lastKey, true)); int endCmp = EndPoint.extract(rowRange).compareTo(new EndPoint(lastKey, true)); + if (endCmp <= 0) { + // range end is on or left of the split: skip + continue; + } + + RowRange.Builder newRange = rowRange.toBuilder(); if (startCmp > 0) { 
// If the startKey is passed the split point than add the whole range - segment.addRowRanges(rowRange); - } else if (endCmp > 0) { + segment.addRowRanges(newRange.build()); + } else { // Row is split, remove all read rowKeys and split RowSet at last buffered Row - RowRange subRange = rowRange.toBuilder().setStartKeyOpen(lastKey).build(); - segment.addRowRanges(subRange); + segment.addRowRanges(newRange.setStartKeyOpen(lastKey).build()); } } if (segment.getRowRangesCount() == 0) { @@ -572,6 +595,11 @@ public void writeSingleRecord(KV> record) throws } } + @Override + public void reportLineage() { + Lineage.getSinks().add(String.format("bigtable:%s.%s.%s", projectId, instanceId, tableId)); + } + private ServiceCallMetric createServiceCallMetric() { // Populate metrics HashMap baseLabels = new HashMap<>(); @@ -611,6 +639,9 @@ public void onFailure(Throwable throwable) { if (throwable instanceof StatusRuntimeException) { serviceCallMetric.call( ((StatusRuntimeException) throwable).getStatus().getCode().value()); + } else if (throwable instanceof DeadlineExceededException) { + // incoming throwable can be a StatusRuntimeException or a specific grpc ApiException + serviceCallMetric.call(504); } else { serviceCallMetric.call("unknown"); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java index a63cc575809b..5963eb6be3ce 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteOptions.java @@ -57,6 +57,9 @@ abstract class BigtableWriteOptions implements Serializable { /** Returns the target latency if latency based throttling is enabled. */ abstract @Nullable Integer getThrottlingTargetMs(); + /** Returns the target latency if latency based throttling report to runner is enabled. */ + abstract @Nullable Integer getThrottlingReportTargetMs(); + /** Returns true if batch write flow control is enabled. Otherwise return false. 
*/ abstract @Nullable Boolean getFlowControl(); @@ -88,6 +91,8 @@ abstract static class Builder { abstract Builder setThrottlingTargetMs(int targetMs); + abstract Builder setThrottlingReportTargetMs(int targetMs); + abstract Builder setFlowControl(boolean enableFlowControl); abstract Builder setCloseWaitTimeout(Duration timeout); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java index 86cd7a3439aa..1563b0b059f2 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java @@ -1711,7 +1711,7 @@ static class DatastoreWriterFn extends DoFn { private WriteBatcher writeBatcher; private transient AdaptiveThrottler adaptiveThrottler; private final Counter throttlingMsecs = - Metrics.counter(DatastoreWriterFn.class, "throttling-msecs"); + Metrics.counter(DatastoreWriterFn.class, Metrics.THROTTLE_TIME_COUNTER_NAME); private final Counter rpcErrors = Metrics.counter(DatastoreWriterFn.class, "datastoreRpcErrors"); private final Counter rpcSuccesses = diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFn.java index db098c0a5166..ae94d4b612d0 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFn.java @@ -53,7 +53,8 @@ public class RampupThrottlingFn extends DoFn implements Serializable { private final PCollectionView firstInstantSideInput; @VisibleForTesting - Counter throttlingMsecs = Metrics.counter(RampupThrottlingFn.class, "throttling-msecs"); + Counter throttlingMsecs = + Metrics.counter(RampupThrottlingFn.class, Metrics.THROTTLE_TIME_COUNTER_NAME); // Initialized on every setup. private transient MovingFunction successfulOps; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java index 79a9bb7f07d6..f66ee6e1d842 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java @@ -39,12 +39,15 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.checkerframework.checker.nullness.qual.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** An (abstract) helper class for talking to Pubsub via an underlying transport. 
*/ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) public abstract class PubsubClient implements Closeable { + private static final Logger LOG = LoggerFactory.getLogger(PubsubClient.class); private static final Map> schemaTypeToConversionFnMap = ImmutableMap.of( @@ -257,6 +260,10 @@ public String getFullPath() { return String.format("/subscriptions/%s/%s", projectId, subscriptionName); } + public String getDataCatalogName() { + return String.format("pubsub:subscription:%s.%s", projectId, subscriptionName); + } + @Override public boolean equals(@Nullable Object o) { if (this == o) { @@ -293,6 +300,7 @@ public static SubscriptionPath subscriptionPathFromName( /** Path representing a Pubsub topic. */ public static class TopicPath implements Serializable { + // Format: "projects//topics/" private final String path; TopicPath(String path) { @@ -310,6 +318,26 @@ public String getName() { return splits.get(3); } + /** + * Returns the data catalog name. Format "pubsub:topic:`project`.`topic`" This method is + * fail-safe. If topic path is malformed, it returns an empty string. + */ + public String getDataCatalogName() { + List splits = Splitter.on('/').splitToList(path); + if (splits.size() == 4) { + // well-formed path + return String.format("pubsub:topic:%s.%s", splits.get(1), splits.get(3)); + } else { + // Mal-formed path. It is either a test fixture or user error and will fail on publish. + // We do not throw exception instead return empty string here. + LOG.warn( + "Cannot get data catalog name for malformed topic path {}. Expected format: " + + "projects//topics/", + path); + return ""; + } + } + public String getFullPath() { List splits = Splitter.on('/').splitToList(path); checkState(splits.size() == 4, "Malformed topic path %s", path); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 01848d92d928..0fd4e9207d81 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -49,6 +49,7 @@ import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; @@ -512,6 +513,10 @@ public String asPath() { } } + public String dataCatalogName() { + return String.format("pubsub:topic:%s.%s", project, topic); + } + @Override public String toString() { return asPath(); @@ -1488,7 +1493,7 @@ public PDone expand(PCollection input) { .get(BAD_RECORD_TAG) .setCoder(BadRecord.getCoder(input.getPipeline()))); PCollection pubsubMessages = - pubsubMessageTuple.get(pubsubMessageTupleTag).setCoder(new PubsubMessageWithTopicCoder()); + pubsubMessageTuple.get(pubsubMessageTupleTag).setCoder(PubsubMessageWithTopicCoder.of()); switch (input.isBounded()) { case BOUNDED: pubsubMessages.apply( @@ -1617,6 +1622,10 @@ public void finishBundle() throws IOException { for (Map.Entry entry : output.entrySet()) { publish(entry.getKey(), entry.getValue().messages); } + // Report lineage for all 
topics seen + for (PubsubTopic topic : output.keySet()) { + Lineage.getSinks().add(topic.dataCatalogName()); + } output = null; pubsubClient.close(); pubsubClient = null; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java index d10b9a2f1066..768aebe54e65 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java @@ -45,8 +45,8 @@ public static Coder of(TypeDescriptor ignored) { return of(); } - public static PubsubMessageWithAttributesAndMessageIdCoder of() { - return new PubsubMessageWithAttributesAndMessageIdCoder(); + public static PubsubMessageWithTopicCoder of() { + return new PubsubMessageWithTopicCoder(); } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index aa8e3a411486..defea87e835a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -41,6 +41,7 @@ import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.PubsubClientFactory; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.metrics.SinkMetrics; import org.apache.beam.sdk.options.ValueProvider; @@ -69,6 +70,7 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; @@ -231,6 +233,9 @@ private static class WriterFn extends DoFn, Void> { /** Client on which to talk to Pubsub. Null until created by {@link #startBundle}. */ private transient @Nullable PubsubClient pubsubClient; + /** Last TopicPath that reported Lineage. 
*/ + private transient @Nullable TopicPath reportedLineage; + private final Counter batchCounter = Metrics.counter(WriterFn.class, "batches"); private final Counter elementCounter = SinkMetrics.elementsWritten(); private final Counter byteCounter = SinkMetrics.bytesWritten(); @@ -290,6 +295,14 @@ private void publishBatch(List messages, int bytes) throws IOEx batchCounter.inc(); elementCounter.inc(messages.size()); byteCounter.inc(bytes); + // Avoid reporting Lineage multiple times for the same topic + if (!topicPath.equals(reportedLineage)) { + String name = topicPath.getDataCatalogName(); + if (!Strings.isNullOrEmpty(name)) { + Lineage.getSinks().add(topicPath.getDataCatalogName()); + } + reportedLineage = topicPath; + } } @StartBundle diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java index b9a554d54ade..b131b521c067 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java @@ -56,6 +56,7 @@ import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessages.DeserializeBytesIntoPubsubMessagePayloadOnly; import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.metrics.SourceMetrics; import org.apache.beam.sdk.options.ExperimentalOptions; import org.apache.beam.sdk.options.PipelineOptions; @@ -1041,6 +1042,19 @@ public List split(int desiredNumSplits, PipelineOptions options) splitSource = new PubsubSource( outer, StaticValueProvider.of(outer.createRandomSubscription(options))); + TopicPath topic = outer.getTopic(); + if (topic != null) { + // this is the initial split on Read.fromTopic, so report Lineage based on the topic + Lineage.getSources().add(topic.getDataCatalogName()); + } + } else { + if (subscriptionPath.equals(outer.getSubscriptionProvider())) { + SubscriptionPath sub = subscriptionPath.get(); + if (sub != null) { + // this is a split on Read.fromSubscription, so report Lineage based on the subscription + Lineage.getSources().add(sub.getDataCatalogName()); + } + } } for (int i = 0; i < desiredNumSplits * SCALE_OUT; i++) { // Since the source is immutable and Pubsub automatically shards we simply diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessor.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessor.java index dc3844504218..943efc9883b6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessor.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessor.java @@ -30,6 +30,7 @@ import com.google.cloud.spanner.DatabaseAdminClient; import com.google.cloud.spanner.DatabaseClient; import com.google.cloud.spanner.DatabaseId; +import com.google.cloud.spanner.SessionPoolOptions; import com.google.cloud.spanner.Spanner; import com.google.cloud.spanner.SpannerOptions; import com.google.cloud.spanner.v1.stub.SpannerStubSettings; @@ -229,7 +230,9 @@ static SpannerOptions buildSpannerOptions(SpannerConfig spannerConfig) { if (credentials != null && credentials.get() != null) { builder.setCredentials(credentials.get()); } - + SessionPoolOptions sessionPoolOptions = 
SessionPoolOptions.newBuilder().setFailIfPoolExhausted().build(); + builder.setSessionPoolOption(sessionPoolOptions); return builder.build(); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java index 3fd09c63da79..fa44cadeba0a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java @@ -184,6 +184,9 @@ private static Type parseSpannerType(String spannerType, Dialect dialect) { if (spannerType.startsWith("BYTES")) { return Type.bytes(); } + if ("TOKENLIST".equals(spannerType)) { + return Type.bytes(); + } if ("TIMESTAMP".equals(spannerType)) { return Type.timestamp(); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java index 5c43666e79e5..a8aca7570b33 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java @@ -43,6 +43,7 @@ import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.concurrent.ExecutionException; import org.apache.avro.specific.SpecificDatumReader; import org.apache.avro.specific.SpecificRecordBase; @@ -61,9 +62,6 @@ import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; import org.apache.beam.sdk.metrics.Lineage; -import org.apache.beam.sdk.metrics.MetricNameFilter; -import org.apache.beam.sdk.metrics.MetricQueryResults; -import org.apache.beam.sdk.metrics.MetricsFilter; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; @@ -351,18 +349,8 @@ private void checkTypedReadQueryObjectWithValidate( } private void checkLineageSourceMetric(PipelineResult pipelineResult, String tableName) { - MetricQueryResults lineageMetrics = - pipelineResult - .metrics() - .queryMetrics( - MetricsFilter.builder() - .addNameFilter( - MetricNameFilter.named( - Lineage.LINEAGE_NAMESPACE, Lineage.SOURCE_METRIC_NAME)) - .build()); - assertThat( - lineageMetrics.getStringSets().iterator().next().getCommitted().getStringSet(), - contains("bigquery:" + tableName.replace(':', '.'))); + Set result = Lineage.query(pipelineResult.metrics(), Lineage.Type.SOURCE); + assertThat(result, contains("bigquery:" + tableName.replace(':', '.'))); } @Before @@ -600,10 +588,7 @@ public void processElement(ProcessContext c) throws Exception { new MyData("b", 2L, bd1, bd2), new MyData("c", 3L, bd1, bd2))); PipelineResult result = p.run(); - // Skip when direct runner splits outside of a counters context. 
- if (useTemplateCompatibility) { - checkLineageSourceMetric(result, "non-executing-project:somedataset.sometable"); - } + checkLineageSourceMetric(result, "non-executing-project:somedataset.sometable"); } @Test diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java index bc90d4c8bae7..c5af8045bfe2 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java @@ -118,9 +118,6 @@ import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; import org.apache.beam.sdk.metrics.Lineage; -import org.apache.beam.sdk.metrics.MetricNameFilter; -import org.apache.beam.sdk.metrics.MetricQueryResults; -import org.apache.beam.sdk.metrics.MetricsFilter; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.schemas.JavaFieldSchema; @@ -285,16 +282,8 @@ public void evaluate() throws Throwable { .withJobService(fakeJobService); private void checkLineageSinkMetric(PipelineResult pipelineResult, String tableName) { - MetricQueryResults lineageMetrics = - pipelineResult - .metrics() - .queryMetrics( - MetricsFilter.builder() - .addNameFilter( - MetricNameFilter.named(Lineage.LINEAGE_NAMESPACE, Lineage.SINK_METRIC_NAME)) - .build()); assertThat( - lineageMetrics.getStringSets().iterator().next().getCommitted().getStringSet(), + Lineage.query(pipelineResult.metrics(), Lineage.Type.SINK), hasItem("bigquery:" + tableName.replace(':', '.'))); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySinkMetricsTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySinkMetricsTest.java index 50660326275c..8695a445c118 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySinkMetricsTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySinkMetricsTest.java @@ -37,6 +37,7 @@ import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Histogram; import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.metrics.MetricsEnvironment; import org.apache.beam.sdk.util.HistogramData; import org.apache.beam.sdk.values.KV; @@ -178,7 +179,8 @@ public void testThrottledTimeCounter() throws Exception { testContainer.assertPerWorkerCounterValue(counterName, 1L); counterName = - MetricName.named(BigQueryServicesImpl.StorageClientImpl.class, "throttling-msecs"); + MetricName.named( + BigQueryServicesImpl.StorageClientImpl.class, Metrics.THROTTLE_TIME_COUNTER_NAME); assertEquals(1L, (long) testContainer.getCounter(counterName).getCumulative()); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java index dd6a55ff4378..e5049b037010 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java +++ 
b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java @@ -436,6 +436,21 @@ public void testWriteValidationFailsMissingOptionsAndInstanceAndProject() { write.expand(null); } + @Test + public void testWriteClientRateLimitingAlsoSetReportMsecs() { + // client side flow control + BigtableIO.Write write = BigtableIO.write().withTableId("table").withFlowControl(true); + assertEquals( + 60_000, (int) checkNotNull(write.getBigtableWriteOptions().getThrottlingReportTargetMs())); + + // client side latency based throttling + int targetMs = 30_000; + write = BigtableIO.write().withTableId("table").withThrottlingTargetMs(targetMs); + assertEquals( + targetMs, + (int) checkNotNull(write.getBigtableWriteOptions().getThrottlingReportTargetMs())); + } + /** Helper function to make a single row mutation to be written. */ private static KV> makeWrite(String key, String value) { ByteString rowKey = ByteString.copyFromUtf8(key); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java index bc88858ebc33..4ce9ad10b2c0 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java @@ -17,6 +17,9 @@ */ package org.apache.beam.sdk.io.gcp.bigtable; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; + import com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient; import com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; import com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; @@ -28,7 +31,9 @@ import java.util.Date; import org.apache.beam.repackaged.core.org.apache.commons.lang3.StringUtils; import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -110,7 +115,8 @@ public void testE2EBigtableRead() { p.apply(BigtableIO.read().withBigtableOptions(bigtableOptionsBuilder).withTableId(tableId)) .apply(Count.globally()); PAssert.thatSingleton(count).isEqualTo(numRows); - p.run(); + PipelineResult r = p.run(); + checkLineageSourceMetric(r, tableId); } @Test @@ -138,6 +144,17 @@ public void testE2EBigtableSegmentRead() { .withMaxBufferElementCount(10)) .apply(Count.globally()); PAssert.thatSingleton(count).isEqualTo(numRows); - p.run(); + PipelineResult r = p.run(); + checkLineageSourceMetric(r, tableId); + } + + private void checkLineageSourceMetric(PipelineResult r, String tableId) { + // TODO(https://github.com/apache/beam/issues/32071) test malformed, + // when pipeline.run() is non-blocking, the metrics are not available by the time of query + if (options.getRunner().getName().contains("DirectRunner")) { + assertThat( + Lineage.query(r.metrics(), Lineage.Type.SOURCE), + hasItem(String.format("bigtable:%s.%s.%s", project, options.getInstanceId(), tableId))); + } } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java 
b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java index 98f33ffeb9d7..a46d47324b93 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImplTest.java @@ -244,6 +244,68 @@ public Void answer(InvocationOnMock invocation) throws Throwable { Mockito.verify(mockCallMetric, Mockito.times(2)).call("ok"); } + /** + * This test ensures that protobuf creation and interactions with {@link BigtableDataClient} work + * as expected. This test checks that a single row is returned from the future. + * + * @throws IOException + */ + @Test + public void testReadSingleRangeAtSegmentLimit() throws Exception { + RowSet.Builder ranges = RowSet.newBuilder(); + ranges.addRowRanges( + generateRowRange( + generateByteString(DEFAULT_PREFIX, SEGMENT_SIZE), + generateByteString(DEFAULT_PREFIX, SEGMENT_SIZE - 1))); + + // Set up Callable to be returned by stub.createReadRowsCallable() + ServerStreamingCallable mockCallable = Mockito.mock(ServerStreamingCallable.class); + List> expectedResults = + ImmutableList.of( + generateSegmentResult(DEFAULT_PREFIX, 0, SEGMENT_SIZE), ImmutableList.of()); + + // Return multiple answers when mockCallable is called + doAnswer( + new MultipleAnswer( + ImmutableList.of( + generateSegmentResult(DEFAULT_PREFIX, 0, SEGMENT_SIZE), + generateSegmentResult(DEFAULT_PREFIX, SEGMENT_SIZE, SEGMENT_SIZE * 2), + ImmutableList.of()))) + .when(mockCallable) + .call(any(Query.class), any(ResponseObserver.class), any(ApiCallContext.class)); + + when(mockStub.createReadRowsCallable(any(RowAdapter.class))).thenReturn(mockCallable); + ServerStreamingCallable callable = + mockStub.createReadRowsCallable(new BigtableServiceImpl.BigtableRowProtoAdapter()); + // Set up client to return callable + when(mockBigtableDataClient.readRowsCallable(any(RowAdapter.class))).thenReturn(callable); + when(mockBigtableSource.getTableId()).thenReturn(StaticValueProvider.of(TABLE_ID)); + + BigtableService.Reader underTest = + new BigtableServiceImpl.BigtableSegmentReaderImpl( + mockBigtableDataClient, + bigtableDataSettings.getProjectId(), + bigtableDataSettings.getInstanceId(), + mockBigtableSource.getTableId().get(), + ranges.build(), + RowFilter.getDefaultInstance(), + SEGMENT_SIZE, + DEFAULT_BYTE_SEGMENT_SIZE, + mockCallMetric); + + List actualResults = new ArrayList<>(); + Assert.assertTrue(underTest.start()); + do { + actualResults.add(underTest.getCurrentRow()); + } while (underTest.advance()); + + Assert.assertEquals( + expectedResults.stream().flatMap(Collection::stream).collect(Collectors.toList()), + actualResults); + + Mockito.verify(mockCallMetric, Mockito.times(2)).call("ok"); + } + /** * This test ensures that all the rows are properly added to the buffer and read. This example * uses a single range with SEGMENT_SIZE*2+1 rows. Range: [b00000, b00001, ... 
b00199, b00200) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java index bf9f7d991fa2..46bb3df836e5 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigtable; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; import static org.junit.Assert.assertEquals; import com.google.api.gax.rpc.ServerStream; @@ -39,8 +40,10 @@ import java.util.Objects; import java.util.stream.Collectors; import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.testing.PAssert; @@ -142,7 +145,7 @@ public void processElement(ProcessContext c) { .withProjectId(project) .withInstanceId(options.getInstanceId()) .withTableId(tableId)); - p.run(); + PipelineResult r = p.run(); // Test number of column families and column family name equality Table table = getTable(tableId); @@ -154,6 +157,7 @@ public void processElement(ProcessContext c) { // Test table data equality List> tableData = getTableData(tableId); assertThat(tableData, Matchers.containsInAnyOrder(testData.toArray())); + checkLineageSinkMetric(r, tableId); } @Test @@ -340,7 +344,7 @@ public void failureTest(int numRows, DoFn> tableData = getTableData(tableId); assertEquals(998, tableData.size()); + checkLineageSinkMetric(r, tableId); } @After @@ -412,4 +417,13 @@ private void deleteTable(String tableId) { tableAdminClient.deleteTable(tableId); } } + + private void checkLineageSinkMetric(PipelineResult r, String tableId) { + // Only check lineage metrics on direct runner until Dataflow runner v2 supported report back + if (options.getRunner().getName().contains("DirectRunner")) { + assertThat( + Lineage.query(r.metrics(), Lineage.Type.SINK), + hasItem(String.format("bigtable:%s.%s.%s", project, options.getInstanceId(), tableId))); + } + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java index 895ed35bfb12..fb007d1171db 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java @@ -171,6 +171,7 @@ public void subscriptionPathFromNameWellFormed() { SubscriptionPath path = PubsubClient.subscriptionPathFromName("test", "something"); assertEquals("projects/test/subscriptions/something", path.getPath()); assertEquals("/subscriptions/test/something", path.getFullPath()); + assertEquals("pubsub:subscription:test.something", path.getDataCatalogName()); } @Test @@ -178,6 +179,7 @@ public void topicPathFromNameWellFormed() { TopicPath path = PubsubClient.topicPathFromName("test", "something"); assertEquals("projects/test/topics/something", path.getPath()); 
assertEquals("/topics/test/something", path.getFullPath()); + assertEquals("pubsub:topic:test.something", path.getDataCatalogName()); } @Test diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java index fe6338a501c4..74a98f0b8b43 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java @@ -237,6 +237,9 @@ public void testValueProviderTopic() { assertThat(pubsubRead.getTopicProvider(), not(nullValue())); assertThat(pubsubRead.getTopicProvider().isAccessible(), is(true)); assertThat(pubsubRead.getTopicProvider().get().asPath(), equalTo(provider.get())); + assertThat( + pubsubRead.getTopicProvider().get().dataCatalogName(), + equalTo("pubsub:topic:project.topic")); } @Test @@ -732,7 +735,7 @@ public void testWriteMalformedMessagesWithErrorHandler() throws Exception { PCollection messages = pipeline.apply( Create.timestamped(ImmutableList.of(pubsubMsg, failingPubsubMsg)) - .withCoder(new PubsubMessageWithTopicCoder())); + .withCoder(PubsubMessageWithTopicCoder.of())); messages.setIsBoundedInternal(PCollection.IsBounded.BOUNDED); ErrorHandler> badRecordErrorHandler = pipeline.registerBadRecordErrorHandler(new ErrorSinkTransform()); @@ -882,7 +885,7 @@ public void testDynamicTopics(boolean isBounded) throws IOException { PCollection messages = pipeline.apply( - Create.timestamped(pubsubMessages).withCoder(new PubsubMessageWithTopicCoder())); + Create.timestamped(pubsubMessages).withCoder(PubsubMessageWithTopicCoder.of())); if (!isBounded) { messages = messages.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED); } @@ -919,7 +922,7 @@ public void testBigMessageBounded() throws IOException { PCollection messages = pipeline.apply( Create.timestamped(ImmutableList.of(pubsubMsg)) - .withCoder(new PubsubMessageWithTopicCoder())); + .withCoder(PubsubMessageWithTopicCoder.of())); messages.setIsBoundedInternal(PCollection.IsBounded.BOUNDED); messages.apply(PubsubIO.writeMessagesDynamic().withClientFactory(factory)); pipeline.run(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java index dd5a9abd5ac8..98aade888a33 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java @@ -25,6 +25,7 @@ import com.google.protobuf.Timestamp; import java.io.IOException; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -47,7 +48,6 @@ import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.Rule; @@ -170,7 +170,7 @@ public void 
testReadRaw() throws IOException { PCollectionRowTuple begin = PCollectionRowTuple.empty(p); Schema rawSchema = Schema.of(Schema.Field.of("payload", Schema.FieldType.BYTES)); - byte[] payload = "some payload".getBytes(Charsets.UTF_8); + byte[] payload = "some payload".getBytes(StandardCharsets.UTF_8); try (PubsubTestClientFactory clientFactory = clientFactory(ImmutableList.of(incomingMessageOf(payload, CLOCK.currentTimeMillis())))) { @@ -211,7 +211,7 @@ public void testReadAttributes() throws IOException { .addStringField("attr") .addMapField("attrMap", Schema.FieldType.STRING, Schema.FieldType.STRING) .build(); - byte[] payload = "some payload".getBytes(Charsets.UTF_8); + byte[] payload = "some payload".getBytes(StandardCharsets.UTF_8); String attr = "attr value"; try (PubsubTestClientFactory clientFactory = diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java index c9b6bae45b98..be68083bb28c 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java @@ -223,7 +223,7 @@ public void testDynamicTopics() throws IOException { Instant.ofEpochMilli(o.getTimestampMsSinceEpoch()))) .collect(Collectors.toList()); - p.apply(Create.timestamped(pubsubMessages).withCoder(new PubsubMessageWithTopicCoder())) + p.apply(Create.timestamped(pubsubMessages).withCoder(PubsubMessageWithTopicCoder.of())) .apply(sink); p.run(); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessorTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessorTest.java index b80fba31d3a2..70105f820536 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessorTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerAccessorTest.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.spanner; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -163,5 +164,6 @@ public void testBuildSpannerOptionsWithCredential() { assertEquals("project", options.getProjectId()); assertEquals("test-role", options.getDatabaseRole()); assertEquals(testCredential, options.getCredentials()); + assertNotNull(options.getSessionPoolOptions()); } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java index 166df1704ca8..1e89326d1e8c 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java @@ -40,16 +40,18 @@ public void testSingleTable() throws Exception { .addColumn("test", "jsonVal", "JSON") .addColumn("test", "protoVal", "PROTO") .addColumn("test", "enumVal", "ENUM") + .addColumn("test", "tokens", "TOKENLIST") .build(); assertEquals(1, schema.getTables().size()); - 
assertEquals(6, schema.getColumns("test").size()); + assertEquals(7, schema.getColumns("test").size()); assertEquals(1, schema.getKeyParts("test").size()); assertEquals(Type.json(), schema.getColumns("test").get(3).getType()); assertEquals( Type.proto("customer.app.TestMessage"), schema.getColumns("test").get(4).getType()); assertEquals( Type.protoEnum("customer.app.TestEnum"), schema.getColumns("test").get(5).getType()); + assertEquals(Type.bytes(), schema.getColumns("test").get(6).getType()); } @Test diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java index 50e0ea8b63d1..c3c1da7c7885 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java @@ -137,7 +137,7 @@ public PCollection expand(PBegin input) { .setCatalogConfig(getCatalogConfig()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(tableId) - .setSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(table.schema())) + .setSchema(IcebergUtils.icebergSchemaToBeamSchema(table.schema())) .build()))); } } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java new file mode 100644 index 000000000000..acd9b25a6a5e --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java @@ -0,0 +1,381 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.util.Preconditions; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; + +/** Utilities for converting between Beam and Iceberg types. */ +public class IcebergUtils { + // This is made public for users convenience, as many may have more experience working with + // Iceberg types. 
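+ // A minimal usage sketch (illustrative only; the schema and field names below are hypothetical):
+ //   Schema beamSchema = Schema.builder().addInt64Field("id").addStringField("name").build();
+ //   org.apache.iceberg.Schema icebergSchema = IcebergUtils.beamSchemaToIcebergSchema(beamSchema);
+ //   Record record =
+ //       IcebergUtils.beamRowToIcebergRecord(
+ //           icebergSchema, Row.withSchema(beamSchema).addValues(1L, "a").build());
+ //   Row roundTripped = IcebergUtils.icebergRecordToBeamRow(beamSchema, record);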
+ + private IcebergUtils() {} + + private static final Map BEAM_TYPES_TO_ICEBERG_TYPES = + ImmutableMap.builder() + .put(Schema.TypeName.BOOLEAN, Types.BooleanType.get()) + .put(Schema.TypeName.INT32, Types.IntegerType.get()) + .put(Schema.TypeName.INT64, Types.LongType.get()) + .put(Schema.TypeName.FLOAT, Types.FloatType.get()) + .put(Schema.TypeName.DOUBLE, Types.DoubleType.get()) + .put(Schema.TypeName.STRING, Types.StringType.get()) + .put(Schema.TypeName.BYTES, Types.BinaryType.get()) + .build(); + + private static Schema.FieldType icebergTypeToBeamFieldType(final Type type) { + switch (type.typeId()) { + case BOOLEAN: + return Schema.FieldType.BOOLEAN; + case INTEGER: + return Schema.FieldType.INT32; + case LONG: + return Schema.FieldType.INT64; + case FLOAT: + return Schema.FieldType.FLOAT; + case DOUBLE: + return Schema.FieldType.DOUBLE; + case DATE: + case TIME: + case TIMESTAMP: // TODO: Logical types? + return Schema.FieldType.DATETIME; + case STRING: + return Schema.FieldType.STRING; + case UUID: + case BINARY: + return Schema.FieldType.BYTES; + case FIXED: + case DECIMAL: + return Schema.FieldType.DECIMAL; + case STRUCT: + return Schema.FieldType.row(icebergStructTypeToBeamSchema(type.asStructType())); + case LIST: + return Schema.FieldType.iterable( + icebergTypeToBeamFieldType(type.asListType().elementType())); + case MAP: + return Schema.FieldType.map( + icebergTypeToBeamFieldType(type.asMapType().keyType()), + icebergTypeToBeamFieldType(type.asMapType().valueType())); + } + throw new RuntimeException("Unrecognized IcebergIO Type"); + } + + private static Schema.Field icebergFieldToBeamField(final Types.NestedField field) { + return Schema.Field.of(field.name(), icebergTypeToBeamFieldType(field.type())) + .withNullable(field.isOptional()); + } + + /** Converts an Iceberg {@link org.apache.iceberg.Schema} to a Beam {@link Schema}. */ + public static Schema icebergSchemaToBeamSchema(final org.apache.iceberg.Schema schema) { + Schema.Builder builder = Schema.builder(); + for (Types.NestedField f : schema.columns()) { + builder.addField(icebergFieldToBeamField(f)); + } + return builder.build(); + } + + private static Schema icebergStructTypeToBeamSchema(final Types.StructType struct) { + Schema.Builder builder = Schema.builder(); + for (Types.NestedField f : struct.fields()) { + builder.addField(icebergFieldToBeamField(f)); + } + return builder.build(); + } + + /** + * Represents a {@link Type} and the most recent field ID used to build it. + * + *

Iceberg Schema fields are required to have unique IDs. This includes unique IDs for a {@link + * org.apache.iceberg.types.Type.NestedType}'s components (e.g. {@link Types.ListType}'s + * collection type, {@link Types.MapType}'s key type and value type, and {@link + * Types.StructType}'s nested fields). The {@code maxId} in this object represents the most recent + * ID used after building this type. This helps signal that the next {@link + * org.apache.iceberg.types.Type.NestedType} we construct should have an ID greater than this one. + */ + @VisibleForTesting + static class TypeAndMaxId { + int maxId; + Type type; + + TypeAndMaxId(int id, Type object) { + this.maxId = id; + this.type = object; + } + } + + /** + * Takes a Beam {@link Schema.FieldType} and an index intended as a starting point for Iceberg + * {@link org.apache.iceberg.types.Type.NestedType}s. Returns an Iceberg {@link Type} and the + * maximum index after building that type. + * + *

Returns this information in an {@link TypeAndMaxId} object. + */ + @VisibleForTesting + static TypeAndMaxId beamFieldTypeToIcebergFieldType( + Schema.FieldType beamType, int nestedFieldId) { + if (BEAM_TYPES_TO_ICEBERG_TYPES.containsKey(beamType.getTypeName())) { + // we don't use nested field ID for primitive types. decrement it so the caller can use it for + // other types. + return new TypeAndMaxId( + --nestedFieldId, BEAM_TYPES_TO_ICEBERG_TYPES.get(beamType.getTypeName())); + } else if (beamType.getTypeName().isCollectionType()) { // ARRAY or ITERABLE + Schema.FieldType beamCollectionType = + Preconditions.checkArgumentNotNull(beamType.getCollectionElementType()); + + // nestedFieldId is reserved for the list's collection type. + // we increment here because further nested fields should use unique ID's + TypeAndMaxId listInfo = + beamFieldTypeToIcebergFieldType(beamCollectionType, nestedFieldId + 1); + Type icebergCollectionType = listInfo.type; + + boolean elementTypeIsNullable = + Preconditions.checkArgumentNotNull(beamType.getCollectionElementType()).getNullable(); + + Type listType = + elementTypeIsNullable + ? Types.ListType.ofOptional(nestedFieldId, icebergCollectionType) + : Types.ListType.ofRequired(nestedFieldId, icebergCollectionType); + + return new TypeAndMaxId(listInfo.maxId, listType); + } else if (beamType.getTypeName().isMapType()) { // MAP + // key and value IDs need to be unique + int keyId = nestedFieldId; + int valueId = keyId + 1; + + // nested field IDs should be unique + nestedFieldId = valueId + 1; + Schema.FieldType beamKeyType = Preconditions.checkArgumentNotNull(beamType.getMapKeyType()); + TypeAndMaxId keyInfo = beamFieldTypeToIcebergFieldType(beamKeyType, nestedFieldId); + Type icebergKeyType = keyInfo.type; + + nestedFieldId = keyInfo.maxId + 1; + Schema.FieldType beamValueType = + Preconditions.checkArgumentNotNull(beamType.getMapValueType()); + TypeAndMaxId valueInfo = beamFieldTypeToIcebergFieldType(beamValueType, nestedFieldId); + Type icebergValueType = valueInfo.type; + + Type mapType = + beamValueType.getNullable() + ? Types.MapType.ofOptional(keyId, valueId, icebergKeyType, icebergValueType) + : Types.MapType.ofRequired(keyId, valueId, icebergKeyType, icebergValueType); + + return new TypeAndMaxId(valueInfo.maxId, mapType); + } else if (beamType.getTypeName().isCompositeType()) { // ROW + // Nested field IDs need to be unique from the field that contains this StructType + Schema nestedSchema = Preconditions.checkArgumentNotNull(beamType.getRowSchema()); + List nestedFields = new ArrayList<>(nestedSchema.getFieldCount()); + + int icebergFieldId = nestedFieldId; + nestedFieldId = icebergFieldId + nestedSchema.getFieldCount(); + for (Schema.Field beamField : nestedSchema.getFields()) { + TypeAndMaxId typeAndMaxId = + beamFieldTypeToIcebergFieldType(beamField.getType(), nestedFieldId); + Types.NestedField icebergField = + Types.NestedField.of( + icebergFieldId++, + beamField.getType().getNullable(), + beamField.getName(), + typeAndMaxId.type); + + nestedFields.add(icebergField); + nestedFieldId = typeAndMaxId.maxId + 1; + } + + Type structType = Types.StructType.of(nestedFields); + + return new TypeAndMaxId(nestedFieldId - 1, structType); + } + + return new TypeAndMaxId(nestedFieldId, Types.StringType.get()); + } + + /** + * Converts a Beam {@link Schema} to an Iceberg {@link org.apache.iceberg.Schema}. + * + *

The following unsupported Beam types will be defaulted to {@link Types.StringType}: + *

  • {@link Schema.TypeName.DECIMAL} + *
  • {@link Schema.TypeName.DATETIME} + *
  • {@link Schema.TypeName.LOGICAL_TYPE} + */ + public static org.apache.iceberg.Schema beamSchemaToIcebergSchema(final Schema schema) { + List fields = new ArrayList<>(schema.getFieldCount()); + int nestedFieldId = schema.getFieldCount() + 1; + int icebergFieldId = 1; + for (Schema.Field beamField : schema.getFields()) { + TypeAndMaxId typeAndMaxId = + beamFieldTypeToIcebergFieldType(beamField.getType(), nestedFieldId); + Types.NestedField icebergField = + Types.NestedField.of( + icebergFieldId++, + beamField.getType().getNullable(), + beamField.getName(), + typeAndMaxId.type); + + fields.add(icebergField); + nestedFieldId = typeAndMaxId.maxId + 1; + } + return new org.apache.iceberg.Schema(fields.toArray(new Types.NestedField[fields.size()])); + } + + /** Converts a Beam {@link Row} to an Iceberg {@link Record}. */ + public static Record beamRowToIcebergRecord(org.apache.iceberg.Schema schema, Row row) { + return copyRowIntoRecord(GenericRecord.create(schema), row); + } + + private static Record copyRowIntoRecord(Record baseRecord, Row value) { + Record rec = baseRecord.copy(); + for (Types.NestedField f : rec.struct().fields()) { + copyFieldIntoRecord(rec, f, value); + } + return rec; + } + + private static void copyFieldIntoRecord(Record rec, Types.NestedField field, Row value) { + String name = field.name(); + switch (field.type().typeId()) { + case BOOLEAN: + Optional.ofNullable(value.getBoolean(name)).ifPresent(v -> rec.setField(name, v)); + break; + case INTEGER: + Optional.ofNullable(value.getInt32(name)).ifPresent(v -> rec.setField(name, v)); + break; + case LONG: + Optional.ofNullable(value.getInt64(name)).ifPresent(v -> rec.setField(name, v)); + break; + case FLOAT: + Optional.ofNullable(value.getFloat(name)).ifPresent(v -> rec.setField(name, v)); + break; + case DOUBLE: + Optional.ofNullable(value.getDouble(name)).ifPresent(v -> rec.setField(name, v)); + break; + case DATE: + throw new UnsupportedOperationException("Date fields not yet supported"); + case TIME: + throw new UnsupportedOperationException("Time fields not yet supported"); + case TIMESTAMP: + Optional.ofNullable(value.getDateTime(name)) + .ifPresent(v -> rec.setField(name, v.getMillis())); + break; + case STRING: + Optional.ofNullable(value.getString(name)).ifPresent(v -> rec.setField(name, v)); + break; + case UUID: + Optional.ofNullable(value.getBytes(name)) + .ifPresent(v -> rec.setField(name, UUID.nameUUIDFromBytes(v))); + break; + case FIXED: + throw new UnsupportedOperationException("Fixed-precision fields are not yet supported."); + case BINARY: + Optional.ofNullable(value.getBytes(name)) + .ifPresent(v -> rec.setField(name, ByteBuffer.wrap(v))); + break; + case DECIMAL: + Optional.ofNullable(value.getDecimal(name)).ifPresent(v -> rec.setField(name, v)); + break; + case STRUCT: + Optional.ofNullable(value.getRow(name)) + .ifPresent( + row -> + rec.setField( + name, + copyRowIntoRecord(GenericRecord.create(field.type().asStructType()), row))); + break; + case LIST: + Optional.ofNullable(value.getArray(name)).ifPresent(list -> rec.setField(name, list)); + break; + case MAP: + Optional.ofNullable(value.getMap(name)).ifPresent(v -> rec.setField(name, v)); + break; + } + } + + /** Converts an Iceberg {@link Record} to a Beam {@link Row}. 
*/ + public static Row icebergRecordToBeamRow(Schema schema, Record record) { + Row.Builder rowBuilder = Row.withSchema(schema); + for (Schema.Field field : schema.getFields()) { + boolean isNullable = field.getType().getNullable(); + @Nullable Object icebergValue = record.getField(field.getName()); + if (icebergValue == null) { + if (isNullable) { + rowBuilder.addValue(null); + continue; + } + throw new RuntimeException( + String.format("Received null value for required field '%s'.", field.getName())); + } + switch (field.getType().getTypeName()) { + case BYTE: + case INT16: + case INT32: + case INT64: + case DECIMAL: // Iceberg and Beam both use BigDecimal + case FLOAT: // Iceberg and Beam both use float + case DOUBLE: // Iceberg and Beam both use double + case STRING: // Iceberg and Beam both use String + case BOOLEAN: // Iceberg and Beam both use boolean + case ARRAY: + case ITERABLE: + case MAP: + rowBuilder.addValue(icebergValue); + break; + case DATETIME: + // Iceberg uses a long for millis; Beam uses joda time DateTime + long millis = (long) icebergValue; + rowBuilder.addValue(new DateTime(millis, DateTimeZone.UTC)); + break; + case BYTES: + // Iceberg uses ByteBuffer; Beam uses byte[] + rowBuilder.addValue(((ByteBuffer) icebergValue).array()); + break; + case ROW: + Record nestedRecord = (Record) icebergValue; + Schema nestedSchema = + checkArgumentNotNull( + field.getType().getRowSchema(), + "Corrupted schema: Row type did not have associated nested schema."); + rowBuilder.addValue(icebergRecordToBeamRow(nestedSchema, nestedRecord)); + break; + case LOGICAL_TYPE: + throw new UnsupportedOperationException( + "Cannot convert iceberg field to Beam logical type"); + default: + throw new UnsupportedOperationException( + "Unsupported Beam type: " + field.getType().getTypeName()); + } + } + return rowBuilder.build(); + } +} diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java index 859310bdcecb..d7212783d1b6 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/RecordWriter.java @@ -17,7 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg; -import static org.apache.beam.sdk.io.iceberg.SchemaAndRowConversions.rowToRecord; +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.beamRowToIcebergRecord; import java.io.IOException; import org.apache.beam.sdk.values.Row; @@ -80,7 +80,7 @@ class RecordWriter { } public void write(Row row) { - Record record = rowToRecord(table.schema(), row); + Record record = beamRowToIcebergRecord(table.schema(), row); icebergDataWriter.write(record); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanSource.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanSource.java index ed2f2eda767e..ff2aa0833481 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanSource.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanSource.java @@ -50,7 +50,7 @@ private TableScan getTableScan() { scanConfig .getTable() .newScan() - .project(SchemaAndRowConversions.beamSchemaToIcebergSchema(scanConfig.getSchema())); + .project(IcebergUtils.beamSchemaToIcebergSchema(scanConfig.getSchema())); if (scanConfig.getFilter() != null) { tableScan = tableScan.filter(scanConfig.getFilter()); diff --git 
a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java index 52e6d60c1fbd..b7cb42b2eacb 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/ScanTaskReader.java @@ -60,7 +60,7 @@ class ScanTaskReader extends BoundedSource.BoundedReader { public ScanTaskReader(ScanTaskSource source) { this.source = source; - this.project = SchemaAndRowConversions.beamSchemaToIcebergSchema(source.getSchema()); + this.project = IcebergUtils.beamSchemaToIcebergSchema(source.getSchema()); } @Override @@ -160,7 +160,7 @@ public Row getCurrent() throws NoSuchElementException { if (current == null) { throw new NoSuchElementException(); } - return SchemaAndRowConversions.recordToRow(source.getSchema(), current); + return IcebergUtils.icebergRecordToBeamRow(source.getSchema(), current); } @Override diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaAndRowConversions.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaAndRowConversions.java deleted file mode 100644 index e1a8685614f5..000000000000 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaAndRowConversions.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io.iceberg; - -import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; - -import java.nio.ByteBuffer; -import java.util.Map; -import java.util.Optional; -import java.util.UUID; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; - -class SchemaAndRowConversions { - - private SchemaAndRowConversions() {} - - static final Map BEAM_TYPES_TO_ICEBERG_TYPES = - ImmutableMap.builder() - .put(Schema.FieldType.BOOLEAN, Types.BooleanType.get()) - .put(Schema.FieldType.INT32, Types.IntegerType.get()) - .put(Schema.FieldType.INT64, Types.LongType.get()) - .put(Schema.FieldType.FLOAT, Types.FloatType.get()) - .put(Schema.FieldType.DOUBLE, Types.DoubleType.get()) - .put(Schema.FieldType.STRING, Types.StringType.get()) - .put(Schema.FieldType.BYTES, Types.BinaryType.get()) - .build(); - - public static Schema.FieldType icebergTypeToBeamFieldType(final Type type) { - switch (type.typeId()) { - case BOOLEAN: - return Schema.FieldType.BOOLEAN; - case INTEGER: - return Schema.FieldType.INT32; - case LONG: - return Schema.FieldType.INT64; - case FLOAT: - return Schema.FieldType.FLOAT; - case DOUBLE: - return Schema.FieldType.DOUBLE; - case DATE: - case TIME: - case TIMESTAMP: // TODO: Logical types? - return Schema.FieldType.DATETIME; - case STRING: - return Schema.FieldType.STRING; - case UUID: - case BINARY: - return Schema.FieldType.BYTES; - case FIXED: - case DECIMAL: - return Schema.FieldType.DECIMAL; - case STRUCT: - return Schema.FieldType.row(icebergStructTypeToBeamSchema(type.asStructType())); - case LIST: - return Schema.FieldType.iterable( - icebergTypeToBeamFieldType(type.asListType().elementType())); - case MAP: - return Schema.FieldType.map( - icebergTypeToBeamFieldType(type.asMapType().keyType()), - icebergTypeToBeamFieldType(type.asMapType().valueType())); - } - throw new RuntimeException("Unrecognized IcebergIO Type"); - } - - public static Schema.Field icebergFieldToBeamField(final Types.NestedField field) { - return Schema.Field.of(field.name(), icebergTypeToBeamFieldType(field.type())) - .withNullable(field.isOptional()); - } - - public static Schema icebergSchemaToBeamSchema(final org.apache.iceberg.Schema schema) { - Schema.Builder builder = Schema.builder(); - for (Types.NestedField f : schema.columns()) { - builder.addField(icebergFieldToBeamField(f)); - } - return builder.build(); - } - - public static Schema icebergStructTypeToBeamSchema(final Types.StructType struct) { - Schema.Builder builder = Schema.builder(); - for (Types.NestedField f : struct.fields()) { - builder.addField(icebergFieldToBeamField(f)); - } - return builder.build(); - } - - public static Types.NestedField beamFieldToIcebergField(int fieldId, final Schema.Field field) { - @Nullable Type icebergType = BEAM_TYPES_TO_ICEBERG_TYPES.get(field.getType()); - - if (icebergType != null) { - return Types.NestedField.of( - fieldId, field.getType().getNullable(), field.getName(), icebergType); - } else { - return Types.NestedField.of( - fieldId, field.getType().getNullable(), field.getName(), Types.StringType.get()); - } - } - - public static 
org.apache.iceberg.Schema beamSchemaToIcebergSchema(final Schema schema) { - Types.NestedField[] fields = new Types.NestedField[schema.getFieldCount()]; - int fieldId = 0; - for (Schema.Field f : schema.getFields()) { - fields[fieldId++] = beamFieldToIcebergField(fieldId, f); - } - return new org.apache.iceberg.Schema(fields); - } - - public static Record rowToRecord(org.apache.iceberg.Schema schema, Row row) { - return copyRowIntoRecord(GenericRecord.create(schema), row); - } - - private static Record copyRowIntoRecord(Record baseRecord, Row value) { - Record rec = baseRecord.copy(); - for (Types.NestedField f : rec.struct().fields()) { - copyFieldIntoRecord(rec, f, value); - } - return rec; - } - - private static void copyFieldIntoRecord(Record rec, Types.NestedField field, Row value) { - String name = field.name(); - switch (field.type().typeId()) { - case BOOLEAN: - Optional.ofNullable(value.getBoolean(name)).ifPresent(v -> rec.setField(name, v)); - break; - case INTEGER: - Optional.ofNullable(value.getInt32(name)).ifPresent(v -> rec.setField(name, v)); - break; - case LONG: - Optional.ofNullable(value.getInt64(name)).ifPresent(v -> rec.setField(name, v)); - break; - case FLOAT: - Optional.ofNullable(value.getFloat(name)).ifPresent(v -> rec.setField(name, v)); - break; - case DOUBLE: - Optional.ofNullable(value.getDouble(name)).ifPresent(v -> rec.setField(name, v)); - break; - case DATE: - throw new UnsupportedOperationException("Date fields not yet supported"); - case TIME: - throw new UnsupportedOperationException("Time fields not yet supported"); - case TIMESTAMP: - Optional.ofNullable(value.getDateTime(name)) - .ifPresent(v -> rec.setField(name, v.getMillis())); - break; - case STRING: - Optional.ofNullable(value.getString(name)).ifPresent(v -> rec.setField(name, v)); - break; - case UUID: - Optional.ofNullable(value.getBytes(name)) - .ifPresent(v -> rec.setField(name, UUID.nameUUIDFromBytes(v))); - break; - case FIXED: - throw new UnsupportedOperationException("Fixed-precision fields are not yet supported."); - case BINARY: - Optional.ofNullable(value.getBytes(name)) - .ifPresent(v -> rec.setField(name, ByteBuffer.wrap(v))); - break; - case DECIMAL: - Optional.ofNullable(value.getDecimal(name)).ifPresent(v -> rec.setField(name, v)); - break; - case STRUCT: - Optional.ofNullable(value.getRow(name)) - .ifPresent( - row -> - rec.setField( - name, - copyRowIntoRecord(GenericRecord.create(field.type().asStructType()), row))); - break; - case LIST: - throw new UnsupportedOperationException("List fields are not yet supported."); - case MAP: - throw new UnsupportedOperationException("Map fields are not yet supported."); - } - } - - public static Row recordToRow(Schema schema, Record record) { - Row.Builder rowBuilder = Row.withSchema(schema); - for (Schema.Field field : schema.getFields()) { - switch (field.getType().getTypeName()) { - case BYTE: - // I guess allow anything we can cast here - byte byteValue = (byte) record.getField(field.getName()); - rowBuilder.addValue(byteValue); - break; - case INT16: - // I guess allow anything we can cast here - short shortValue = (short) record.getField(field.getName()); - rowBuilder.addValue(shortValue); - break; - case INT32: - // I guess allow anything we can cast here - int intValue = (int) record.getField(field.getName()); - rowBuilder.addValue(intValue); - break; - case INT64: - // I guess allow anything we can cast here - long longValue = (long) record.getField(field.getName()); - rowBuilder.addValue(longValue); - break; - case DECIMAL: - // 
Iceberg and Beam both use BigDecimal - rowBuilder.addValue(record.getField(field.getName())); - break; - case FLOAT: - // Iceberg and Beam both use float - rowBuilder.addValue(record.getField(field.getName())); - break; - case DOUBLE: - // Iceberg and Beam both use double - rowBuilder.addValue(record.getField(field.getName())); - break; - case STRING: - // Iceberg and Beam both use String - rowBuilder.addValue(record.getField(field.getName())); - break; - case DATETIME: - // Iceberg uses a long for millis; Beam uses joda time DateTime - long millis = (long) record.getField(field.getName()); - rowBuilder.addValue(new DateTime(millis, DateTimeZone.UTC)); - break; - case BOOLEAN: - // Iceberg and Beam both use String - rowBuilder.addValue(record.getField(field.getName())); - break; - case BYTES: - // Iceberg uses ByteBuffer; Beam uses byte[] - rowBuilder.addValue(((ByteBuffer) record.getField(field.getName())).array()); - break; - case ARRAY: - throw new UnsupportedOperationException("Array fields are not yet supported."); - case ITERABLE: - throw new UnsupportedOperationException("Iterable fields are not yet supported."); - case MAP: - throw new UnsupportedOperationException("Map fields are not yet supported."); - case ROW: - Record nestedRecord = (Record) record.getField(field.getName()); - Schema nestedSchema = - checkArgumentNotNull( - field.getType().getRowSchema(), - "Corrupted schema: Row type did not have associated nested schema."); - Row nestedRow = recordToRow(nestedSchema, nestedRecord); - rowBuilder.addValue(nestedRow); - break; - case LOGICAL_TYPE: - throw new UnsupportedOperationException( - "Cannot convert iceberg field to Beam logical type"); - } - } - return rowBuilder.build(); - } -} diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java index 0420e2f57797..3a169eeb40da 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOIT.java @@ -108,8 +108,7 @@ public static void beforeClass() { catalogHadoopConf = new Configuration(); catalogHadoopConf.set("fs.gs.project.id", options.getProject()); - catalogHadoopConf.set( - "fs.gs.auth.service.account.json.keyfile", System.getenv("GOOGLE_APPLICATION_CREDENTIALS")); + catalogHadoopConf.set("fs.gs.auth.type", "APPLICATION_DEFAULT"); } @Before @@ -135,8 +134,7 @@ public void setUp() { .addByteArrayField("bytes") .build(); - static final Schema ICEBERG_SCHEMA = - SchemaAndRowConversions.beamSchemaToIcebergSchema(BEAM_SCHEMA); + static final Schema ICEBERG_SCHEMA = IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA); Map getValues(int num) { String strNum = Integer.toString(num); @@ -239,7 +237,7 @@ public void testWrite() { List inputRows = inputRecords.stream() - .map(record -> SchemaAndRowConversions.recordToRow(BEAM_SCHEMA, record)) + .map(record -> IcebergUtils.icebergRecordToBeamRow(BEAM_SCHEMA, record)) .collect(Collectors.toList()); // Write with Beam diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java index d6db3f689117..3f31073b4448 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java @@ -70,7 +70,7 @@ 
public void testSimpleScan() throws Exception { TableIdentifier tableId = TableIdentifier.of("default", "table" + Long.toString(UUID.randomUUID().hashCode(), 16)); Table simpleTable = warehouse.createTable(tableId, TestFixtures.SCHEMA); - final Schema schema = SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); + final Schema schema = IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); simpleTable .newFastAppend() @@ -91,7 +91,7 @@ public void testSimpleScan() throws Exception { TestFixtures.FILE2SNAPSHOT1, TestFixtures.FILE3SNAPSHOT1) .flatMap(List::stream) - .map(record -> SchemaAndRowConversions.recordToRow(schema, record)) + .map(record -> IcebergUtils.icebergRecordToBeamRow(schema, record)) .collect(Collectors.toList()); Properties props = new Properties(); @@ -105,9 +105,7 @@ public void testSimpleScan() throws Exception { testPipeline .apply(IcebergIO.readRows(catalogConfig).from(tableId)) .apply(ParDo.of(new PrintRow())) - .setCoder( - RowCoder.of( - SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA))); + .setCoder(RowCoder.of(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA))); PAssert.that(output) .satisfies( diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java index e0a584ec9da9..02213c45e075 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java @@ -17,7 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg; -import static org.apache.beam.sdk.io.iceberg.SchemaAndRowConversions.rowToRecord; +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.beamRowToIcebergRecord; import static org.hamcrest.MatcherAssert.assertThat; import java.io.Serializable; @@ -85,7 +85,7 @@ public void testSimpleAppend() throws Exception { testPipeline .apply("Records To Add", Create.of(TestFixtures.asRows(TestFixtures.FILE1SNAPSHOT1))) - .setRowSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .apply("Append To Table", IcebergIO.writeRows(catalog).to(tableId)); LOG.info("Executing pipeline"); @@ -152,7 +152,7 @@ public IcebergDestination instantiateDestination(Row dest) { TestFixtures.FILE1SNAPSHOT1, TestFixtures.FILE1SNAPSHOT2, TestFixtures.FILE1SNAPSHOT3)))) - .setRowSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .apply("Append To Table", IcebergIO.writeRows(catalog).to(dynamicDestinations)); LOG.info("Executing pipeline"); @@ -235,7 +235,7 @@ public IcebergDestination instantiateDestination(Row dest) { testPipeline .apply("Records To Add", Create.of(TestFixtures.asRows(elements))) - .setRowSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .apply("Append To Table", IcebergIO.writeRows(catalog).to(dynamicDestinations)); LOG.info("Executing pipeline"); @@ -262,9 +262,9 @@ public void testIdempotentCommit() throws Exception { // Create a table and add records to it. 
Table table = warehouse.createTable(tableId, TestFixtures.SCHEMA); Record record = - rowToRecord( + beamRowToIcebergRecord( table.schema(), - Row.withSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + Row.withSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .addValues(42L, "bizzle") .build()); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java index bc15021fa2b0..effb5cc4838e 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java @@ -73,7 +73,7 @@ public void testSimpleScan() throws Exception { TableIdentifier tableId = TableIdentifier.parse(identifier); Table simpleTable = warehouse.createTable(tableId, TestFixtures.SCHEMA); - final Schema schema = SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); + final Schema schema = IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); simpleTable .newFastAppend() @@ -94,7 +94,7 @@ public void testSimpleScan() throws Exception { TestFixtures.FILE2SNAPSHOT1, TestFixtures.FILE3SNAPSHOT1) .flatMap(List::stream) - .map(record -> SchemaAndRowConversions.recordToRow(schema, record)) + .map(record -> IcebergUtils.icebergRecordToBeamRow(schema, record)) .collect(Collectors.toList()); Map properties = new HashMap<>(); @@ -129,7 +129,7 @@ public void testReadUsingManagedTransform() throws Exception { TableIdentifier tableId = TableIdentifier.parse(identifier); Table simpleTable = warehouse.createTable(tableId, TestFixtures.SCHEMA); - final Schema schema = SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); + final Schema schema = IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA); simpleTable .newFastAppend() @@ -150,7 +150,7 @@ public void testReadUsingManagedTransform() throws Exception { TestFixtures.FILE2SNAPSHOT1, TestFixtures.FILE3SNAPSHOT1) .flatMap(List::stream) - .map(record -> SchemaAndRowConversions.recordToRow(schema, record)) + .map(record -> IcebergUtils.icebergRecordToBeamRow(schema, record)) .collect(Collectors.toList()); String yamlConfig = diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java new file mode 100644 index 000000000000..a20d5b7c8f59 --- /dev/null +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java @@ -0,0 +1,677 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.TypeAndMaxId; +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.beamFieldTypeToIcebergFieldType; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.junit.Test; +import org.junit.experimental.runners.Enclosed; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(Enclosed.class) +public class IcebergUtilsTest { + + @RunWith(JUnit4.class) + public static class RowToRecordTests { + /** + * Checks a value that when converted to Iceberg type is the same value when interpreted in + * Java. + */ + private void checkRowValueToRecordValue( + Schema.FieldType sourceType, Type destType, Object value) { + checkRowValueToRecordValue(sourceType, value, destType, value); + } + + private void checkRowValueToRecordValue( + Schema.FieldType sourceType, Object sourceValue, Type destType, Object destValue) { + Schema beamSchema = Schema.of(Schema.Field.of("v", sourceType)); + Row row = Row.withSchema(beamSchema).addValues(sourceValue).build(); + + org.apache.iceberg.Schema icebergSchema = + new org.apache.iceberg.Schema(required(0, "v", destType)); + Record record = IcebergUtils.beamRowToIcebergRecord(icebergSchema, row); + + assertThat(record.getField("v"), equalTo(destValue)); + } + + @Test + public void testBoolean() { + checkRowValueToRecordValue(Schema.FieldType.BOOLEAN, Types.BooleanType.get(), true); + checkRowValueToRecordValue(Schema.FieldType.BOOLEAN, Types.BooleanType.get(), false); + } + + @Test + public void testInteger() { + checkRowValueToRecordValue(Schema.FieldType.INT32, Types.IntegerType.get(), -13); + checkRowValueToRecordValue(Schema.FieldType.INT32, Types.IntegerType.get(), 42); + checkRowValueToRecordValue(Schema.FieldType.INT32, Types.IntegerType.get(), 0); + } + + @Test + public void testLong() { + checkRowValueToRecordValue(Schema.FieldType.INT64, Types.LongType.get(), 13L); + checkRowValueToRecordValue(Schema.FieldType.INT64, Types.LongType.get(), 42L); + } + + @Test + public void testFloat() { + checkRowValueToRecordValue(Schema.FieldType.FLOAT, Types.FloatType.get(), 3.14159f); + checkRowValueToRecordValue(Schema.FieldType.FLOAT, Types.FloatType.get(), 42.0f); + } + + @Test + public void testDouble() { + checkRowValueToRecordValue(Schema.FieldType.DOUBLE, Types.DoubleType.get(), 3.14159); + } + + @Test + public void testDate() {} + + @Test + public void testTime() {} + + @Test + public void testTimestamp() { + DateTime dateTime = + new DateTime().withDate(1979, 03, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); + + 
checkRowValueToRecordValue( + Schema.FieldType.DATETIME, + dateTime.toInstant(), + Types.TimestampType.withoutZone(), + dateTime.getMillis()); + } + + @Test + public void testFixed() {} + + @Test + public void testBinary() { + byte[] bytes = new byte[] {1, 2, 3, 4}; + checkRowValueToRecordValue( + Schema.FieldType.BYTES, bytes, Types.BinaryType.get(), ByteBuffer.wrap(bytes)); + } + + @Test + public void testDecimal() { + BigDecimal num = BigDecimal.valueOf(123.456); + + checkRowValueToRecordValue(Schema.FieldType.DECIMAL, Types.DecimalType.of(6, 3), num); + } + + @Test + public void testStruct() { + Schema schema = Schema.builder().addStringField("nested_str").build(); + Row beamRow = Row.withSchema(schema).addValue("str_value").build(); + + Types.NestedField nestedFieldType = required(1, "nested_str", Types.StringType.get()); + GenericRecord icebergRow = + GenericRecord.create(new org.apache.iceberg.Schema(nestedFieldType)); + icebergRow.setField("nested_str", "str_value"); + + checkRowValueToRecordValue( + Schema.FieldType.row(schema), beamRow, Types.StructType.of(nestedFieldType), icebergRow); + } + + @Test + public void testMap() { + Map map = + ImmutableMap.builder().put("a", 123).put("b", 456).put("c", 789).build(); + + checkRowValueToRecordValue( + Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32), + Types.MapType.ofRequired(1, 2, Types.StringType.get(), Types.IntegerType.get()), + map); + } + + @Test + public void testList() { + List list = Arrays.asList("abc", "xyz", "123", "foo", "bar"); + + checkRowValueToRecordValue( + Schema.FieldType.array(Schema.FieldType.STRING), + Types.ListType.ofRequired(1, Types.StringType.get()), + list); + } + } + + @RunWith(JUnit4.class) + public static class RecordToRowTests { + private void checkRecordValueToRowValue( + Type sourceType, Schema.FieldType destType, Object value) { + checkRecordValueToRowValue(sourceType, value, destType, value); + } + + private void checkRecordValueToRowValue( + Type sourceType, Object sourceValue, Schema.FieldType destType, Object destValue) { + Schema beamSchema = Schema.of(Schema.Field.of("v", destType)); + + org.apache.iceberg.Schema icebergSchema = + new org.apache.iceberg.Schema(required(0, "v", sourceType)); + Record record = GenericRecord.create(icebergSchema); + record.setField("v", sourceValue); + + Row row = IcebergUtils.icebergRecordToBeamRow(beamSchema, record); + + assertThat(row.getBaseValue("v"), equalTo(destValue)); + } + + @Test + public void testBoolean() { + checkRecordValueToRowValue(Types.BooleanType.get(), Schema.FieldType.BOOLEAN, true); + checkRecordValueToRowValue(Types.BooleanType.get(), Schema.FieldType.BOOLEAN, false); + } + + @Test + public void testInteger() { + checkRecordValueToRowValue(Types.IntegerType.get(), Schema.FieldType.INT32, -13); + checkRecordValueToRowValue(Types.IntegerType.get(), Schema.FieldType.INT32, 42); + checkRecordValueToRowValue(Types.IntegerType.get(), Schema.FieldType.INT32, 0); + } + + @Test + public void testLong() { + checkRecordValueToRowValue(Types.LongType.get(), Schema.FieldType.INT64, 13L); + checkRecordValueToRowValue(Types.LongType.get(), Schema.FieldType.INT64, 42L); + } + + @Test + public void testFloat() { + checkRecordValueToRowValue(Types.FloatType.get(), Schema.FieldType.FLOAT, 3.14159f); + checkRecordValueToRowValue(Types.FloatType.get(), Schema.FieldType.FLOAT, 42.0f); + } + + @Test + public void testDouble() { + checkRecordValueToRowValue(Types.DoubleType.get(), Schema.FieldType.DOUBLE, 3.14159); + } + + @Test + public 
void testDate() {} + + @Test + public void testTime() {} + + @Test + public void testTimestamp() { + DateTime dateTime = + new DateTime().withDate(1979, 03, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); + + checkRecordValueToRowValue( + Types.TimestampType.withoutZone(), + dateTime.getMillis(), + Schema.FieldType.DATETIME, + dateTime.toInstant()); + } + + @Test + public void testFixed() {} + + @Test + public void testBinary() { + byte[] bytes = new byte[] {1, 2, 3, 4}; + checkRecordValueToRowValue( + Types.BinaryType.get(), ByteBuffer.wrap(bytes), Schema.FieldType.BYTES, bytes); + } + + @Test + public void testDecimal() { + BigDecimal num = BigDecimal.valueOf(123.456); + + checkRecordValueToRowValue(Types.DecimalType.of(6, 3), Schema.FieldType.DECIMAL, num); + } + + @Test + public void testStruct() { + Schema schema = Schema.builder().addStringField("nested_str").build(); + Row beamRow = Row.withSchema(schema).addValue("str_value").build(); + + Types.NestedField nestedFieldType = required(1, "nested_str", Types.StringType.get()); + GenericRecord icebergRow = + GenericRecord.create(new org.apache.iceberg.Schema(nestedFieldType)); + icebergRow.setField("nested_str", "str_value"); + + checkRecordValueToRowValue( + Types.StructType.of(nestedFieldType), icebergRow, Schema.FieldType.row(schema), beamRow); + } + + @Test + public void testMap() { + Map map = + ImmutableMap.builder().put("a", 123).put("b", 456).put("c", 789).build(); + + checkRecordValueToRowValue( + Types.MapType.ofRequired(1, 2, Types.StringType.get(), Types.IntegerType.get()), + Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32), + map); + } + + @Test + public void testList() { + List list = Arrays.asList("abc", "xyz", "123", "foo", "bar"); + + checkRecordValueToRowValue( + Types.ListType.ofRequired(1, Types.StringType.get()), + Schema.FieldType.iterable(Schema.FieldType.STRING), + list); + } + } + + @RunWith(JUnit4.class) + public static class SchemaTests { + private static class BeamFieldTypeTestCase { + final int icebergFieldId; + final Schema.FieldType beamType; + final int expectedMaxId; + final Type expectedIcebergType; + + BeamFieldTypeTestCase( + int icebergFieldId, + Schema.FieldType beamType, + int expectedMaxId, + Type expectedIcebergType) { + this.icebergFieldId = icebergFieldId; + this.beamType = beamType; + this.expectedMaxId = expectedMaxId; + this.expectedIcebergType = expectedIcebergType; + } + } + + private void checkTypes(List testCases) { + for (BeamFieldTypeTestCase testCase : testCases) { + TypeAndMaxId ret = + beamFieldTypeToIcebergFieldType(testCase.beamType, testCase.icebergFieldId); + + assertEquals(testCase.expectedMaxId, ret.maxId); + checkEquals(testCase.expectedIcebergType, ret.type); + } + } + + private void checkEquals(Type expected, Type actual) { + if (expected.isListType() && actual.isListType()) { + Type nestedExpected = expected.asListType().elementType(); + Type nestedActual = actual.asListType().elementType(); + + assertEquals(nestedExpected.typeId(), nestedActual.typeId()); + checkEquals(nestedExpected, nestedActual); + } else { + assertEquals(expected, actual); + } + } + + @Test + public void testPrimitiveBeamFieldTypeToIcebergFieldType() { + // primitive types don't use the nested field ID + List primitives = + Arrays.asList( + new BeamFieldTypeTestCase(1, Schema.FieldType.BOOLEAN, 0, Types.BooleanType.get()), + new BeamFieldTypeTestCase(3, Schema.FieldType.INT32, 2, Types.IntegerType.get()), + new BeamFieldTypeTestCase(6, Schema.FieldType.INT64, 5, 
Types.LongType.get()), + new BeamFieldTypeTestCase(10, Schema.FieldType.FLOAT, 9, Types.FloatType.get()), + new BeamFieldTypeTestCase(7, Schema.FieldType.DOUBLE, 6, Types.DoubleType.get()), + new BeamFieldTypeTestCase(11, Schema.FieldType.STRING, 10, Types.StringType.get()), + new BeamFieldTypeTestCase(15, Schema.FieldType.BYTES, 14, Types.BinaryType.get())); + + checkTypes(primitives); + } + + @Test + public void testArrayBeamFieldTypeToIcebergFieldType() { + // Iceberg's ListType reserves one nested ID for its element type + List listTypes = + Arrays.asList( + new BeamFieldTypeTestCase( + 1, + Schema.FieldType.array(Schema.FieldType.BOOLEAN), + 1, + Types.ListType.ofRequired(1, Types.BooleanType.get())), + new BeamFieldTypeTestCase( + 3, + Schema.FieldType.iterable(Schema.FieldType.INT32), + 3, + Types.ListType.ofRequired(3, Types.IntegerType.get())), + new BeamFieldTypeTestCase( + 6, + Schema.FieldType.array(Schema.FieldType.INT64), + 6, + Types.ListType.ofRequired(6, Types.LongType.get())), + new BeamFieldTypeTestCase( + 10, + Schema.FieldType.array(Schema.FieldType.FLOAT), + 10, + Types.ListType.ofRequired(10, Types.FloatType.get())), + new BeamFieldTypeTestCase( + 7, + Schema.FieldType.iterable(Schema.FieldType.DOUBLE), + 7, + Types.ListType.ofRequired(7, Types.DoubleType.get())), + new BeamFieldTypeTestCase( + 11, + Schema.FieldType.array(Schema.FieldType.STRING), + 11, + Types.ListType.ofRequired(11, Types.StringType.get())), + new BeamFieldTypeTestCase( + 15, + Schema.FieldType.iterable(Schema.FieldType.BYTES), + 15, + Types.ListType.ofRequired(15, Types.BinaryType.get())), + new BeamFieldTypeTestCase( + 23, + Schema.FieldType.array( + Schema.FieldType.array(Schema.FieldType.iterable(Schema.FieldType.STRING))), + 25, + Types.ListType.ofRequired( + 23, + Types.ListType.ofRequired( + 24, Types.ListType.ofRequired(25, Types.StringType.get()))))); + + checkTypes(listTypes); + } + + @Test + public void testStructBeamFieldTypeToIcebergFieldType() { + // Iceberg sets one unique field ID for each nested type. 
+ List listTypes = + Arrays.asList( + new BeamFieldTypeTestCase( + 1, + Schema.FieldType.row(Schema.builder().addStringField("str").build()), + 1, + Types.StructType.of( + Types.NestedField.required(1, "str", Types.StringType.get()))), + new BeamFieldTypeTestCase( + 3, + Schema.FieldType.row(Schema.builder().addInt32Field("int").build()), + 3, + Types.StructType.of( + Types.NestedField.required(3, "int", Types.IntegerType.get()))), + new BeamFieldTypeTestCase( + 1, + Schema.FieldType.row(BEAM_SCHEMA_PRIMITIVE), + 7, + Types.StructType.of(ICEBERG_SCHEMA_PRIMITIVE.columns())), + new BeamFieldTypeTestCase( + 15, + Schema.FieldType.row( + Schema.builder() + .addArrayField("arr", Schema.FieldType.STRING) + .addNullableStringField("str") + .build()), + 17, + Types.StructType.of( + Types.NestedField.required( + 15, "arr", Types.ListType.ofRequired(17, Types.StringType.get())), + Types.NestedField.optional(16, "str", Types.StringType.get()))), + new BeamFieldTypeTestCase( + 20, + Schema.FieldType.row( + Schema.builder() + .addRowField( + "row", + Schema.builder() + .addRowField( + "nested_row", Schema.builder().addStringField("str").build()) + .build()) + .addNullableRowField( + "nullable_row", Schema.builder().addInt64Field("long").build()) + .build()), + 24, + Types.StructType.of( + Types.NestedField.required( + 20, + "row", + Types.StructType.of( + Types.NestedField.required( + 22, + "nested_row", + Types.StructType.of( + Types.NestedField.required( + 23, "str", Types.StringType.get()))))), + Types.NestedField.optional( + 21, + "nullable_row", + Types.StructType.of( + Types.NestedField.required(24, "long", Types.LongType.get())))))); + + checkTypes(listTypes); + } + + @Test + public void testMapBeamFieldTypeToIcebergFieldType() { + // Iceberg's MapType reserves two nested IDs. one for its key type and one for its value type. 
+ List primitives = + Arrays.asList( + new BeamFieldTypeTestCase( + 1, + Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32), + 2, + Types.MapType.ofRequired(1, 2, Types.StringType.get(), Types.IntegerType.get())), + new BeamFieldTypeTestCase( + 6, + Schema.FieldType.map( + Schema.FieldType.FLOAT, Schema.FieldType.array(Schema.FieldType.STRING)), + 8, + Types.MapType.ofRequired( + 6, + 7, + Types.FloatType.get(), + Types.ListType.ofRequired(8, Types.StringType.get()))), + new BeamFieldTypeTestCase( + 10, + Schema.FieldType.map( + Schema.FieldType.STRING, + Schema.FieldType.map( + Schema.FieldType.BOOLEAN, + Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32))), + 15, + Types.MapType.ofRequired( + 10, + 11, + Types.StringType.get(), + Types.MapType.ofRequired( + 12, + 13, + Types.BooleanType.get(), + Types.MapType.ofRequired( + 14, 15, Types.StringType.get(), Types.IntegerType.get())))), + new BeamFieldTypeTestCase( + 15, + Schema.FieldType.map( + Schema.FieldType.row(Schema.builder().addStringField("str").build()), + Schema.FieldType.row(Schema.builder().addInt32Field("int").build())), + 18, + Types.MapType.ofRequired( + 15, + 16, + Types.StructType.of( + Types.NestedField.required(17, "str", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(18, "int", Types.IntegerType.get()))))); + + checkTypes(primitives); + } + + static final Schema BEAM_SCHEMA_PRIMITIVE = + Schema.builder() + .addInt32Field("int") + .addFloatField("float") + .addNullableDoubleField("double") + .addInt64Field("long") + .addNullableStringField("str") + .addNullableBooleanField("bool") + .addByteArrayField("bytes") + .build(); + + static final org.apache.iceberg.Schema ICEBERG_SCHEMA_PRIMITIVE = + new org.apache.iceberg.Schema( + required(1, "int", Types.IntegerType.get()), + required(2, "float", Types.FloatType.get()), + optional(3, "double", Types.DoubleType.get()), + required(4, "long", Types.LongType.get()), + optional(5, "str", Types.StringType.get()), + optional(6, "bool", Types.BooleanType.get()), + required(7, "bytes", Types.BinaryType.get())); + + @Test + public void testPrimitiveBeamSchemaToIcebergSchema() { + org.apache.iceberg.Schema convertedIcebergSchema = + IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA_PRIMITIVE); + + System.out.println(convertedIcebergSchema); + System.out.println(ICEBERG_SCHEMA_PRIMITIVE); + + assertTrue(convertedIcebergSchema.sameSchema(ICEBERG_SCHEMA_PRIMITIVE)); + } + + @Test + public void testPrimitiveIcebergSchemaToBeamSchema() { + Schema convertedBeamSchema = IcebergUtils.icebergSchemaToBeamSchema(ICEBERG_SCHEMA_PRIMITIVE); + + assertEquals(BEAM_SCHEMA_PRIMITIVE, convertedBeamSchema); + } + + static final Schema BEAM_SCHEMA_LIST = + Schema.builder() + .addIterableField("arr_str", Schema.FieldType.STRING) + .addIterableField("arr_int", Schema.FieldType.INT32) + .addIterableField("arr_bool", Schema.FieldType.BOOLEAN) + .build(); + static final org.apache.iceberg.Schema ICEBERG_SCHEMA_LIST = + new org.apache.iceberg.Schema( + required(1, "arr_str", Types.ListType.ofRequired(4, Types.StringType.get())), + required(2, "arr_int", Types.ListType.ofRequired(5, Types.IntegerType.get())), + required(3, "arr_bool", Types.ListType.ofRequired(6, Types.BooleanType.get()))); + + @Test + public void testArrayBeamSchemaToIcebergSchema() { + org.apache.iceberg.Schema convertedIcebergSchema = + IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA_LIST); + + assertTrue(convertedIcebergSchema.sameSchema(ICEBERG_SCHEMA_LIST)); + } + + 
@Test + public void testArrayIcebergSchemaToBeamSchema() { + Schema convertedBeamSchema = IcebergUtils.icebergSchemaToBeamSchema(ICEBERG_SCHEMA_LIST); + + System.out.println(convertedBeamSchema); + System.out.println(BEAM_SCHEMA_LIST); + + assertEquals(BEAM_SCHEMA_LIST, convertedBeamSchema); + } + + static final Schema BEAM_SCHEMA_MAP = + Schema.builder() + .addMapField("str_int", Schema.FieldType.STRING, Schema.FieldType.INT32) + .addNullableMapField("long_bool", Schema.FieldType.INT64, Schema.FieldType.BOOLEAN) + .build(); + + static final org.apache.iceberg.Schema ICEBERG_SCHEMA_MAP = + new org.apache.iceberg.Schema( + required( + 1, + "str_int", + Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.IntegerType.get())), + optional( + 2, + "long_bool", + Types.MapType.ofRequired(5, 6, Types.LongType.get(), Types.BooleanType.get()))); + + @Test + public void testMapBeamSchemaToIcebergSchema() { + org.apache.iceberg.Schema convertedIcebergSchema = + IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA_MAP); + + assertTrue(convertedIcebergSchema.sameSchema(ICEBERG_SCHEMA_MAP)); + } + + @Test + public void testMapIcebergSchemaToBeamSchema() { + Schema convertedBeamSchema = IcebergUtils.icebergSchemaToBeamSchema(ICEBERG_SCHEMA_MAP); + + assertEquals(BEAM_SCHEMA_MAP, convertedBeamSchema); + } + + static final Schema BEAM_SCHEMA_STRUCT = + Schema.builder() + .addRowField( + "row", + Schema.builder() + .addStringField("str") + .addNullableInt32Field("int") + .addInt64Field("long") + .build()) + .addNullableRowField( + "nullable_row", + Schema.builder().addNullableStringField("str").addBooleanField("bool").build()) + .build(); + + static final org.apache.iceberg.Schema ICEBERG_SCHEMA_STRUCT = + new org.apache.iceberg.Schema( + required( + 1, + "row", + Types.StructType.of( + required(3, "str", Types.StringType.get()), + optional(4, "int", Types.IntegerType.get()), + required(5, "long", Types.LongType.get()))), + optional( + 2, + "nullable_row", + Types.StructType.of( + optional(6, "str", Types.StringType.get()), + required(7, "bool", Types.BooleanType.get())))); + + @Test + public void testStructBeamSchemaToIcebergSchema() { + org.apache.iceberg.Schema convertedIcebergSchema = + IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA_STRUCT); + + assertTrue(convertedIcebergSchema.sameSchema(ICEBERG_SCHEMA_STRUCT)); + } + + @Test + public void testStructIcebergSchemaToBeamSchema() { + Schema convertedBeamSchema = IcebergUtils.icebergSchemaToBeamSchema(ICEBERG_SCHEMA_STRUCT); + + assertEquals(BEAM_SCHEMA_STRUCT, convertedBeamSchema); + } + } +} diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java index 75884f4bcf70..a2cd64e23956 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java @@ -102,8 +102,7 @@ public void testSimpleAppend() { testPipeline .apply( "Records To Add", Create.of(TestFixtures.asRows(TestFixtures.FILE1SNAPSHOT1))) - .setRowSchema( - SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA))); + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA))); PCollection result = input @@ -137,7 +136,7 @@ public void testWriteUsingManagedTransform() { PCollection inputRows = testPipeline 
.apply("Records To Add", Create.of(TestFixtures.asRows(TestFixtures.FILE1SNAPSHOT1))) - .setRowSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)); + .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)); PCollection result = inputRows.apply(Managed.write(Managed.ICEBERG).withConfig(configMap)).get(OUTPUT_TAG); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java index 143687e3c999..007cb028c665 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java @@ -79,7 +79,7 @@ public void testUnstartedReaderReadsSamesItsSource() throws Exception { .build()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(simpleTable.name().replace("hadoop.", "").split("\\.")) - .setSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .setSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .build()); BoundedSource.BoundedReader reader = source.createReader(options); @@ -121,7 +121,7 @@ public void testInitialSplitting() throws Exception { .build()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(simpleTable.name().replace("hadoop.", "").split("\\.")) - .setSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .setSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .build()); // Input data for this test is tiny so try a number of very small split sizes @@ -167,7 +167,7 @@ public void testDoubleInitialSplitting() throws Exception { .build()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(simpleTable.name().replace("hadoop.", "").split("\\.")) - .setSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) + .setSchema(IcebergUtils.icebergSchemaToBeamSchema(TestFixtures.SCHEMA)) .build()); // Input data for this test is tiny so make sure to split and get a few, but so they can be diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SchemaAndRowConversionsTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SchemaAndRowConversionsTest.java deleted file mode 100644 index 5c708700a17d..000000000000 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/SchemaAndRowConversionsTest.java +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io.iceberg; - -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.equalTo; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.nio.ByteBuffer; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.values.Row; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.Record; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.junit.Test; -import org.junit.experimental.runners.Enclosed; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -@RunWith(Enclosed.class) -public class SchemaAndRowConversionsTest { - - @RunWith(JUnit4.class) - public static class RowToRecordTests { - /** - * Checks a value that when converted to Iceberg type is the same value when interpreted in - * Java. - */ - private void checkRowValueToRecordValue( - Schema.FieldType sourceType, Type destType, Object value) { - checkRowValueToRecordValue(sourceType, value, destType, value); - } - - private void checkRowValueToRecordValue( - Schema.FieldType sourceType, Object sourceValue, Type destType, Object destValue) { - Schema beamSchema = Schema.of(Schema.Field.of("v", sourceType)); - Row row = Row.withSchema(beamSchema).addValues(sourceValue).build(); - - org.apache.iceberg.Schema icebergSchema = - new org.apache.iceberg.Schema(required(0, "v", destType)); - Record record = SchemaAndRowConversions.rowToRecord(icebergSchema, row); - - assertThat(record.getField("v"), equalTo(destValue)); - } - - @Test - public void testBoolean() throws Exception { - checkRowValueToRecordValue(Schema.FieldType.BOOLEAN, Types.BooleanType.get(), true); - checkRowValueToRecordValue(Schema.FieldType.BOOLEAN, Types.BooleanType.get(), false); - } - - @Test - public void testInteger() throws Exception { - checkRowValueToRecordValue(Schema.FieldType.INT32, Types.IntegerType.get(), -13); - checkRowValueToRecordValue(Schema.FieldType.INT32, Types.IntegerType.get(), 42); - checkRowValueToRecordValue(Schema.FieldType.INT32, Types.IntegerType.get(), 0); - } - - @Test - public void testLong() throws Exception { - checkRowValueToRecordValue(Schema.FieldType.INT64, Types.LongType.get(), 13L); - checkRowValueToRecordValue(Schema.FieldType.INT64, Types.LongType.get(), 42L); - } - - @Test - public void testFloat() throws Exception { - checkRowValueToRecordValue(Schema.FieldType.FLOAT, Types.FloatType.get(), 3.14159f); - checkRowValueToRecordValue(Schema.FieldType.FLOAT, Types.FloatType.get(), 42.0f); - } - - @Test - public void testDouble() throws Exception { - checkRowValueToRecordValue(Schema.FieldType.DOUBLE, Types.DoubleType.get(), 3.14159); - } - - @Test - public void testDate() throws Exception {} - - @Test - public void testTime() throws Exception {} - - @Test - public void testTimestamp() throws Exception { - DateTime dateTime = - new DateTime().withDate(1979, 03, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); - - checkRowValueToRecordValue( - Schema.FieldType.DATETIME, - dateTime.toInstant(), - Types.TimestampType.withoutZone(), - dateTime.getMillis()); - } - - @Test - public void testFixed() throws Exception {} - - @Test - public void testBinary() throws Exception { - byte[] bytes = new byte[] {1, 2, 3, 4}; - checkRowValueToRecordValue( - Schema.FieldType.BYTES, bytes, Types.BinaryType.get(), 
ByteBuffer.wrap(bytes)); - } - - @Test - public void testDecimal() throws Exception {} - - @Test - public void testStruct() throws Exception {} - - @Test - public void testMap() throws Exception {} - - @Test - public void testList() throws Exception {} - } - - @RunWith(JUnit4.class) - public static class RecordToRowTests { - private void checkRecordValueToRowValue( - Type sourceType, Schema.FieldType destType, Object value) { - checkRecordValueToRowValue(sourceType, value, destType, value); - } - - private void checkRecordValueToRowValue( - Type sourceType, Object sourceValue, Schema.FieldType destType, Object destValue) { - Schema beamSchema = Schema.of(Schema.Field.of("v", destType)); - - org.apache.iceberg.Schema icebergSchema = - new org.apache.iceberg.Schema(required(0, "v", sourceType)); - Record record = GenericRecord.create(icebergSchema); - record.setField("v", sourceValue); - - Row row = SchemaAndRowConversions.recordToRow(beamSchema, record); - - assertThat(row.getBaseValue("v"), equalTo(destValue)); - } - - @Test - public void testBoolean() throws Exception { - checkRecordValueToRowValue(Types.BooleanType.get(), Schema.FieldType.BOOLEAN, true); - checkRecordValueToRowValue(Types.BooleanType.get(), Schema.FieldType.BOOLEAN, false); - } - - @Test - public void testInteger() throws Exception { - checkRecordValueToRowValue(Types.IntegerType.get(), Schema.FieldType.INT32, -13); - checkRecordValueToRowValue(Types.IntegerType.get(), Schema.FieldType.INT32, 42); - checkRecordValueToRowValue(Types.IntegerType.get(), Schema.FieldType.INT32, 0); - } - - @Test - public void testLong() throws Exception { - checkRecordValueToRowValue(Types.LongType.get(), Schema.FieldType.INT64, 13L); - checkRecordValueToRowValue(Types.LongType.get(), Schema.FieldType.INT64, 42L); - } - - @Test - public void testFloat() throws Exception { - checkRecordValueToRowValue(Types.FloatType.get(), Schema.FieldType.FLOAT, 3.14159f); - checkRecordValueToRowValue(Types.FloatType.get(), Schema.FieldType.FLOAT, 42.0f); - } - - @Test - public void testDouble() throws Exception { - checkRecordValueToRowValue(Types.DoubleType.get(), Schema.FieldType.DOUBLE, 3.14159); - } - - @Test - public void testDate() throws Exception {} - - @Test - public void testTime() throws Exception {} - - @Test - public void testTimestamp() throws Exception { - DateTime dateTime = - new DateTime().withDate(1979, 03, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); - - checkRecordValueToRowValue( - Types.TimestampType.withoutZone(), - dateTime.getMillis(), - Schema.FieldType.DATETIME, - dateTime.toInstant()); - } - - @Test - public void testFixed() throws Exception {} - - @Test - public void testBinary() throws Exception { - byte[] bytes = new byte[] {1, 2, 3, 4}; - checkRecordValueToRowValue( - Types.BinaryType.get(), ByteBuffer.wrap(bytes), Schema.FieldType.BYTES, bytes); - } - - @Test - public void testDecimal() throws Exception {} - - @Test - public void testStruct() throws Exception {} - - @Test - public void testMap() throws Exception {} - - @Test - public void testList() throws Exception {} - } - - @RunWith(JUnit4.class) - public static class SchemaTests { - static final Schema BEAM_SCHEMA = - Schema.builder() - .addInt32Field("int") - .addFloatField("float") - .addDoubleField("double") - .addInt64Field("long") - .addStringField("str") - .addBooleanField("bool") - .addByteArrayField("bytes") - .build(); - - static final org.apache.iceberg.Schema ICEBERG_SCHEMA = - new org.apache.iceberg.Schema( - Types.NestedField.required(1, "int", 
Types.IntegerType.get()), - Types.NestedField.required(2, "float", Types.FloatType.get()), - Types.NestedField.required(3, "double", Types.DoubleType.get()), - Types.NestedField.required(4, "long", Types.LongType.get()), - Types.NestedField.required(5, "str", Types.StringType.get()), - Types.NestedField.required(6, "bool", Types.BooleanType.get()), - Types.NestedField.required(7, "bytes", Types.BinaryType.get())); - - @Test - public void testBeamSchemaToIcebergSchema() { - org.apache.iceberg.Schema convertedIcebergSchema = - SchemaAndRowConversions.beamSchemaToIcebergSchema(BEAM_SCHEMA); - - assertTrue(convertedIcebergSchema.sameSchema(ICEBERG_SCHEMA)); - } - - @Test - public void testIcebergSchemaToBeamSchema() { - Schema convertedBeamSchema = - SchemaAndRowConversions.icebergSchemaToBeamSchema(ICEBERG_SCHEMA); - - assertEquals(BEAM_SCHEMA, convertedBeamSchema); - } - } -} diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java index 4048e88398a9..6143bd03491d 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/TestFixtures.java @@ -91,7 +91,7 @@ public static final ImmutableList asRows(Iterable records) { ArrayList rows = new ArrayList<>(); for (Record record : records) { rows.add( - Row.withSchema(SchemaAndRowConversions.icebergSchemaToBeamSchema(SCHEMA)) + Row.withSchema(IcebergUtils.icebergSchemaToBeamSchema(SCHEMA)) .withFieldValue("id", record.getField("id")) .withFieldValue("data", record.getField("data")) .build()); diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java index 100f06d42d07..d6ec9015a95f 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java @@ -436,19 +436,6 @@ public ProcessContinuation processElement( "Creating Kafka consumer for process continuation for {}", kafkaSourceDescriptor.getTopicPartition()); try (Consumer consumer = consumerFactoryFn.apply(updatedConsumerConfig)) { - // Check whether current TopicPartition is still available to read. - Set existingTopicPartitions = new HashSet<>(); - for (List topicPartitionList : consumer.listTopics().values()) { - topicPartitionList.forEach( - partitionInfo -> { - existingTopicPartitions.add( - new TopicPartition(partitionInfo.topic(), partitionInfo.partition())); - }); - } - if (!existingTopicPartitions.contains(kafkaSourceDescriptor.getTopicPartition())) { - return ProcessContinuation.stop(); - } - ConsumerSpEL.evaluateAssign( consumer, ImmutableList.of(kafkaSourceDescriptor.getTopicPartition())); long startOffset = tracker.currentRestriction().getFrom(); @@ -462,6 +449,10 @@ public ProcessContinuation processElement( // When there are no records available for the current TopicPartition, self-checkpoint // and move to process the next element. 
if (rawRecords.isEmpty()) { + if (!topicPartitionExists( + kafkaSourceDescriptor.getTopicPartition(), consumer.listTopics())) { + return ProcessContinuation.stop(); + } if (timestampPolicy != null) { updateWatermarkManually(timestampPolicy, watermarkEstimator, tracker); } @@ -522,6 +513,23 @@ public ProcessContinuation processElement( } } + private boolean topicPartitionExists( + TopicPartition topicPartition, Map> topicListMap) { + // Check if the current TopicPartition still exists. + Set existingTopicPartitions = new HashSet<>(); + for (List topicPartitionList : topicListMap.values()) { + topicPartitionList.forEach( + partitionInfo -> { + existingTopicPartitions.add( + new TopicPartition(partitionInfo.topic(), partitionInfo.partition())); + }); + } + if (!existingTopicPartitions.contains(topicPartition)) { + return false; + } + return true; + } + // see https://github.com/apache/beam/issues/25962 private ConsumerRecords poll( Consumer consumer, TopicPartition topicPartition) { diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java index b8ff08485c3b..6ee3d9d96ef6 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import java.nio.charset.StandardCharsets; import java.time.Duration; import java.util.ArrayList; import java.util.Collection; @@ -57,7 +58,6 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -257,8 +257,8 @@ public synchronized ConsumerRecords poll(Duration timeout) { topicPartition.topic(), topicPartition.partition(), startOffset + i, - key.getBytes(Charsets.UTF_8), - value.getBytes(Charsets.UTF_8))); + key.getBytes(StandardCharsets.UTF_8), + value.getBytes(StandardCharsets.UTF_8))); } if (records.isEmpty()) { return ConsumerRecords.empty(); @@ -515,7 +515,7 @@ public void testProcessElementWithEmptyPoll() throws Exception { public void testProcessElementWhenTopicPartitionIsRemoved() throws Exception { MockMultiOutputReceiver receiver = new MockMultiOutputReceiver(); consumer.setRemoved(); - consumer.setNumOfRecordsPerPoll(10); + consumer.setNumOfRecordsPerPoll(-1); OffsetRangeTracker tracker = new OffsetRangeTracker(new OffsetRange(0L, Long.MAX_VALUE)); ProcessContinuation result = dofnInstance.processElement( diff --git a/sdks/java/io/solace/build.gradle b/sdks/java/io/solace/build.gradle index 7a74236539fb..741db51a5772 100644 --- a/sdks/java/io/solace/build.gradle +++ b/sdks/java/io/solace/build.gradle @@ -34,6 +34,9 @@ dependencies { implementation library.java.joda_time implementation library.java.solace implementation library.java.google_cloud_core + implementation library.java.google_cloud_secret_manager + implementation library.java.proto_google_cloud_secret_manager_v1 + implementation library.java.protobuf_java implementation 
library.java.vendored_guava_32_1_2_jre implementation project(":sdks:java:extensions:avro") implementation library.java.avro diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java index e6b0dd34b184..bb9f0c6ea689 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java @@ -22,6 +22,7 @@ import com.google.auto.value.AutoValue; import com.solacesystems.jcsmp.BytesXMLMessage; +import com.solacesystems.jcsmp.DeliveryMode; import com.solacesystems.jcsmp.Destination; import com.solacesystems.jcsmp.JCSMPFactory; import com.solacesystems.jcsmp.Queue; @@ -31,18 +32,22 @@ import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.io.solace.broker.BasicAuthJcsmpSessionServiceFactory; +import org.apache.beam.sdk.io.solace.broker.GCPSecretSessionServiceFactory; import org.apache.beam.sdk.io.solace.broker.SempClientFactory; import org.apache.beam.sdk.io.solace.broker.SessionService; import org.apache.beam.sdk.io.solace.broker.SessionServiceFactory; import org.apache.beam.sdk.io.solace.data.Solace; import org.apache.beam.sdk.io.solace.data.Solace.SolaceRecordMapper; import org.apache.beam.sdk.io.solace.read.UnboundedSolaceSource; +import org.apache.beam.sdk.io.solace.write.SolaceOutput; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.checkerframework.checker.nullness.qual.Nullable; @@ -194,6 +199,186 @@ *

    For the authentication to the SEMP API ({@link Read#withSempClientFactory(SempClientFactory)}) * the connector provides {@link org.apache.beam.sdk.io.solace.broker.BasicAuthSempClientFactory} to * authenticate using the Basic Authentication. + * + *

    Writing

    + * + *

    To write to Solace, use {@link #write()} with a {@link PCollection}. You can + * also use {@link #write(SerializableFunction)} to specify a format function to convert the input + * type to {@link Solace.Record}. + * + *

    Writing to a static topic or queue

    + * + *

    The connector uses the Solace JCSMP API. + * The clients will write to an SMF + * topic on port 55555 of the host. If you want to use a different port, specify it in the + * host property with the format "X.X.X.X:PORT". + *

    Once you have a {@link PCollection} of {@link Solace.Record}, you can write to Solace using: + * + *

    {@code
    + * PCollection solaceRecs = ...;
    + *
    + * PCollection results =
    + *         solaceRecs.apply(
    + *                 "Write to Solace",
    + *                 SolaceIO.write()
    + *                         .to(SolaceIO.topicFromName("some-topic"))
    + *                         .withSessionServiceFactory(
    + *                            BasicAuthJcsmpSessionServiceFactory.builder()
    + *                              .username("username")
    + *                              .password("password")
    + *                              .host("host:port")
    + *                              .build()));
    + * }
    + * + *

    The above code snippet will write to the VPN named "default", using 4 clients per worker (VM + * in Dataflow), and a maximum of 20 workers/VMs for writing (default values). You can change the + * default VPN name by setting the required JCSMP property in the session factory (in this case, + * with {@link BasicAuthJcsmpSessionServiceFactory#vpnName()}), the number of clients per worker + * with {@link Write#withNumberOfClientsPerWorker(int)} and the number of parallel write clients + * using {@link Write#withMaxNumOfUsedWorkers(int)}. + * + *
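    For example, a minimal sketch (reusing the {@code solaceRecs} collection from the snippet above; the parameter values are purely illustrative) of overriding those parallelism defaults: + *

    {@code
    + * solaceRecs.apply(
    + *         "Write to Solace",
    + *         SolaceIO.write()
    + *                 .to(SolaceIO.topicFromName("some-topic"))
    + *                 .withSessionServiceFactory(...)
    + *                 .withMaxNumOfUsedWorkers(10)
    + *                 .withNumberOfClientsPerWorker(2));
    + * }
    + *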

    Writing to dynamic destinations

    + * + * To write to dynamic destinations, don't set the {@link Write#to(Solace.Queue)} or {@link + * Write#to(Solace.Topic)} property and make sure that all the {@link Solace.Record}s have their + * destination field set to either a topic or a queue. You can do this prior to calling the write + * connector, or by using a format function and {@link #write(SerializableFunction)}. + * + *

    For instance, you can create a function like the following: + * + *

    {@code
    + * // Generate Record with different destinations
    + * SerializableFunction formatFn =
    + *    (MyType msg) -> {
    + *       int queue = ... // some random number
    + *       return Solace.Record.builder()
    + *         .setDestination(Solace.Destination.builder()
    + *                        .setName(String.format("q%d", queue))
    + *                        .setType(Solace.DestinationType.QUEUE)
    + *                        .build())
    + *         .setMessageId(msg.getMessageId())
    + *         .build();
    + * };
    + * }
    + * + * And then use the connector as follows: + * + *
    {@code
    + * // Ignore "to" method to use dynamic destinations
    + * SolaceOutput solaceResponses = msgs.apply("Write to Solace",
    + *   SolaceIO.write(formatFn)
    + *        .withDeliveryMode(DeliveryMode.PERSISTENT)
    + *        .withWriterType(SolaceIO.WriterType.STREAMING)
    + * ...
    + * }
    + * + *

    Direct and persistent messages, and latency metrics

    + * + *

    The connector can write either direct or persistent messages. The default mode is DIRECT. + * + *

    The connector returns a {@link PCollection} of {@link Solace.PublishResult}, which you can use + * to check whether each message was published or rejected. This feedback is only available when + * publishing persistent messages. + *

    If you are publishing persistent messages, you get feedback about whether each message has been + * published, and optionally some publishing latency metrics. If the message has been + * published, {@link Solace.PublishResult#getPublished()} will be true. If it is false, the message + * could not be published, and {@link Solace.PublishResult#getError()} will contain + * more details about why. To get latency metrics along with the + * results, set the property {@link Write#publishLatencyMetrics()}. + *
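    For example, a minimal sketch (the logging shown here is only illustrative) of inspecting the {@code results} collection from the first snippet when writing persistent messages: + *

    {@code
    + * // LOG is assumed to be any SLF4J logger available in your pipeline code
    + * results.apply("Log rejected messages", ParDo.of(
    + *     new DoFn<Solace.PublishResult, Void>() {
    + *       @ProcessElement
    + *       public void process(@Element Solace.PublishResult result) {
    + *         if (!result.getPublished()) {
    + *           LOG.warn("Message rejected: {}", result.getError());
    + *         }
    + *       }
    + *     }));
    + * }
    + *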

    Throughput and latency

    + * + *

    This connector can work in two main modes: low latency or high throughput. The default mode + * favors high throughput over low latency. You can control this behavior with the methods {@link + * Write#withSubmissionMode(SubmissionMode)} and {@link Write#withWriterType(WriterType)}. + *

    The default mode is equivalent to using the following options: + *

    {@code
    + * PCollection solaceRecs = ...;
    + *
    + * PCollection results =
    + *         solaceRecs.apply(
    + *                 "Write to Solace",
    + *                 SolaceIO.write()
    + *                         .to(SolaceIO.topicFromName("some-topic"))
    + *                         .withSessionServiceFactory(
    + *                            BasicAuthJcsmpSessionServiceFactory.builder()
    + *                              .username("username")
    + *                              .password("password")
    + *                              .host("host:port")
    + *                              .build())
    + *                         .withSubmissionMode(SubmissionMode.HIGHER_THROUGHPUT)
    + *                         .withWriterType(WriterType.BATCHED));
    + * }
    + * + *

    {@link SubmissionMode#HIGHER_THROUGHPUT} and {@link WriterType#BATCHED} are the default + * values, and offer the highest possible throughput and the lowest resource usage on the runner + * side (due to the lower backpressure). + *

    In this mode, the connector writes bundles of 50 messages using a bulk publish JCSMP method. This + * increases latency, since a message needs to "wait" until 50 messages have accumulated before + * they are submitted to Solace. + *

    For the lowest latency possible, use {@link SubmissionMode#LOWER_LATENCY} and {@link + * WriterType#STREAMING}. + * + *

    {@code
    + * PCollection results =
    + *         solaceRecs.apply(
    + *                 "Write to Solace",
    + *                 SolaceIO.write()
    + *                         .to(SolaceIO.topicFromName("some-topic"))
    + *                         .withSessionServiceFactory(
    + *                            BasicAuthJcsmpSessionServiceFactory.builder()
    + *                              .username("username")
    + *                              .password("password")
    + *                              .host("host:port")
    + *                              .build())
    + *                         .withSubmissionMode(SubmissionMode.LOWER_LATENCY)
    + *                         .withWriterType(WriterType.STREAMING));
    + * }
    + * + *

    The streaming connector publishes each message individually, without holding back or batching + * messages before they are sent to Solace. This ensures the lowest possible latency, but it + * offers a much lower throughput. The streaming connector does not use state & timers. + *

    Both connectors use state & timers to control the level of parallelism. If you are using + * Cloud Dataflow, it is recommended that you enable Streaming Engine when using this + * connector. + *
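    For example, on Dataflow, Streaming Engine can be enabled through the service's pipeline option flag for Java pipelines (a sketch showing only the relevant flags): + *

    {@code
    + * --runner=DataflowRunner --enableStreamingEngine
    + * }
    + *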

    Authentication

    + * + *

    When writing to Solace, the user must use {@link + * Write#withSessionServiceFactory(SessionServiceFactory)} to create a JCSMP session. + * + *

    See {@link Write#withSessionServiceFactory(SessionServiceFactory)} for session authentication. + * The connector provides an implementation of {@link SessionServiceFactory} using basic + * authentication ({@link BasicAuthJcsmpSessionServiceFactory}), and another implementation that also uses + * basic authentication but with the password stored as a secret in Google Cloud Secret Manager + * ({@link GCPSecretSessionServiceFactory}). + *

    Connector retries

    + * + *

    When the worker using the connector is created, the connector will attempt to connect to + * Solace. + * + *

    If the client cannot connect to Solace for whatever reason, the connector will retry the + * connection using the following strategy. There will be a maximum of 4 retries. The first retry + * is attempted 1 second after the initial connection attempt, and every subsequent retry + * doubles that wait time, up to a maximum of 10 seconds (that is, waits of roughly 1, 2, 4, and 8 seconds). + *

    If after those retries the client is still unable to connect to Solace, the connector will + * attempt to reconnect using the same strategy, repeated for every single incoming message. If, for + * some reason, there is a persistent issue that prevents the connection (e.g. client quota + * exhausted), you will need to stop your job manually, or the connector will keep retrying. + *

    This strategy is applied to all the remote calls sent to Solace, either to connect, pull + * messages, push messages, etc. */ @Internal public class SolaceIO { @@ -209,6 +394,14 @@ public class SolaceIO { }; private static final boolean DEFAULT_DEDUPLICATE_RECORDS = false; + public static final int DEFAULT_WRITER_MAX_NUMBER_OF_WORKERS = 20; + public static final int DEFAULT_WRITER_CLIENTS_PER_WORKER = 4; + public static final Boolean DEFAULT_WRITER_PUBLISH_LATENCY_METRICS = false; + public static final SubmissionMode DEFAULT_WRITER_SUBMISSION_MODE = + SubmissionMode.HIGHER_THROUGHPUT; + public static final DeliveryMode DEFAULT_WRITER_DELIVERY_MODE = DeliveryMode.DIRECT; + public static final WriterType DEFAULT_WRITER_TYPE = WriterType.BATCHED; + /** Get a {@link Topic} object from the topic name. */ static Topic topicFromName(String topicName) { return JCSMPFactory.onlyInstance().createTopic(topicName); @@ -277,13 +470,31 @@ public static Read read( .setDeduplicateRecords(DEFAULT_DEDUPLICATE_RECORDS)); } + /** + * Create a {@link Write} transform, to write to Solace with a custom type. + * + *

    If you are using a custom data class, the format function should return a {@link + * Solace.Record} corresponding to your custom data class instance. + * + *
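For illustration, a hedged sketch of the custom-type variant; MyEvent and toRecord are hypothetical, the factory variable stands for any SessionServiceFactory, and the generic form of write(...) is assumed from the raw signature added in this diff.

    // events is an existing PCollection<MyEvent>.
    events.apply(
        "Write to Solace",
        SolaceIO.<MyEvent>write(event -> toRecord(event)) // toRecord builds a Solace.Record
            .to(SolaceIO.topicFromName("some-topic"))
            .withSessionServiceFactory(factory));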

    If you are using this formatting function with dynamic destinations, you must ensure that + * you set the right value in the destination value of the {@link Solace.Record} messages. + */ + public static Write write(SerializableFunction formatFunction) { + return Write.builder().setFormatFunction(formatFunction).build(); + } + + /** Create a {@link Write} transform, to write to Solace using {@link Solace.Record} objects. */ + public static Write write() { + return Write.builder().build(); + } + public static class Read extends PTransform> { private static final Logger LOG = LoggerFactory.getLogger(Read.class); @VisibleForTesting final Configuration.Builder configurationBuilder; - public Read(Configuration.Builder configurationBuilder) { + private Read(Configuration.Builder configurationBuilder) { this.configurationBuilder = configurationBuilder; } @@ -569,4 +780,232 @@ private Queue initializeQueueForTopicIfNeeded( } } } + + public enum SubmissionMode { + HIGHER_THROUGHPUT, + LOWER_LATENCY + } + + public enum WriterType { + STREAMING, + BATCHED + } + + @AutoValue + public abstract static class Write extends PTransform, SolaceOutput> { + + public static final TupleTag FAILED_PUBLISH_TAG = + new TupleTag() {}; + public static final TupleTag SUCCESSFUL_PUBLISH_TAG = + new TupleTag() {}; + + /** + * Write to a Solace topic. + * + *

    The topic does not need to exist before launching the pipeline. + * + *

    This will write all records to the same topic, ignoring their destination field. + * + *

    Optional. If not specified, the connector will use dynamic destinations based on the + * destination field of {@link Solace.Record}. + */ + public Write to(Solace.Topic topic) { + return toBuilder().setDestination(topicFromName(topic.getName())).build(); + } + + /** + * Write to a Solace queue. + * + *

    The queue must exist prior to launching the pipeline. + * + *

    This will write all records to the same queue, ignoring their destination field. + * + *

    Optional. If not specified, the connector will use dynamic destinations based on the + * destination field of {@link Solace.Record}. + */ + public Write to(Solace.Queue queue) { + return toBuilder().setDestination(queueFromName(queue.getName())).build(); + } + + /** + * The number of workers used by the job to write to Solace. + * + *

    This is optional, the default value is 20. + * + *

    This is the maximum value that the job would use, but depending on the amount of data, the + * actual number of writers may be lower than this value. With the Dataflow runner, the + * connector will use at most this number of VMs in the job (but the job itself may use more + * VMs). + * + *

    Set this number taking into account the limit on the number of clients in your Solace + * cluster, and the need for performance when writing to Solace (more workers will achieve + * higher throughput). + */ + public Write withMaxNumOfUsedWorkers(int maxNumOfUsedWorkers) { + return toBuilder().setMaxNumOfUsedWorkers(maxNumOfUsedWorkers).build(); + } + + /** + * The number of clients that each worker will create. + * + *

    This is optional, the default number is 4. + * + *

    The number of clients is per worker. If there is more than one worker, the number of + * clients will be multiplied by the number of workers. With the Dataflow runner, this will be + * the number of clients created per VM. The clients will be re-used across different threads in + * the same worker. + * + *

    Set this number in combination with {@link #withMaxNumOfUsedWorkers}, to ensure that the + * limit on the number of clients in your Solace cluster is not exceeded. + * + *
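For example (a sketch with assumed values; factory stands for any SessionServiceFactory, and the upper bound on broker connections is the product of the two settings):

    // Up to 10 workers x 4 clients = 40 client connections to the Solace broker.
    solaceRecords.apply(
        "Write to Solace",
        SolaceIO.write()
            .to(SolaceIO.topicFromName("some-topic"))
            .withSessionServiceFactory(factory)
            .withMaxNumOfUsedWorkers(10)
            .withNumberOfClientsPerWorker(4));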

    Normally, using a higher number of clients with fewer workers will achieve better + * throughput at a lower cost, since the workers are better utilized. A good rule of thumb + * is to set as many clients per worker as the worker has vCPUs. + */ + public Write withNumberOfClientsPerWorker(int numberOfClientsPerWorker) { + return toBuilder().setNumberOfClientsPerWorker(numberOfClientsPerWorker).build(); + } + + /** + * Set the delivery mode. This is optional, the default value is DIRECT. + * + *

    For more details, see https://docs.solace.com/API/API-Developer-Guide/Message-Delivery-Modes.htm + */ + public Write withDeliveryMode(DeliveryMode deliveryMode) { + return toBuilder().setDeliveryMode(deliveryMode).build(); + } + + /** + * Publish latency metrics using Beam metrics. + * + *

    Latency metrics are only available if {@link #withDeliveryMode(DeliveryMode)} is set to + * PERSISTENT. In that mode, latency is measured for each single message, as the time difference + * between the message creation and the reception of the publishing confirmation. + * + *

    For the batched writer, the creation time is set for every message in a batch shortly + * before the batch is submitted. So the latency is very close to the actual publishing latency, + * and it does not take into account the time spent waiting for the batch to be submitted. + * + *
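A sketch of turning the metric on (PERSISTENT is the JCSMP DeliveryMode constant; factory and the destination are as in the earlier examples):

    SolaceIO.write()
        .to(SolaceIO.topicFromName("some-topic"))
        .withSessionServiceFactory(factory)
        .withDeliveryMode(DeliveryMode.PERSISTENT) // latency metrics require PERSISTENT
        .publishLatencyMetrics();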

    This is optional, the default value is false (don't publish latency metrics). + */ + public Write publishLatencyMetrics() { + return toBuilder().setPublishLatencyMetrics(true).build(); + } + + /** + * This setting controls the JCSMP property MESSAGE_CALLBACK_ON_REACTOR. Optional. + * + *

    For full details, please check https://docs.solace.com/API/API-Developer-Guide/Java-API-Best-Practices.htm. + * + *

    The Solace JCSMP client libraries can dispatch messages using two different modes: + * + *

    One of the modes dispatches messages directly from the same thread that is doing the rest + * of I/O work. This mode favors lower latency at the cost of lower throughput. Set this to LOWER_LATENCY + * to use that mode (MESSAGE_CALLBACK_ON_REACTOR set to True). + * + *

    The other mode uses a parallel thread to accumulate and dispatch messages. This mode + * favors higher throughput but also has higher latency. Set this to HIGHER_THROUGHPUT to use + * that mode. This is the default mode (MESSAGE_CALLBACK_ON_REACTOR set to False). + * + *

    This is optional, the default value is HIGHER_THROUGHPUT. + */ + public Write withSubmissionMode(SubmissionMode submissionMode) { + return toBuilder().setDispatchMode(submissionMode).build(); + } + + /** + * Set the type of writer used by the connector. Optional. + * + *

    The Solace writer can use the JCSMP API in either streaming or batched mode. + * + *

    In streaming mode, the publishing latency will be lower, but the throughput will also be + * lower. + * + *

    With the batched mode, messages are accumulated until a batch size of 50 is reached, or 5 + * seconds have elapsed since the first message in the batch was received. The 50 messages are + * sent to Solace in a single batch. This writer offers higher throughput but higher publishing + * latency, as messages can be held up for up to 5 seconds until they are published. + * + *

    Notice that this is the message publishing latency, not the end-to-end latency. For very + * large scale pipelines, you will probably prefer to use the HIGHER_THROUGHPUT mode, as with + * lower throughput messages will accumulate in the pipeline, and the end-to-end latency may + * actually be higher. + * + *

    This is optional, the default is the BATCHED writer. + */ + public Write withWriterType(WriterType writerType) { + return toBuilder().setWriterType(writerType).build(); + } + + /** + * Set the provider used to obtain the properties to initialize a new session in the broker. + * + *

    This provider should define the destination host where the broker is listening, and all + * the properties related to authentication (base auth, client certificate, etc.). + */ + public Write withSessionServiceFactory(SessionServiceFactory factory) { + return toBuilder().setSessionServiceFactory(factory).build(); + } + + abstract int getMaxNumOfUsedWorkers(); + + abstract int getNumberOfClientsPerWorker(); + + abstract @Nullable Destination getDestination(); + + abstract DeliveryMode getDeliveryMode(); + + abstract boolean getPublishLatencyMetrics(); + + abstract SubmissionMode getDispatchMode(); + + abstract WriterType getWriterType(); + + abstract @Nullable SerializableFunction getFormatFunction(); + + abstract @Nullable SessionServiceFactory getSessionServiceFactory(); + + static Builder builder() { + return new AutoValue_SolaceIO_Write.Builder() + .setDeliveryMode(DEFAULT_WRITER_DELIVERY_MODE) + .setMaxNumOfUsedWorkers(DEFAULT_WRITER_MAX_NUMBER_OF_WORKERS) + .setNumberOfClientsPerWorker(DEFAULT_WRITER_CLIENTS_PER_WORKER) + .setPublishLatencyMetrics(DEFAULT_WRITER_PUBLISH_LATENCY_METRICS) + .setDispatchMode(DEFAULT_WRITER_SUBMISSION_MODE) + .setWriterType(DEFAULT_WRITER_TYPE); + } + + abstract Builder toBuilder(); + + @AutoValue.Builder + abstract static class Builder { + abstract Builder setMaxNumOfUsedWorkers(int maxNumOfUsedWorkers); + + abstract Builder setNumberOfClientsPerWorker(int numberOfClientsPerWorker); + + abstract Builder setDestination(Destination topicOrQueue); + + abstract Builder setDeliveryMode(DeliveryMode deliveryMode); + + abstract Builder setPublishLatencyMetrics(Boolean publishLatencyMetrics); + + abstract Builder setDispatchMode(SubmissionMode submissionMode); + + abstract Builder setWriterType(WriterType writerType); + + abstract Builder setFormatFunction(SerializableFunction formatFunction); + + abstract Builder setSessionServiceFactory(SessionServiceFactory factory); + + abstract Write build(); + } + + @Override + public SolaceOutput expand(PCollection input) { + // TODO: will be sent in upcoming PR + return SolaceOutput.in(input.getPipeline(), null, null); + } + } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java index 7863dbd129ce..2137d574b09a 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionService.java @@ -39,13 +39,14 @@ *

    This class provides a way to connect to a Solace broker and receive messages from a queue. The * connection is established using basic authentication. */ -public class BasicAuthJcsmpSessionService implements SessionService { +public class BasicAuthJcsmpSessionService extends SessionService { private final String queueName; private final String host; private final String username; private final String password; private final String vpnName; @Nullable private JCSMPSession jcsmpSession; + @Nullable private MessageReceiver messageReceiver; private final RetryCallableManager retryCallableManager = RetryCallableManager.create(); /** @@ -73,12 +74,14 @@ public void connect() { @Override public void close() { - if (isClosed()) { - return; - } retryCallableManager.retryCallable( () -> { - checkStateNotNull(jcsmpSession).closeSession(); + if (messageReceiver != null) { + messageReceiver.close(); + } + if (!isClosed()) { + checkStateNotNull(jcsmpSession).closeSession(); + } return 0; }, ImmutableSet.of(IOException.class)); @@ -86,8 +89,10 @@ public void close() { @Override public MessageReceiver createReceiver() { - return retryCallableManager.retryCallable( - this::createFlowReceiver, ImmutableSet.of(JCSMPException.class)); + this.messageReceiver = + retryCallableManager.retryCallable( + this::createFlowReceiver, ImmutableSet.of(JCSMPException.class)); + return this.messageReceiver; } @Override @@ -137,12 +142,19 @@ private int connectSession() throws JCSMPException { } private JCSMPSession createSessionObject() throws InvalidPropertiesException { - JCSMPProperties properties = new JCSMPProperties(); - properties.setProperty(JCSMPProperties.HOST, host); - properties.setProperty(JCSMPProperties.USERNAME, username); - properties.setProperty(JCSMPProperties.PASSWORD, password); - properties.setProperty(JCSMPProperties.VPN_NAME, vpnName); - + JCSMPProperties properties = initializeSessionProperties(new JCSMPProperties()); return JCSMPFactory.onlyInstance().createSession(properties); } + + @Override + public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProps) { + baseProps.setProperty(JCSMPProperties.VPN_NAME, vpnName); + + baseProps.setProperty( + JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_BASIC); + baseProps.setProperty(JCSMPProperties.USERNAME, username); + baseProps.setProperty(JCSMPProperties.PASSWORD, password); + baseProps.setProperty(JCSMPProperties.HOST, host); + return baseProps; + } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java index 8cb4ff0af053..2084e61b7e38 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/BasicAuthJcsmpSessionServiceFactory.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.solace.broker; +import static org.apache.beam.sdk.io.solace.broker.SessionService.DEFAULT_VPN_NAME; import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import com.google.auto.value.AutoValue; @@ -39,7 +40,7 @@ public abstract class BasicAuthJcsmpSessionServiceFactory extends SessionService public abstract String vpnName(); public static Builder builder() { - return new AutoValue_BasicAuthJcsmpSessionServiceFactory.Builder(); + return new 
AutoValue_BasicAuthJcsmpSessionServiceFactory.Builder().vpnName(DEFAULT_VPN_NAME); } @AutoValue.Builder diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/GCPSecretSessionServiceFactory.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/GCPSecretSessionServiceFactory.java new file mode 100644 index 000000000000..dd87e1d75fa5 --- /dev/null +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/GCPSecretSessionServiceFactory.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.solace.broker; + +import static org.apache.beam.sdk.io.solace.broker.SessionService.DEFAULT_VPN_NAME; + +import com.google.auto.value.AutoValue; +import com.google.cloud.secretmanager.v1.SecretManagerServiceClient; +import com.google.cloud.secretmanager.v1.SecretVersionName; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Optional; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class implements a {@link SessionServiceFactory} that retrieve the basic authentication + * credentials from a Google Cloud Secret Manager secret. + * + *

    It can be used to avoid having to pass the password as an option of your pipeline. For this + * provider to work, the worker where the job runs needs to have the necessary credentials to access + * the secret. In Dataflow, this implies adding the necessary permissions to the worker service + * account. For other runners, set the credentials in the pipeline options using {@link + * org.apache.beam.sdk.extensions.gcp.options.GcpOptions}. + * + *

    It also shows how to implement a {@link SessionServiceFactory} that relies on external + * resources to retrieve the Solace session properties, in this case the Google Cloud Secret + * Manager client. + * + *

    Example of how to create the provider object: + * + *

    {@code
    + * GCPSecretSessionServiceFactory factory =
    + *     GCPSecretSessionServiceFactory.builder()
    + *         .username("user")
    + *         .host("host:port")
    + *         .passwordSecretName("secret-name")
    + *         .build();
    + *
    + * SessionService serviceUsingGCPSecret = factory.create();
    + * }
    + */ +@AutoValue +public abstract class GCPSecretSessionServiceFactory extends SessionServiceFactory { + + private static final Logger LOG = LoggerFactory.getLogger(GCPSecretSessionServiceFactory.class); + + private static final String PROJECT_NOT_FOUND = "PROJECT-NOT-FOUND"; + + public abstract String username(); + + public abstract String host(); + + public abstract String passwordSecretName(); + + public abstract String vpnName(); + + public abstract @Nullable String secretManagerProjectId(); + + public abstract String passwordSecretVersion(); + + public static GCPSecretSessionServiceFactory.Builder builder() { + return new AutoValue_GCPSecretSessionServiceFactory.Builder() + .passwordSecretVersion("latest") + .vpnName(DEFAULT_VPN_NAME); + } + + @AutoValue.Builder + public abstract static class Builder { + + /** Username to be used to authenticate with the broker. */ + public abstract GCPSecretSessionServiceFactory.Builder username(String username); + + /** + * The location of the broker, including port details if it is not listening in the default + * port. + */ + public abstract GCPSecretSessionServiceFactory.Builder host(String host); + + /** The Secret Manager secret name where the password is stored. */ + public abstract GCPSecretSessionServiceFactory.Builder passwordSecretName(String name); + + /** Optional. Solace broker VPN name. If not set, "default" is used. */ + public abstract GCPSecretSessionServiceFactory.Builder vpnName(String name); + + /** + * Optional for Dataflow or VMs running on Google Cloud. The project id of the project where the + * secret is stored. If not set, the project id where the job is running is used. + */ + public abstract GCPSecretSessionServiceFactory.Builder secretManagerProjectId(String id); + + /** Optional. Solace broker password secret version. If not set, "latest" is used. 
*/ + public abstract GCPSecretSessionServiceFactory.Builder passwordSecretVersion(String version); + + public abstract GCPSecretSessionServiceFactory build(); + } + + @Override + public SessionService create() { + String password = null; + try { + password = retrieveSecret(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + BasicAuthJcsmpSessionServiceFactory factory = + BasicAuthJcsmpSessionServiceFactory.builder() + .username(username()) + .host(host()) + .password(password) + .vpnName(vpnName()) + .build(); + + return factory.create(); + } + + private String retrieveSecret() throws IOException { + try (SecretManagerServiceClient client = SecretManagerServiceClient.create()) { + String projectId = + Optional.ofNullable(secretManagerProjectId()).orElse(getProjectIdFromVmMetadata()); + SecretVersionName secretVersionName = + SecretVersionName.of(projectId, passwordSecretName(), passwordSecretVersion()); + return client.accessSecretVersion(secretVersionName).getPayload().getData().toStringUtf8(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private String getProjectIdFromVmMetadata() throws IOException { + URL metadataUrl = + new URL("http://metadata.google.internal/computeMetadata/v1/project/project-id"); + HttpURLConnection connection = (HttpURLConnection) metadataUrl.openConnection(); + connection.setRequestProperty("Metadata-Flavor", "Google"); + + String output; + try (BufferedReader reader = + new BufferedReader( + new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { + output = reader.readLine(); + } + + if (output == null || output.isEmpty()) { + LOG.error( + "Cannot retrieve project id from VM metadata, please set a project id in your GoogleCloudSecretProvider."); + } + return output != null ? output : PROJECT_NOT_FOUND; + } +} diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/MessageReceiver.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/MessageReceiver.java index 199a83e322bd..95f989bd1be9 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/MessageReceiver.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/MessageReceiver.java @@ -49,6 +49,9 @@ public interface MessageReceiver { */ BytesXMLMessage receive() throws IOException; + /** Closes the message receiver. */ + void close(); + /** * Test clients may return {@literal true} to signal that all expected messages have been pulled * and the test may complete. Real clients should always return {@literal false}. diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java index cd368865f0c3..aed700a71ded 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionService.java @@ -17,34 +17,220 @@ */ package org.apache.beam.sdk.io.solace.broker; +import com.solacesystems.jcsmp.JCSMPProperties; import java.io.Serializable; +import org.apache.beam.sdk.io.solace.SolaceIO; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * The SessionService interface provides a set of methods for managing a session with the Solace * messaging system. 
It allows for establishing a connection, creating a message-receiver object, * checking if the connection is closed or not, and gracefully closing the session. + * + *

    Override this class and the method {@link #initializeSessionProperties(JCSMPProperties)} with + * your specific properties, including all those related to authentication. + * + *

    The connector will call the method only once per session created, so you can perform + * relatively heavy operations in that method (e.g. connect to a store or vault to retrieve + * credentials). + * + *

    There are some properties that are set by default and can be overridden in this + * provider. They are relevant for the write connector, and not used by the read + * connector (since they are not necessary for reading): + * + *

      + *
    • VPN_NAME: default + *
    • GENERATE_SEND_TIMESTAMPS: true + *
    • PUB_MULTI_THREAD: true + *
    + * + *

    The connector overrides some other properties regardless of what this provider sends to the + * connector. Those properties are listed below; again, they are only relevant for the + * write connector. + * + *

      + *
    • PUB_ACK_WINDOW_SIZE + *
    • MESSAGE_CALLBACK_ON_REACTOR + *
    + * + * Those properties are set by the connector based on the values of {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Write#withWriterType(SolaceIO.WriterType)} and {@link + * org.apache.beam.sdk.io.solace.SolaceIO.Write#withSubmissionMode(SolaceIO.SubmissionMode)}. + * + *

    The method will always run in a worker thread or task, and not in the driver program. If you + * need to access any resource to set the properties, you need to make sure that the worker has the + * network connectivity required for that, and that any credential or configuration is passed to the + * provider through the constructor. + * + *

    The connector ensures that no two threads will be calling that method at the same time, so you + * don't have to take any specific precautions to avoid race conditions. + * + *

    For basic authentication, use {@link BasicAuthJcsmpSessionService} and {@link + * BasicAuthJcsmpSessionServiceFactory}. + * + *

    For other situations, you need to extend this class. For instance: + * + *

    {@code
    + * public class MySessionService extends SessionService {
    + *   private final String authToken;
    + *
    + *   public MySessionService(String token) {
    + *    this.authToken = token;
    + *    ...
    + *   }
    + *
    + *   {@literal @}Override
    + *   public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProps) {
    + *     baseProps.setProperty(JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_OAUTH2);
    + *     baseProps.setProperty(JCSMPProperties.OAUTH2_ACCESS_TOKEN, authToken);
    + *     return baseProps;
    + *   }
    + *
    + *   {@literal @}Override
    + *   public void connect() {
    + *       ...
    + *   }
    + *
    + *   ...
    + * }
    + * }
    */ -public interface SessionService extends Serializable { +public abstract class SessionService implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(SessionService.class); + + public static final String DEFAULT_VPN_NAME = "default"; + + private static final int STREAMING_PUB_ACK_WINDOW = 50; + private static final int BATCHED_PUB_ACK_WINDOW = 255; /** * Establishes a connection to the service. This could involve providing connection details like * host, port, VPN name, username, and password. */ - void connect(); + public abstract void connect(); /** Gracefully closes the connection to the service. */ - void close(); + public abstract void close(); /** * Checks whether the connection to the service is currently closed. This method is called when an * `UnboundedSolaceReader` is starting to read messages - a session will be created if this * returns true. */ - boolean isClosed(); + public abstract boolean isClosed(); /** * Creates a MessageReceiver object for receiving messages from Solace. Typically, this object is * created from the session instance. */ - MessageReceiver createReceiver(); + public abstract MessageReceiver createReceiver(); + + /** + * Override this method and provide your specific properties, including all those related to + * authentication, and possibly others too. The {@code}baseProperties{@code} parameter sets the + * Solace VPN to "default" if none is specified. + * + *

    You should add your properties to the parameter {@code baseProperties}, and return the + * result. + * + *

    The method will be used whenever the session needs to be created or refreshed. If you are + * setting credentials with expiration, just make sure that the latest available credentials (e.g. + * renewed token) are set when the method is called. + * + *
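For instance, a sketch of an override that re-reads a short-lived token on every call; fetchFreshToken() is a hypothetical helper, and the OAuth property names are the ones used in the class-level example above.

    @Override
    public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties) {
      baseProperties.setProperty(
          JCSMPProperties.AUTHENTICATION_SCHEME, JCSMPProperties.AUTHENTICATION_SCHEME_OAUTH2);
      // Called on every session (re)creation, so the latest token is always picked up.
      baseProperties.setProperty(JCSMPProperties.OAUTH2_ACCESS_TOKEN, fetchFreshToken());
      return baseProperties;
    }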

    For a list of all the properties that can be set, please check the following link: + * + *

    + */ + public abstract JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties); + + /** + * This method will be called by the write connector when a new session is started. + * + *

    This call will happen in the worker, so you need to make sure that the worker has access to + * the resources you need to set the properties. + * + *

    The call will happen only once per session initialization. Typically, that will be when the + * worker and the client are created. But if for any reason the session is lost (e.g. expired auth + * token), this method will be called again. + */ + public final JCSMPProperties initializeWriteSessionProperties(SolaceIO.SubmissionMode mode) { + JCSMPProperties jcsmpProperties = initializeSessionProperties(getDefaultProperties()); + return overrideConnectorProperties(jcsmpProperties, mode); + } + + private static JCSMPProperties getDefaultProperties() { + JCSMPProperties props = new JCSMPProperties(); + props.setProperty(JCSMPProperties.VPN_NAME, DEFAULT_VPN_NAME); + // Outgoing messages will have a sender timestamp field populated + props.setProperty(JCSMPProperties.GENERATE_SEND_TIMESTAMPS, true); + // Make XMLProducer safe to access from several threads. This is the default value, setting + // it just in case. + props.setProperty(JCSMPProperties.PUB_MULTI_THREAD, true); + + return props; + } + + /** + * This method overrides some properties for the broker session to prevent misconfiguration, + * taking into account how the write connector works. + */ + private static JCSMPProperties overrideConnectorProperties( + JCSMPProperties props, SolaceIO.SubmissionMode mode) { + + // PUB_ACK_WINDOW_SIZE heavily affects performance when publishing persistent + // messages. It can be a value between 1 and 255. This is the batch size for the ack + // received from Solace. A value of 1 will have the lowest latency, but a very low + // throughput and a monumental backpressure. + + // This controls how the messages are sent to Solace + if (mode == SolaceIO.SubmissionMode.HIGHER_THROUGHPUT) { + // Create a parallel thread and a queue to send the messages + + Boolean msgCbProp = props.getBooleanProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR); + if (msgCbProp != null && msgCbProp) { + LOG.warn( + "SolaceIO.Write: Overriding MESSAGE_CALLBACK_ON_REACTOR to false since" + + " HIGHER_THROUGHPUT mode was selected"); + } + + props.setProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR, false); + + Integer ackWindowSize = props.getIntegerProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE); + if ((ackWindowSize != null && ackWindowSize != BATCHED_PUB_ACK_WINDOW)) { + LOG.warn( + String.format( + "SolaceIO.Write: Overriding PUB_ACK_WINDOW_SIZE to %d since" + + " HIGHER_THROUGHPUT mode was selected", + BATCHED_PUB_ACK_WINDOW)); + } + props.setProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE, BATCHED_PUB_ACK_WINDOW); + } else { + // Send from the same thread where the produced is being called. This offers the lowest + // latency, but a low throughput too. 
+ Boolean msgCbProp = props.getBooleanProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR); + if (msgCbProp != null && !msgCbProp) { + LOG.warn( + "SolaceIO.Write: Overriding MESSAGE_CALLBACK_ON_REACTOR to true since" + + " LOWER_LATENCY mode was selected"); + } + + props.setProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR, true); + + Integer ackWindowSize = props.getIntegerProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE); + if ((ackWindowSize != null && ackWindowSize != STREAMING_PUB_ACK_WINDOW)) { + LOG.warn( + String.format( + "SolaceIO.Write: Overriding PUB_ACK_WINDOW_SIZE to %d since" + + " LOWER_LATENCY mode was selected", + STREAMING_PUB_ACK_WINDOW)); + } + + props.setProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE, STREAMING_PUB_ACK_WINDOW); + } + return props; + } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java index 7d1dee7a1187..027de2cff134 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SessionServiceFactory.java @@ -26,9 +26,8 @@ * queue property and mandates the implementation of a create() method in concrete subclasses. */ public abstract class SessionServiceFactory implements Serializable { - /** - * A reference to a Queue object. This is set when the pipline is constructed (in the {@link + * A reference to a Queue object. This is set when the pipeline is constructed (in the {@link * org.apache.beam.sdk.io.solace.SolaceIO.Read#expand(org.apache.beam.sdk.values.PBegin)} method). * This could be used to associate the created SessionService with a specific queue for message * handling. diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SolaceMessageReceiver.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SolaceMessageReceiver.java index e5f129d3ddfc..d548d2049a5b 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SolaceMessageReceiver.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/broker/SolaceMessageReceiver.java @@ -69,4 +69,11 @@ public BytesXMLMessage receive() throws IOException { throw new IOException(e); } } + + @Override + public void close() { + if (!isClosed()) { + this.flowReceiver.close(); + } + } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/data/Solace.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/data/Solace.java index 18fee9184446..00b94b5b9ea9 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/data/Solace.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/data/Solace.java @@ -24,6 +24,7 @@ import java.nio.ByteBuffer; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldNumber; import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -118,6 +119,7 @@ public abstract static class Record { * * @return The message ID, or null if not available. */ + @SchemaFieldNumber("0") public abstract @Nullable String getMessageId(); /** @@ -127,6 +129,7 @@ public abstract static class Record { * * @return The message payload. 
*/ + @SchemaFieldNumber("1") public abstract ByteBuffer getPayload(); /** * Gets the destination (topic or queue) to which the message was sent. @@ -135,6 +138,7 @@ public abstract static class Record { * * @return The destination, or null if not available. */ + @SchemaFieldNumber("2") public abstract @Nullable Destination getDestination(); /** @@ -146,6 +150,7 @@ public abstract static class Record { * * @return The expiration timestamp. */ + @SchemaFieldNumber("3") public abstract long getExpiration(); /** @@ -155,6 +160,7 @@ public abstract static class Record { * * @return The message priority. */ + @SchemaFieldNumber("4") public abstract int getPriority(); /** @@ -164,6 +170,7 @@ public abstract static class Record { * * @return True if redelivered, false otherwise. */ + @SchemaFieldNumber("5") public abstract boolean getRedelivered(); /** @@ -173,6 +180,7 @@ public abstract static class Record { * * @return The reply-to destination, or null if not specified. */ + @SchemaFieldNumber("6") public abstract @Nullable Destination getReplyTo(); /** @@ -183,6 +191,7 @@ public abstract static class Record { * * @return The timestamp. */ + @SchemaFieldNumber("7") public abstract long getReceiveTimestamp(); /** @@ -191,6 +200,7 @@ public abstract static class Record { * * @return The sender timestamp, or null if not available. */ + @SchemaFieldNumber("8") public abstract @Nullable Long getSenderTimestamp(); /** @@ -200,6 +210,7 @@ public abstract static class Record { * * @return The sequence number, or null if not available. */ + @SchemaFieldNumber("9") public abstract @Nullable Long getSequenceNumber(); /** @@ -210,6 +221,7 @@ public abstract static class Record { * * @return The time-to-live value. */ + @SchemaFieldNumber("10") public abstract long getTimeToLive(); /** @@ -225,7 +237,9 @@ public abstract static class Record { * * @return The replication group message ID, or null if not present. */ + @SchemaFieldNumber("11") public abstract @Nullable String getReplicationGroupMessageId(); + /** * Gets the attachment data of the message as a ByteString, if any. This might represent files * or other binary content associated with the message. @@ -234,6 +248,7 @@ public abstract static class Record { * * @return The attachment data, or an empty ByteString if no attachment is present. */ + @SchemaFieldNumber("12") public abstract ByteBuffer getAttachmentBytes(); static Builder builder() { @@ -271,6 +286,90 @@ abstract static class Builder { abstract Record build(); } } + + /** + * The result of writing a message to Solace. This will be returned by the {@link + * com.google.cloud.dataflow.dce.io.solace.SolaceIO.Write} connector. + * + *

    This class provides a builder to create instances, but you will probably not need it. The + * write connector will create and return instances of {@link Solace.PublishResult}. + * + *
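A sketch of how a pipeline might consume these results through the SolaceOutput type added in this diff (note that the current Write#expand is still a TODO, so the collections are not yet populated; factory and the destination are as in the earlier examples):

    SolaceOutput result =
        solaceRecords.apply(
            "Write to Solace",
            SolaceIO.write()
                .to(SolaceIO.topicFromName("some-topic"))
                .withSessionServiceFactory(factory));
    PCollection<Solace.PublishResult> failed = result.getFailedPublish();
    PCollection<Solace.PublishResult> succeeded = result.getSuccessfulPublish();
    // Both may be null, e.g. for the streaming writer with DIRECT messages.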

    If the message has been published, {@link Solace.PublishResult#getPublished()} will be true. + * If it is false, it means that the message could not be published, and {@link + * Solace.PublishResult#getError()} will contain more details about why the message could not be + * published. + */ + @AutoValue + @DefaultSchema(AutoValueSchema.class) + public abstract static class PublishResult { + /** The message id of the message that was published. */ + @SchemaFieldNumber("0") + public abstract String getMessageId(); + + /** Whether the message was published or not. */ + @SchemaFieldNumber("1") + public abstract Boolean getPublished(); + + /** + * The publishing latency in milliseconds. This is the difference between the time the message + * was created, and the time the message was published. It is only available if the {@link + * CorrelationKey} class is used as correlation key of the messages. + */ + @SchemaFieldNumber("2") + public abstract @Nullable Long getLatencyMilliseconds(); + + /** The error details if the message could not be published. */ + @SchemaFieldNumber("3") + public abstract @Nullable String getError(); + + public static Builder builder() { + return new AutoValue_Solace_PublishResult.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setMessageId(String messageId); + + public abstract Builder setPublished(Boolean published); + + public abstract Builder setLatencyMilliseconds(Long latencyMs); + + public abstract Builder setError(String error); + + public abstract PublishResult build(); + } + } + + /** + * The correlation key is an object that is passed back to the client during the event broker ack + * or nack. + * + *

    In the streaming writer is optionally used to calculate publish latencies, by calculating + * the time difference between the creation of the correlation key, and the time of the ack. + */ + @AutoValue + @DefaultSchema(AutoValueSchema.class) + public abstract static class CorrelationKey { + @SchemaFieldNumber("0") + public abstract String getMessageId(); + + @SchemaFieldNumber("1") + public abstract long getPublishMonotonicMillis(); + + public static Builder builder() { + return new AutoValue_Solace_CorrelationKey.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setMessageId(String messageId); + + public abstract Builder setPublishMonotonicMillis(long millis); + + public abstract CorrelationKey build(); + } + } + /** * A utility class for mapping {@link BytesXMLMessage} instances to {@link Solace.Record} objects. * This simplifies the process of converting raw Solace messages into a format suitable for use diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/SolaceOutput.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/SolaceOutput.java new file mode 100644 index 000000000000..6c37f879ae7f --- /dev/null +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/SolaceOutput.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.solace.write; + +import java.util.Map; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.solace.SolaceIO; +import org.apache.beam.sdk.io.solace.data.Solace; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** + * The {@link SolaceIO.Write} transform's output return this type, containing both the successful + * publishes ({@link #getSuccessfulPublish()}) and the failed publishes ({@link + * #getFailedPublish()}). + * + *

    The streaming writer with DIRECT messages does not return anything, and the output {@link + * PCollection}s will be equal to null. + */ +public final class SolaceOutput implements POutput { + private final Pipeline pipeline; + private final TupleTag failedPublishTag; + private final TupleTag successfulPublishTag; + private final @Nullable PCollection failedPublish; + private final @Nullable PCollection successfulPublish; + + public @Nullable PCollection getFailedPublish() { + return failedPublish; + } + + public @Nullable PCollection getSuccessfulPublish() { + return successfulPublish; + } + + public static SolaceOutput in( + Pipeline pipeline, + @Nullable PCollection failedPublish, + @Nullable PCollection successfulPublish) { + return new SolaceOutput( + pipeline, + SolaceIO.Write.FAILED_PUBLISH_TAG, + SolaceIO.Write.SUCCESSFUL_PUBLISH_TAG, + failedPublish, + successfulPublish); + } + + private SolaceOutput( + Pipeline pipeline, + TupleTag failedPublishTag, + TupleTag successfulPublishTag, + @Nullable PCollection failedPublish, + @Nullable PCollection successfulPublish) { + this.pipeline = pipeline; + this.failedPublishTag = failedPublishTag; + this.successfulPublishTag = successfulPublishTag; + this.failedPublish = failedPublish; + this.successfulPublish = successfulPublish; + } + + @Override + public Pipeline getPipeline() { + return pipeline; + } + + @Override + public Map, PValue> expand() { + ImmutableMap.Builder, PValue> builder = ImmutableMap., PValue>builder(); + + if (failedPublish != null) { + builder.put(failedPublishTag, failedPublish); + } + + if (successfulPublish != null) { + builder.put(successfulPublishTag, successfulPublish); + } + + return builder.build(); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} +} diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/package-info.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/package-info.java new file mode 100644 index 000000000000..65974b9b29c2 --- /dev/null +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/write/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** SolaceIO Write connector. 
*/ +package org.apache.beam.sdk.io.solace.write; diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java index 285c1cb8a7e8..ec0ae7194686 100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockEmptySessionService.java @@ -17,10 +17,11 @@ */ package org.apache.beam.sdk.io.solace; +import com.solacesystems.jcsmp.JCSMPProperties; import org.apache.beam.sdk.io.solace.broker.MessageReceiver; import org.apache.beam.sdk.io.solace.broker.SessionService; -public class MockEmptySessionService implements SessionService { +public class MockEmptySessionService extends SessionService { String exceptionMessage = "This is an empty client, use a MockSessionService instead."; @@ -43,4 +44,9 @@ public MessageReceiver createReceiver() { public void connect() { throw new UnsupportedOperationException(exceptionMessage); } + + @Override + public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties) { + throw new UnsupportedOperationException(exceptionMessage); + } } diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java index 7b14da138c64..a4d6a42ef302 100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/MockSessionService.java @@ -18,23 +18,35 @@ package org.apache.beam.sdk.io.solace; import com.solacesystems.jcsmp.BytesXMLMessage; +import com.solacesystems.jcsmp.JCSMPProperties; import java.io.IOException; import java.io.Serializable; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.beam.sdk.io.solace.SolaceIO.SubmissionMode; import org.apache.beam.sdk.io.solace.broker.MessageReceiver; import org.apache.beam.sdk.io.solace.broker.SessionService; import org.apache.beam.sdk.transforms.SerializableFunction; +import org.checkerframework.checker.nullness.qual.Nullable; -public class MockSessionService implements SessionService { +public class MockSessionService extends SessionService { private final SerializableFunction getRecordFn; private MessageReceiver messageReceiver = null; private final int minMessagesReceived; + private final @Nullable SubmissionMode mode; public MockSessionService( - SerializableFunction getRecordFn, int minMessagesReceived) { + SerializableFunction getRecordFn, + int minMessagesReceived, + @Nullable SubmissionMode mode) { this.getRecordFn = getRecordFn; this.minMessagesReceived = minMessagesReceived; + this.mode = mode; + } + + public MockSessionService( + SerializableFunction getRecordFn, int minMessagesReceived) { + this(getRecordFn, minMessagesReceived, null); } @Override @@ -80,9 +92,24 @@ public BytesXMLMessage receive() throws IOException { return getRecordFn.apply(counter.getAndIncrement()); } + @Override + public void close() {} + @Override public boolean isEOF() { return counter.get() >= minMessagesReceived; } } + + @Override + public JCSMPProperties initializeSessionProperties(JCSMPProperties baseProperties) { + // Let's override some properties that will be overriden by the connector + // Opposite of the mode, to test that is overriden + baseProperties.setProperty( + JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR, mode == 
SubmissionMode.HIGHER_THROUGHPUT); + + baseProperties.setProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE, 87); + + return baseProperties; + } } diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/broker/BasicAuthWriterSessionTest.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/broker/BasicAuthWriterSessionTest.java new file mode 100644 index 000000000000..e33917641e33 --- /dev/null +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/broker/BasicAuthWriterSessionTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.solace.broker; + +import static org.apache.beam.sdk.io.solace.broker.SessionService.DEFAULT_VPN_NAME; +import static org.junit.Assert.assertEquals; + +import com.solacesystems.jcsmp.JCSMPFactory; +import com.solacesystems.jcsmp.JCSMPProperties; +import com.solacesystems.jcsmp.Queue; +import org.apache.beam.sdk.io.solace.SolaceIO; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class BasicAuthWriterSessionTest { + private final String username = "Some Username"; + private final String password = "Some Password"; + private final String host = "Some Host"; + private final String vpn = "Some non default VPN"; + SessionService withVpn; + SessionService withoutVpn; + + @Before + public void setUp() throws Exception { + Queue q = JCSMPFactory.onlyInstance().createQueue("test-queue"); + + BasicAuthJcsmpSessionServiceFactory factoryWithVpn = + BasicAuthJcsmpSessionServiceFactory.builder() + .username(username) + .password(password) + .host(host) + .vpnName(vpn) + .build(); + factoryWithVpn.setQueue(q); + withVpn = factoryWithVpn.create(); + + BasicAuthJcsmpSessionServiceFactory factoryNoVpn = + BasicAuthJcsmpSessionServiceFactory.builder() + .username(username) + .password(password) + .host(host) + .build(); + factoryNoVpn.setQueue(q); + withoutVpn = factoryNoVpn.create(); + } + + @Test + public void testAuthProperties() { + SolaceIO.SubmissionMode mode = SolaceIO.SubmissionMode.HIGHER_THROUGHPUT; + JCSMPProperties props = withoutVpn.initializeWriteSessionProperties(mode); + assertEquals(username, props.getStringProperty(JCSMPProperties.USERNAME)); + assertEquals(password, props.getStringProperty(JCSMPProperties.PASSWORD)); + assertEquals(host, props.getStringProperty(JCSMPProperties.HOST)); + assertEquals( + JCSMPProperties.AUTHENTICATION_SCHEME_BASIC, + props.getStringProperty(JCSMPProperties.AUTHENTICATION_SCHEME)); + } + + @Test + public void testVpnNames() { + SolaceIO.SubmissionMode mode = SolaceIO.SubmissionMode.LOWER_LATENCY; + JCSMPProperties propsWithoutVpn = withoutVpn.initializeWriteSessionProperties(mode); 
+ assertEquals(DEFAULT_VPN_NAME, propsWithoutVpn.getStringProperty(JCSMPProperties.VPN_NAME)); + JCSMPProperties propsWithVpn = withVpn.initializeWriteSessionProperties(mode); + assertEquals(vpn, propsWithVpn.getStringProperty(JCSMPProperties.VPN_NAME)); + } + + @Test + public void testOverrideWithHigherThroughput() { + SolaceIO.SubmissionMode mode = SolaceIO.SubmissionMode.HIGHER_THROUGHPUT; + JCSMPProperties props = withoutVpn.initializeWriteSessionProperties(mode); + + assertEquals(false, props.getBooleanProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR)); + assertEquals( + Long.valueOf(255), + Long.valueOf(props.getIntegerProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE))); + } + + @Test + public void testOverrideWithLowerLatency() { + SolaceIO.SubmissionMode mode = SolaceIO.SubmissionMode.LOWER_LATENCY; + JCSMPProperties props = withoutVpn.initializeWriteSessionProperties(mode); + assertEquals(true, props.getBooleanProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR)); + assertEquals( + Long.valueOf(50), + Long.valueOf(props.getIntegerProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE))); + } +} diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/broker/OverrideWriterPropertiesTest.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/broker/OverrideWriterPropertiesTest.java new file mode 100644 index 000000000000..0c6f88a7c9d5 --- /dev/null +++ b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/broker/OverrideWriterPropertiesTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.solace.broker; + +import static org.junit.Assert.assertEquals; + +import com.solacesystems.jcsmp.JCSMPProperties; +import org.apache.beam.sdk.io.solace.MockSessionService; +import org.apache.beam.sdk.io.solace.SolaceIO; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class OverrideWriterPropertiesTest { + @Test + public void testOverrideForHigherThroughput() { + SolaceIO.SubmissionMode mode = SolaceIO.SubmissionMode.HIGHER_THROUGHPUT; + MockSessionService service = new MockSessionService(null, 0, mode); + + // Test HIGHER_THROUGHPUT mode + JCSMPProperties props = service.initializeWriteSessionProperties(mode); + assertEquals(false, props.getBooleanProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR)); + assertEquals( + Long.valueOf(255), + Long.valueOf(props.getIntegerProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE))); + } + + @Test + public void testOverrideForLowerLatency() { + SolaceIO.SubmissionMode mode = SolaceIO.SubmissionMode.LOWER_LATENCY; + MockSessionService service = new MockSessionService(null, 0, mode); + + // Test HIGHER_THROUGHPUT mode + JCSMPProperties props = service.initializeWriteSessionProperties(mode); + assertEquals(true, props.getBooleanProperty(JCSMPProperties.MESSAGE_CALLBACK_ON_REACTOR)); + assertEquals( + Long.valueOf(50), + Long.valueOf(props.getIntegerProperty(JCSMPProperties.PUB_ACK_WINDOW_SIZE))); + } +} diff --git a/sdks/java/io/synthetic/src/main/java/org/apache/beam/sdk/io/synthetic/SyntheticStep.java b/sdks/java/io/synthetic/src/main/java/org/apache/beam/sdk/io/synthetic/SyntheticStep.java index d32640ffbf7d..98db23c95a38 100644 --- a/sdks/java/io/synthetic/src/main/java/org/apache/beam/sdk/io/synthetic/SyntheticStep.java +++ b/sdks/java/io/synthetic/src/main/java/org/apache/beam/sdk/io/synthetic/SyntheticStep.java @@ -58,7 +58,7 @@ public class SyntheticStep extends DoFn, KV> private final KV idAndThroughput; private final Counter throttlingCounter = - Metrics.counter("dataflow-throttling-metrics", "throttling-msecs"); + Metrics.counter("dataflow-throttling-metrics", Metrics.THROTTLE_TIME_COUNTER_NAME); /** * Static cache to store one worker level rate limiter for a step. Value in KV is the desired diff --git a/sdks/java/io/thrift/src/main/java/org/apache/beam/sdk/io/thrift/ThriftSchema.java b/sdks/java/io/thrift/src/main/java/org/apache/beam/sdk/io/thrift/ThriftSchema.java index 5ee78590f679..5f4e195f227f 100644 --- a/sdks/java/io/thrift/src/main/java/org/apache/beam/sdk/io/thrift/ThriftSchema.java +++ b/sdks/java/io/thrift/src/main/java/org/apache/beam/sdk/io/thrift/ThriftSchema.java @@ -35,7 +35,7 @@ import java.util.stream.StreamSupport; import org.apache.beam.sdk.schemas.FieldValueGetter; import org.apache.beam.sdk.schemas.FieldValueTypeInformation; -import org.apache.beam.sdk.schemas.GetterBasedSchemaProvider; +import org.apache.beam.sdk.schemas.GetterBasedSchemaProviderV2; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.SchemaProvider; @@ -105,7 +105,7 @@ * not. On decoding, we set all non-{@code null} beam row values to the corresponding thrift fields, * leaving the rest unset. 
*/ -public final class ThriftSchema extends GetterBasedSchemaProvider { +public final class ThriftSchema extends GetterBasedSchemaProviderV2 { private static final ThriftSchema defaultProvider = new ThriftSchema(Collections.emptyMap()); private final Map typedefs; @@ -203,17 +203,19 @@ private Schema.Field beamField(FieldMetaData fieldDescriptor) { @SuppressWarnings("rawtypes") @Override public @NonNull List fieldValueGetters( - @NonNull Class targetClass, @NonNull Schema schema) { - return schemaFieldDescriptors(targetClass, schema).keySet().stream() + @NonNull TypeDescriptor targetTypeDescriptor, @NonNull Schema schema) { + return schemaFieldDescriptors(targetTypeDescriptor.getRawType(), schema).keySet().stream() .map(FieldExtractor::new) .collect(Collectors.toList()); } @Override public @NonNull List fieldValueTypeInformations( - @NonNull Class targetClass, @NonNull Schema schema) { - return schemaFieldDescriptors(targetClass, schema).values().stream() - .map(descriptor -> fieldValueTypeInfo(targetClass, descriptor.fieldName)) + @NonNull TypeDescriptor targetTypeDescriptor, @NonNull Schema schema) { + return schemaFieldDescriptors(targetTypeDescriptor.getRawType(), schema).values().stream() + .map( + descriptor -> + fieldValueTypeInfo(targetTypeDescriptor.getRawType(), descriptor.fieldName)) .collect(Collectors.toList()); } @@ -252,10 +254,11 @@ private FieldValueTypeInformation fieldValueTypeInfo(Class type, String field @Override public @NonNull SchemaUserTypeCreator schemaTypeCreator( - @NonNull Class targetClass, @NonNull Schema schema) { + @NonNull TypeDescriptor targetTypeDescriptor, @NonNull Schema schema) { final Map fieldDescriptors = - schemaFieldDescriptors(targetClass, schema); - return params -> restoreThriftObject(targetClass, fieldDescriptors, params); + schemaFieldDescriptors(targetTypeDescriptor.getRawType(), schema); + return params -> + restoreThriftObject(targetTypeDescriptor.getRawType(), fieldDescriptors, params); } @SuppressWarnings("nullness") diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java index 4983d52a642f..8071bad84d73 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java @@ -17,11 +17,11 @@ */ package org.apache.beam.sdk.tpcds; +import java.nio.charset.StandardCharsets; import java.util.Set; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.SqlNode; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.parser.SqlParseException; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.parser.SqlParser; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Resources; /** @@ -39,7 +39,7 @@ public class QueryReader { */ public static String readQuery(String queryFileName) throws Exception { String path = "queries/" + queryFileName + ".sql"; - return Resources.toString(Resources.getResource(path), Charsets.UTF_8); + return Resources.toString(Resources.getResource(path), StandardCharsets.UTF_8); } /** diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java index 1550a25b7c8f..6efb7e7e0659 100644 --- 
a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java @@ -21,6 +21,7 @@ import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -56,7 +57,6 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.SqlIdentifier; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.util.SqlBasicVisitor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Resources; import org.apache.commons.csv.CSVFormat; @@ -212,7 +212,7 @@ private static PCollection getTableCSV( private static org.apache.avro.Schema getAvroSchema(String tableName) throws IOException { String path = "schemas_avro/" + tableName + ".json"; return new org.apache.avro.Schema.Parser() - .parse(Resources.toString(Resources.getResource(path), Charsets.UTF_8)); + .parse(Resources.toString(Resources.getResource(path), StandardCharsets.UTF_8)); } static org.apache.avro.Schema getProjectedSchema( diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java index 485fa83a4a8e..97116e14cdcd 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java @@ -20,11 +20,11 @@ import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Resources; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.reflect.ClassPath; import org.json.simple.JSONArray; @@ -49,7 +49,7 @@ public class TableSchemaJSONLoader { @SuppressWarnings({"rawtypes", "DefaultCharset"}) public static String parseTableSchema(String tableName) throws Exception { String path = "schemas/" + tableName + ".json"; - String schema = Resources.toString(Resources.getResource(path), Charsets.UTF_8); + String schema = Resources.toString(Resources.getResource(path), StandardCharsets.UTF_8); JSONObject jsonObject = (JSONObject) new JSONParser().parse(schema); JSONArray jsonArray = (JSONArray) jsonObject.get("schema"); diff --git a/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java b/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java index a9ae5360a859..b766d2b13a4b 100644 --- a/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java +++ b/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java @@ -25,12 +25,12 @@ import 
java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.UUID; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; @@ -77,7 +77,7 @@ public void testLauncherInstallsDependencies() throws IOException { try (Writer fout = new OutputStreamWriter( - new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(requirementsFile.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write("pypipackage1\n"); fout.write("pypipackage2\n"); } @@ -118,7 +118,7 @@ public void testLauncherInstallsLocalDependencies() throws IOException { dependency1.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(dependency1.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(dependency1.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write("tempdata\n"); } @@ -128,7 +128,7 @@ public void testLauncherInstallsLocalDependencies() throws IOException { dependency2.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(dependency2.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(dependency2.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write("tempdata\n"); } @@ -140,7 +140,7 @@ public void testLauncherInstallsLocalDependencies() throws IOException { requirementsFile.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(requirementsFile.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write(dependency1.getAbsolutePath() + "\n"); fout.write(dependency2.getAbsolutePath() + "\n"); fout.write("pypipackage" + "\n"); diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py index a4bd0d0a8127..c7ea908a9336 100644 --- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py +++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py @@ -557,6 +557,7 @@ def test_series_tests(self): 'pandas.core.series.Series': ['ser.iloc[0] = 999'], }, not_implemented_ok={ + 'pandas.core.series.Series.case_when': ['*'], 'pandas.core.series.Series.transform': [ # str arg not supported. 
Tested with np.sum in # frames_test.py::DeferredFrameTest::test_groupby_transform_sum diff --git a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py index 7064a5add13c..50b026edf240 100644 --- a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py +++ b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py @@ -61,14 +61,13 @@ MAX_TIMESTAMP = 0x7fffffffffffffff -class ExtractUserAndTimestampDoFn(beam.DoFn): +def extract_user_and_timestamp(element): """Extracts user and timestamp representing a Wikipedia edit.""" - def process(self, element): - table_row = json.loads(element) - if 'contributor_username' in table_row: - user_name = table_row['contributor_username'] - timestamp = table_row['timestamp'] - yield TimestampedValue(user_name, timestamp) + table_row = json.loads(element) + if 'contributor_username' in table_row: + user_name = table_row['contributor_username'] + timestamp = table_row['timestamp'] + return TimestampedValue(user_name, timestamp) class ComputeSessions(beam.PTransform): @@ -98,19 +97,15 @@ def expand(self, pcoll): without_defaults()) -class SessionsToStringsDoFn(beam.DoFn): +def sessions_to_strings(element, window=beam.DoFn.WindowParam): """Adds the session information to be part of the key.""" - def process(self, element, window=beam.DoFn.WindowParam): - yield (element[0] + ' : ' + str(window), element[1]) + return (element[0] + ' : ' + str(window), element[1]) -class FormatOutputDoFn(beam.DoFn): +def format_output(element, window=beam.DoFn.WindowParam): """Formats a string containing the user, count, and session.""" - def process(self, element, window=beam.DoFn.WindowParam): - for kv in element: - session = kv[0] - count = kv[1] - yield session + ' : ' + str(count) + ' : ' + str(window) + for session, count in element: + yield session + ' : ' + str(count) + ' : ' + str(window) class ComputeTopSessions(beam.PTransform): @@ -124,14 +119,13 @@ def __init__(self, sampling_threshold): def expand(self, pcoll): return ( pcoll - | - 'ExtractUserAndTimestamp' >> beam.ParDo(ExtractUserAndTimestampDoFn()) + | 'ExtractUserAndTimestamp' >> beam.Map(extract_user_and_timestamp) | beam.Filter( lambda x: (abs(hash(x)) <= MAX_TIMESTAMP * self.sampling_threshold)) | ComputeSessions() - | 'SessionsToStrings' >> beam.ParDo(SessionsToStringsDoFn()) + | 'SessionsToStrings' >> beam.Map(sessions_to_strings) | TopPerMonth() - | 'FormatOutput' >> beam.ParDo(FormatOutputDoFn())) + | 'FormatOutput' >> beam.FlatMap(format_output)) def run(argv=None): diff --git a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py index 3c171664e45d..92d1d196fe05 100644 --- a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py +++ b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py @@ -28,6 +28,8 @@ from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +# TODO: Unit test top_wikipedia_sessions.extract_user_and_timestamp, etc. 
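[Reviewer sketch, not part of this diff] The top_wikipedia_sessions change above replaces single-method DoFn classes with plain functions passed to beam.Map / beam.FlatMap; such functions can still receive the firing window via a beam.DoFn.WindowParam default argument, which is what sessions_to_strings and format_output rely on. A minimal, self-contained illustration of that pattern; the pipeline, element values, and the tag_with_window helper are hypothetical and not taken from the example:

import apache_beam as beam
from apache_beam.transforms import window


def tag_with_window(element, w=beam.DoFn.WindowParam):
  # Fold the window into the output string, mirroring how
  # sessions_to_strings appends str(window) to the key above.
  return '%s : %s' % (element, w)


with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('user1', 10), ('user2', 130)])
      # Returning TimestampedValue from a Map assigns the element timestamp.
      | beam.Map(lambda kv: window.TimestampedValue(kv[0], kv[1]))
      | beam.WindowInto(window.FixedWindows(60))
      | 'TagWithWindow' >> beam.Map(tag_with_window)
      | beam.Map(print))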
+ class ComputeTopSessionsTest(unittest.TestCase): diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt index 2e65f0fba246..ba1103dd1ef9 100644 --- a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -torch==1.13.1 +torch==2.2.0 numpy==1.22.4 Pillow==10.2.0 \ No newline at end of file diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py index e1a4af31f1c2..3203c21a8e64 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py @@ -765,10 +765,26 @@ def process( GlobalWindows.windowed_value((destination, job_reference))) def finish_bundle(self): + dataset_locations = {} + for windowed_value in self.pending_jobs: + table_ref = bigquery_tools.parse_table_reference(windowed_value.value[0]) + project_dataset = (table_ref.projectId, table_ref.datasetId) + job_ref = windowed_value.value[1] + # In some cases (e.g. when the load job op returns a 409 ALREADY_EXISTS), + # the returned job reference may not include a location. In such cases, + # we need to override with the dataset's location. + job_location = job_ref.location + if not job_location and project_dataset not in dataset_locations: + job_location = self.bq_wrapper.get_table_location( + table_ref.projectId, table_ref.datasetId, table_ref.tableId) + dataset_locations[project_dataset] = job_location + self.bq_wrapper.wait_for_bq_job( - job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS) + job_ref, + sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS, + location=job_location) return self.pending_jobs diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py index 0605206714ed..f27c7899f9f3 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py @@ -426,6 +426,7 @@ def test_records_traverse_transform_with_mocks(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = bigquery_api.Job() result_job.jobReference = job_reference @@ -481,6 +482,7 @@ def test_load_job_id_used(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'loadJobProject' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = bigquery_api.Job() result_job.jobReference = job_reference @@ -515,6 +517,7 @@ def test_load_job_id_use_for_copy_job(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'loadJobProject' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = mock.Mock() result_job.jobReference = job_reference @@ -567,10 +570,12 @@ def test_wait_for_load_job_completion(self, sleep_mock): job_1.jobReference = bigquery_api.JobReference() job_1.jobReference.projectId = 'project1' job_1.jobReference.jobId = 'jobId1' + job_1.jobReference.location = 'US' job_2 = bigquery_api.Job() job_2.jobReference = bigquery_api.JobReference() job_2.jobReference.projectId = 'project1' job_2.jobReference.jobId = 
'jobId2' + job_2.jobReference.location = 'US' job_1_waiting = mock.Mock() job_1_waiting.status.state = 'RUNNING' @@ -610,10 +615,12 @@ def test_one_load_job_failed_after_waiting(self, sleep_mock): job_1.jobReference = bigquery_api.JobReference() job_1.jobReference.projectId = 'project1' job_1.jobReference.jobId = 'jobId1' + job_1.jobReference.location = 'US' job_2 = bigquery_api.Job() job_2.jobReference = bigquery_api.JobReference() job_2.jobReference.projectId = 'project1' job_2.jobReference.jobId = 'jobId2' + job_2.jobReference.location = 'US' job_1_waiting = mock.Mock() job_1_waiting.status.state = 'RUNNING' @@ -650,6 +657,7 @@ def test_multiple_partition_files(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = mock.Mock() result_job.jobReference = job_reference @@ -732,6 +740,7 @@ def test_multiple_partition_files_write_dispositions( job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = mock.Mock() result_job.jobReference = job_reference @@ -774,6 +783,7 @@ def test_triggering_frequency(self, is_streaming, with_auto_sharding): job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = bigquery_api.Job() result_job.jobReference = job_reference diff --git a/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py b/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py index ce49cd0161df..f3881ed261ae 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py @@ -24,7 +24,7 @@ import decimal import json import logging -import random +import secrets import time import uuid from typing import TYPE_CHECKING @@ -212,7 +212,7 @@ def __init__( self._source_uuid = unique_id self.kms_key = kms_key self.project = project - self.temp_dataset = temp_dataset or 'bq_read_all_%s' % uuid.uuid4().hex + self.temp_dataset = temp_dataset self.query_priority = query_priority self.bq_io_metadata = None @@ -226,22 +226,27 @@ def display_data(self): 'temp_dataset': str(self.temp_dataset) } - def _get_temp_dataset(self): - if isinstance(self.temp_dataset, str): - return DatasetReference( - datasetId=self.temp_dataset, projectId=self._get_project()) - else: + def _get_temp_dataset_id(self): + if self.temp_dataset is None: + return None + elif isinstance(self.temp_dataset, DatasetReference): + return self.temp_dataset.datasetId + elif isinstance(self.temp_dataset, str): return self.temp_dataset + else: + raise ValueError("temp_dataset has to be either str or DatasetReference") - def process(self, - element: 'ReadFromBigQueryRequest') -> Iterable[BoundedSource]: - bq = bigquery_tools.BigQueryWrapper( - temp_dataset_id=self._get_temp_dataset().datasetId, + def start_bundle(self): + self.bq = bigquery_tools.BigQueryWrapper( + temp_dataset_id=self._get_temp_dataset_id(), client=bigquery_tools.BigQueryWrapper._bigquery_client(self.options)) + def process(self, + element: 'ReadFromBigQueryRequest') -> Iterable[BoundedSource]: if element.query is not None: - self._setup_temporary_dataset(bq, element) - table_reference = self._execute_query(bq, element) + if not self.bq.created_temp_dataset: + self._setup_temporary_dataset(self.bq, element) + table_reference = self._execute_query(self.bq, element) else: assert element.table 
table_reference = bigquery_tools.parse_table_reference( @@ -250,19 +255,21 @@ def process(self, if not table_reference.projectId: table_reference.projectId = self._get_project() - schema, metadata_list = self._export_files(bq, element, table_reference) + schema, metadata_list = self._export_files( + self.bq, element, table_reference) for metadata in metadata_list: yield self._create_source(metadata.path, schema) if element.query is not None: - bq._delete_table( + self.bq._delete_table( table_reference.projectId, table_reference.datasetId, table_reference.tableId) - if bq.created_temp_dataset: - self._clean_temporary_dataset(bq, element) + def finish_bundle(self): + if self.bq.created_temp_dataset: + self.bq.clean_up_temporary_dataset(self._get_project()) def _get_bq_metadata(self): if not self.bq_io_metadata: @@ -288,12 +295,6 @@ def _setup_temporary_dataset( self._get_project(), element.query, not element.use_standard_sql) bq.create_temporary_dataset(self._get_project(), location) - def _clean_temporary_dataset( - self, - bq: bigquery_tools.BigQueryWrapper, - element: 'ReadFromBigQueryRequest'): - bq.clean_up_temporary_dataset(self._get_project()) - def _execute_query( self, bq: bigquery_tools.BigQueryWrapper, @@ -302,7 +303,7 @@ def _execute_query( self._job_name, self._source_uuid, bigquery_tools.BigQueryJobTypes.QUERY, - '%s_%s' % (int(time.time()), random.randint(0, 1000))) + '%s_%s' % (int(time.time()), secrets.token_hex(3))) job = bq._start_query_job( self._get_project(), element.query, diff --git a/sdks/python/apache_beam/io/gcp/bigquery_read_it_test.py b/sdks/python/apache_beam/io/gcp/bigquery_read_it_test.py index d56a4c764715..913d6e078d89 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_read_it_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_read_it_test.py @@ -109,11 +109,11 @@ def tearDownClass(cls): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=cls.project, datasetId=cls.dataset_id, deleteContents=True) try: - _LOGGER.info( + _LOGGER.debug( "Deleting dataset %s in project %s", cls.dataset_id, cls.project) cls.bigquery_client.client.datasets.Delete(request) except HttpError: - _LOGGER.debug( + _LOGGER.warning( 'Failed to clean up dataset %s in project %s', cls.dataset_id, cls.project) diff --git a/sdks/python/apache_beam/io/gcp/bigquery_tools.py b/sdks/python/apache_beam/io/gcp/bigquery_tools.py index a92f30ec35ce..c7128e7899ec 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_tools.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_tools.py @@ -631,7 +631,8 @@ def _start_query_job( return self._start_job(request) - def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): + def wait_for_bq_job( + self, job_reference, sleep_duration_sec=5, max_retries=0, location=None): """Poll job until it is DONE. Args: @@ -639,6 +640,7 @@ def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): sleep_duration_sec: Specifies the delay in seconds between retries. max_retries: The total number of times to retry. If equals to 0, the function waits forever. + location: Fall back on this location if job_reference doesn't have one. 
Raises: `RuntimeError`: If the job is FAILED or the number of retries has been @@ -648,7 +650,9 @@ def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): while True: retry += 1 job = self.get_job( - job_reference.projectId, job_reference.jobId, job_reference.location) + job_reference.projectId, + job_reference.jobId, + job_reference.location or location) _LOGGER.info('Job %s status: %s', job.id, job.status.state) if job.status.state == 'DONE' and job.status.errorResult: raise RuntimeError( diff --git a/sdks/python/apache_beam/metrics/cells.pxd b/sdks/python/apache_beam/metrics/cells.pxd index 0eaa890c02ac..a8f4003d8980 100644 --- a/sdks/python/apache_beam/metrics/cells.pxd +++ b/sdks/python/apache_beam/metrics/cells.pxd @@ -44,6 +44,12 @@ cdef class GaugeCell(MetricCell): cdef readonly object data +cdef class StringSetCell(MetricCell): + cdef readonly set data + + cdef inline bint _update(self, value) except -1 + + cdef class DistributionData(object): cdef readonly libc.stdint.int64_t sum cdef readonly libc.stdint.int64_t count diff --git a/sdks/python/apache_beam/metrics/cells.py b/sdks/python/apache_beam/metrics/cells.py index 3bfbfc6b2e77..407106342fb8 100644 --- a/sdks/python/apache_beam/metrics/cells.py +++ b/sdks/python/apache_beam/metrics/cells.py @@ -266,6 +266,62 @@ def to_runner_api_monitoring_info_impl(self, name, transform_id): ptransform=transform_id) +class StringSetCell(MetricCell): + """For internal use only; no backwards-compatibility guarantees. + + Tracks the current value for a StringSet metric. + + Each cell tracks the state of a metric independently per context per bundle. + Therefore, each metric has a different cell in each bundle, that is later + aggregated. + + This class is thread safe. + """ + def __init__(self, *args): + super().__init__(*args) + self.data = StringSetAggregator.identity_element() + + def add(self, value): + self.update(value) + + def update(self, value): + # type: (str) -> None + if cython.compiled: + # We will hold the GIL throughout the entire _update. 
+ self._update(value) + else: + with self._lock: + self._update(value) + + def _update(self, value): + self.data.add(value) + + def get_cumulative(self): + # type: () -> set + with self._lock: + return set(self.data) + + def combine(self, other): + # type: (StringSetCell) -> StringSetCell + combined = StringSetAggregator().combine(self.data, other.data) + result = StringSetCell() + result.data = combined + return result + + def to_runner_api_monitoring_info_impl(self, name, transform_id): + from apache_beam.metrics import monitoring_infos + + return monitoring_infos.user_set_string( + name.namespace, + name.name, + self.get_cumulative(), + ptransform=transform_id) + + def reset(self): + # type: () -> None + self.data = StringSetAggregator.identity_element() + + class DistributionResult(object): """The result of a Distribution metric.""" def __init__(self, data): @@ -551,3 +607,22 @@ def combine(self, x, y): def result(self, x): # type: (GaugeData) -> GaugeResult return GaugeResult(x.get_cumulative()) + + +class StringSetAggregator(MetricAggregator): + @staticmethod + def identity_element(): + # type: () -> set + return set() + + def combine(self, x, y): + # type: (set, set) -> set + if len(x) == 0: + return y + elif len(y) == 0: + return x + else: + return set.union(x, y) + + def result(self, x): + return x diff --git a/sdks/python/apache_beam/metrics/cells_test.py b/sdks/python/apache_beam/metrics/cells_test.py index 3d4d81c3d12b..052ff051bf96 100644 --- a/sdks/python/apache_beam/metrics/cells_test.py +++ b/sdks/python/apache_beam/metrics/cells_test.py @@ -25,6 +25,7 @@ from apache_beam.metrics.cells import DistributionData from apache_beam.metrics.cells import GaugeCell from apache_beam.metrics.cells import GaugeData +from apache_beam.metrics.cells import StringSetCell from apache_beam.metrics.metricbase import MetricName @@ -169,5 +170,28 @@ def test_start_time_set(self): self.assertGreater(mi.start_time.seconds, 0) +class TestStringSetCell(unittest.TestCase): + def test_not_leak_mutable_set(self): + c = StringSetCell() + c.add('test') + c.add('another') + s = c.get_cumulative() + self.assertEqual(s, set(('test', 'another'))) + s.add('yet another') + self.assertEqual(c.get_cumulative(), set(('test', 'another'))) + + def test_combine_appropriately(self): + s1 = StringSetCell() + s1.add('1') + s1.add('2') + + s2 = StringSetCell() + s2.add('1') + s2.add('3') + + result = s2.combine(s1) + self.assertEqual(result.data, set(('1', '2', '3'))) + + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/metrics/execution.py b/sdks/python/apache_beam/metrics/execution.py index 570062371cae..37007add9163 100644 --- a/sdks/python/apache_beam/metrics/execution.py +++ b/sdks/python/apache_beam/metrics/execution.py @@ -46,6 +46,7 @@ from apache_beam.metrics.cells import CounterCell from apache_beam.metrics.cells import DistributionCell from apache_beam.metrics.cells import GaugeCell +from apache_beam.metrics.cells import StringSetCell from apache_beam.runners.worker import statesampler from apache_beam.runners.worker.statesampler import get_current_tracker @@ -257,6 +258,12 @@ def get_gauge(self, metric_name): GaugeCell, self.get_metric_cell(_TypedMetricName(GaugeCell, metric_name))) + def get_string_set(self, metric_name): + # type: (MetricName) -> StringSetCell + return cast( + StringSetCell, + self.get_metric_cell(_TypedMetricName(StringSetCell, metric_name))) + def get_metric_cell(self, typed_metric_name): # type: (_TypedMetricName) -> MetricCell cell = 
self.metrics.get(typed_metric_name, None) @@ -290,7 +297,13 @@ def get_cumulative(self): v in self.metrics.items() if k.cell_type == GaugeCell } - return MetricUpdates(counters, distributions, gauges) + string_sets = { + MetricKey(self.step_name, k.metric_name): v.get_cumulative() + for k, + v in self.metrics.items() if k.cell_type == StringSetCell + } + + return MetricUpdates(counters, distributions, gauges, string_sets) def to_runner_api(self): return [ @@ -342,7 +355,8 @@ def __init__( self, counters=None, # type: Optional[Dict[MetricKey, int]] distributions=None, # type: Optional[Dict[MetricKey, DistributionData]] - gauges=None # type: Optional[Dict[MetricKey, GaugeData]] + gauges=None, # type: Optional[Dict[MetricKey, GaugeData]] + string_sets=None, # type: Optional[Dict[MetricKey, set]] ): # type: (...) -> None @@ -352,7 +366,9 @@ def __init__( counters: Dictionary of MetricKey:MetricUpdate updates. distributions: Dictionary of MetricKey:MetricUpdate objects. gauges: Dictionary of MetricKey:MetricUpdate objects. + string_sets: Dictionary of MetricKey:MetricUpdate objects. """ self.counters = counters or {} self.distributions = distributions or {} self.gauges = gauges or {} + self.string_sets = string_sets or {} diff --git a/sdks/python/apache_beam/metrics/execution_test.py b/sdks/python/apache_beam/metrics/execution_test.py index a888376e7091..b157aeb20e9e 100644 --- a/sdks/python/apache_beam/metrics/execution_test.py +++ b/sdks/python/apache_beam/metrics/execution_test.py @@ -17,6 +17,7 @@ # pytype: skip-file +import functools import unittest from apache_beam.metrics.execution import MetricKey @@ -88,10 +89,12 @@ def test_get_cumulative_or_updates(self): distribution = mc.get_distribution( MetricName('namespace', 'name{}'.format(i))) gauge = mc.get_gauge(MetricName('namespace', 'name{}'.format(i))) + str_set = mc.get_string_set(MetricName('namespace', 'name{}'.format(i))) counter.inc(i) distribution.update(i) gauge.set(i) + str_set.add(str(i % 7)) all_values.append(i) # Retrieve ALL updates. @@ -99,6 +102,7 @@ def test_get_cumulative_or_updates(self): self.assertEqual(len(cumulative.counters), 10) self.assertEqual(len(cumulative.distributions), 10) self.assertEqual(len(cumulative.gauges), 10) + self.assertEqual(len(cumulative.string_sets), 10) self.assertEqual( set(all_values), {v @@ -106,6 +110,11 @@ def test_get_cumulative_or_updates(self): self.assertEqual( set(all_values), {v.value for _, v in cumulative.gauges.items()}) + self.assertEqual({str(i % 7) + for i in all_values}, + functools.reduce( + set.union, + (v for _, v in cumulative.string_sets.items()))) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/metrics/metric.py b/sdks/python/apache_beam/metrics/metric.py index 3722af6dc17a..77cafb8bd64b 100644 --- a/sdks/python/apache_beam/metrics/metric.py +++ b/sdks/python/apache_beam/metrics/metric.py @@ -44,6 +44,7 @@ from apache_beam.metrics.metricbase import Distribution from apache_beam.metrics.metricbase import Gauge from apache_beam.metrics.metricbase import MetricName +from apache_beam.metrics.metricbase import StringSet if TYPE_CHECKING: from apache_beam.metrics.execution import MetricKey @@ -115,6 +116,23 @@ def gauge( namespace = Metrics.get_namespace(namespace) return Metrics.DelegatingGauge(MetricName(namespace, name)) + @staticmethod + def string_set( + namespace: Union[Type, str], name: str) -> 'Metrics.DelegatingStringSet': + """Obtains or creates a String set metric. + + String set metrics are restricted to string values. 
+ + Args: + namespace: A class or string that gives the namespace to a metric + name: A string that gives a unique name to a metric + + Returns: + A StringSet object. + """ + namespace = Metrics.get_namespace(namespace) + return Metrics.DelegatingStringSet(MetricName(namespace, name)) + class DelegatingCounter(Counter): """Metrics Counter that Delegates functionality to MetricsEnvironment.""" def __init__( @@ -138,11 +156,18 @@ def __init__(self, metric_name: MetricName) -> None: super().__init__(metric_name) self.set = MetricUpdater(cells.GaugeCell, metric_name) # type: ignore[assignment] + class DelegatingStringSet(StringSet): + """Metrics StringSet that Delegates functionality to MetricsEnvironment.""" + def __init__(self, metric_name: MetricName) -> None: + super().__init__(metric_name) + self.add = MetricUpdater(cells.StringSetCell, metric_name) # type: ignore[assignment] + class MetricResults(object): COUNTERS = "counters" DISTRIBUTIONS = "distributions" GAUGES = "gauges" + STRINGSETS = "string_sets" @staticmethod def _matches_name(filter: 'MetricsFilter', metric_key: 'MetricKey') -> bool: @@ -207,11 +232,13 @@ def query( { "counters": [MetricResult(counter_key, committed, attempted), ...], "distributions": [MetricResult(dist_key, committed, attempted), ...], - "gauges": [] // Empty list if nothing matched the filter. + "gauges": [], // Empty list if nothing matched the filter. + "string_sets": [MetricResult(string_set_key, committed, attempted), + ...] } The committed / attempted values are DistributionResult / GaugeResult / int - objects. + / set objects. """ raise NotImplementedError diff --git a/sdks/python/apache_beam/metrics/metricbase.py b/sdks/python/apache_beam/metrics/metricbase.py index 53da01f3955c..7819dbb093a5 100644 --- a/sdks/python/apache_beam/metrics/metricbase.py +++ b/sdks/python/apache_beam/metrics/metricbase.py @@ -38,7 +38,13 @@ from typing import Optional __all__ = [ - 'Metric', 'Counter', 'Distribution', 'Gauge', 'Histogram', 'MetricName' + 'Metric', + 'Counter', + 'Distribution', + 'Gauge', + 'StringSet', + 'Histogram', + 'MetricName' ] @@ -138,6 +144,14 @@ def set(self, value): raise NotImplementedError + +class StringSet(Metric): + """StringSet Metric interface. + + Reports a set of unique string values during pipeline execution.""" + def add(self, value): + raise NotImplementedError + + class Histogram(Metric): """Histogram Metric interface.
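[Reviewer sketch, not part of this diff] For context on the user-facing API introduced above: Metrics.string_set(namespace, name) returns a DelegatingStringSet whose add() records distinct string values, and results surface under the MetricResults.STRINGSETS ("string_sets") key of query(). A minimal sketch under those assumptions; the DoFn, its element shape, and the metric names are hypothetical:

import apache_beam as beam
from apache_beam.metrics.metric import Metrics


class RecordVersionsFn(beam.DoFn):
  def __init__(self):
    # Records the set of distinct 'version' strings seen across elements.
    self.versions = Metrics.string_set(self.__class__, 'seen_versions')

  def process(self, element):
    # element is assumed to be a dict such as {'version': '2.57.0', ...}.
    self.versions.add(element.get('version', 'unknown'))
    yield element

After the pipeline runs, the values would be read back through the new key, e.g. result.metrics().query(filter)[MetricResults.STRINGSETS], where each committed/attempted value is a Python set.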
diff --git a/sdks/python/apache_beam/metrics/monitoring_infos.py b/sdks/python/apache_beam/metrics/monitoring_infos.py index 0e638c9eb4fe..a9540f2846ad 100644 --- a/sdks/python/apache_beam/metrics/monitoring_infos.py +++ b/sdks/python/apache_beam/metrics/monitoring_infos.py @@ -48,8 +48,13 @@ USER_DISTRIBUTION_URN = ( common_urns.monitoring_info_specs.USER_DISTRIBUTION_INT64.spec.urn) USER_GAUGE_URN = common_urns.monitoring_info_specs.USER_LATEST_INT64.spec.urn -USER_METRIC_URNS = set( - [USER_COUNTER_URN, USER_DISTRIBUTION_URN, USER_GAUGE_URN]) +USER_STRING_SET_URN = common_urns.monitoring_info_specs.USER_SET_STRING.spec.urn +USER_METRIC_URNS = set([ + USER_COUNTER_URN, + USER_DISTRIBUTION_URN, + USER_GAUGE_URN, + USER_STRING_SET_URN +]) WORK_REMAINING_URN = common_urns.monitoring_info_specs.WORK_REMAINING.spec.urn WORK_COMPLETED_URN = common_urns.monitoring_info_specs.WORK_COMPLETED.spec.urn DATA_CHANNEL_READ_INDEX = ( @@ -65,10 +70,12 @@ common_urns.monitoring_info_types.DISTRIBUTION_INT64_TYPE.urn) LATEST_INT64_TYPE = common_urns.monitoring_info_types.LATEST_INT64_TYPE.urn PROGRESS_TYPE = common_urns.monitoring_info_types.PROGRESS_TYPE.urn +STRING_SET_TYPE = common_urns.monitoring_info_types.SET_STRING_TYPE.urn COUNTER_TYPES = set([SUM_INT64_TYPE]) DISTRIBUTION_TYPES = set([DISTRIBUTION_INT64_TYPE]) GAUGE_TYPES = set([LATEST_INT64_TYPE]) +STRING_SET_TYPES = set([STRING_SET_TYPE]) # TODO(migryz) extract values from beam_fn_api.proto::MonitoringInfoLabels PCOLLECTION_LABEL = ( @@ -147,6 +154,14 @@ def extract_distribution(monitoring_info_proto): coders.VarIntCoder(), monitoring_info_proto.payload) +def extract_string_set_value(monitoring_info_proto): + if not is_string_set(monitoring_info_proto): + raise ValueError('Unsupported type %s' % monitoring_info_proto.type) + + coder = coders.IterableCoder(coders.StrUtf8Coder()) + return set(coder.decode(monitoring_info_proto.payload)) + + def create_labels(ptransform=None, namespace=None, name=None, pcollection=None): """Create the label dictionary based on the provided values. @@ -241,8 +256,8 @@ def int64_user_gauge(namespace, name, metric, ptransform=None): """Return the gauge monitoring info for the URN, metric and labels. Args: - namespace: User-defined namespace of counter. - name: Name of counter. + namespace: User-defined namespace of gauge metric. + name: Name of gauge metric. metric: The GaugeData containing the metrics. ptransform: The ptransform id used as a label. """ @@ -284,6 +299,24 @@ def int64_gauge(urn, metric, ptransform=None): return create_monitoring_info(urn, LATEST_INT64_TYPE, payload, labels) +def user_set_string(namespace, name, metric, ptransform=None): + """Return the string set monitoring info for the URN, metric and labels. + + Args: + namespace: User-defined namespace of StringSet. + name: Name of StringSet. + metric: The set representing the metrics. + ptransform: The ptransform id used as a label. + """ + labels = create_labels(ptransform=ptransform, namespace=namespace, name=name) + if isinstance(metric, set): + metric = list(metric) + if isinstance(metric, list): + metric = coders.IterableCoder(coders.StrUtf8Coder()).encode(metric) + return create_monitoring_info( + USER_STRING_SET_URN, STRING_SET_TYPE, metric, labels) + + def create_monitoring_info(urn, type_urn, payload, labels=None): # type: (...) 
-> metrics_pb2.MonitoringInfo @@ -320,15 +353,21 @@ def is_distribution(monitoring_info_proto): return monitoring_info_proto.type in DISTRIBUTION_TYPES +def is_string_set(monitoring_info_proto): + """Returns true if the monitoring info is a StringSet metric.""" + return monitoring_info_proto.type in STRING_SET_TYPES + + def is_user_monitoring_info(monitoring_info_proto): """Returns true if the monitoring info is a user metric.""" return monitoring_info_proto.urn in USER_METRIC_URNS def extract_metric_result_map_value(monitoring_info_proto): - # type: (...) -> Union[None, int, DistributionResult, GaugeResult] + # type: (...) -> Union[None, int, DistributionResult, GaugeResult, set] - """Returns the relevant GaugeResult, DistributionResult or int value. + """Returns the relevant GaugeResult, DistributionResult or int value for + counter metric, set for StringSet metric. These are the proper format for use in the MetricResult.query() result. """ @@ -342,6 +381,8 @@ def extract_metric_result_map_value(monitoring_info_proto): if is_gauge(monitoring_info_proto): (timestamp, value) = extract_gauge_value(monitoring_info_proto) return GaugeResult(GaugeData(value, timestamp)) + if is_string_set(monitoring_info_proto): + return extract_string_set_value(monitoring_info_proto) return None diff --git a/sdks/python/apache_beam/metrics/monitoring_infos_test.py b/sdks/python/apache_beam/metrics/monitoring_infos_test.py index d19e8bc10df1..022943f417c2 100644 --- a/sdks/python/apache_beam/metrics/monitoring_infos_test.py +++ b/sdks/python/apache_beam/metrics/monitoring_infos_test.py @@ -21,6 +21,7 @@ from apache_beam.metrics import monitoring_infos from apache_beam.metrics.cells import CounterCell from apache_beam.metrics.cells import GaugeCell +from apache_beam.metrics.cells import StringSetCell class MonitoringInfosTest(unittest.TestCase): @@ -64,6 +65,17 @@ def test_parse_namespace_and_name_for_user_gauge_metric(self): self.assertEqual(namespace, "counternamespace") self.assertEqual(name, "countername") + def test_parse_namespace_and_name_for_user_string_set_metric(self): + urn = monitoring_infos.USER_STRING_SET_URN + labels = {} + labels[monitoring_infos.NAMESPACE_LABEL] = "stringsetnamespace" + labels[monitoring_infos.NAME_LABEL] = "stringsetname" + input = monitoring_infos.create_monitoring_info( + urn, "typeurn", None, labels) + namespace, name = monitoring_infos.parse_namespace_and_name(input) + self.assertEqual(namespace, "stringsetnamespace") + self.assertEqual(name, "stringsetname") + def test_int64_user_gauge(self): metric = GaugeCell().get_cumulative() result = monitoring_infos.int64_user_gauge( @@ -105,6 +117,19 @@ def test_int64_counter(self): self.assertEqual(0, counter_value) self.assertEqual(result.labels, expected_labels) + def test_user_set_string(self): + expected_labels = {} + expected_labels[monitoring_infos.NAMESPACE_LABEL] = "stringsetnamespace" + expected_labels[monitoring_infos.NAME_LABEL] = "stringsetname" + + metric = StringSetCell().get_cumulative() + result = monitoring_infos.user_set_string( + 'stringsetnamespace', 'stringsetname', metric) + string_set_value = monitoring_infos.extract_string_set_value(result) + + self.assertEqual(set(), string_set_value) + self.assertEqual(result.labels, expected_labels) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/inference/base.py b/sdks/python/apache_beam/ml/inference/base.py index 401b57fdb807..29a568def07b 100644 --- a/sdks/python/apache_beam/ml/inference/base.py +++ 
b/sdks/python/apache_beam/ml/inference/base.py @@ -1586,6 +1586,15 @@ def _run_inference(self, batch, inference_args): except BaseException as e: if self._metrics_collector: self._metrics_collector.failed_batches_counter.inc() + if (isinstance(e, pickle.PickleError) and + self._model_handler.share_model_across_processes()): + raise TypeError( + 'Pickling error encountered while running inference. ' + 'This may be caused by trying to send unpickleable ' + 'data to a model which is shared across processes. ' + 'For more information, see ' + 'https://beam.apache.org/documentation/ml/large-language-modeling/#pickling-errors' # pylint: disable=line-too-long + ) from e raise e predictions = list(result_generator) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 91efcdd76a27..2934a5362910 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -677,7 +677,7 @@ def _deduplicate_device_value(self, device: Optional[str]): self._load_pipeline_args['device'] = 'cpu' else: if is_gpu_available_torch(): - self._load_pipeline_args['device'] = 'cuda:1' + self._load_pipeline_args['device'] = 'cuda:0' else: _LOGGER.warning( "HuggingFaceModelHandler specified a 'GPU' device, " diff --git a/sdks/python/apache_beam/options/pipeline_options.py b/sdks/python/apache_beam/options/pipeline_options.py index 6b1dd8bb48c0..0f8457a40a7b 100644 --- a/sdks/python/apache_beam/options/pipeline_options.py +++ b/sdks/python/apache_beam/options/pipeline_options.py @@ -247,6 +247,20 @@ def __init__(self, flags=None, **kwargs): self._all_options[option_name] = getattr( self._visible_options, option_name) + def __getstate__(self): + # The impersonate_service_account option must be used only at submission of + # a Beam job. However, Beam IOs might store pipeline options + # within a transform implementation that becomes serialized in RunnerAPI, + # causing this option to be inadvertently used at runtime. + # This serialization hook removes it. + if self.view_as(GoogleCloudOptions).impersonate_service_account: + dict_copy = dict(self.__dict__) + dict_copy['_all_options'] = dict(dict_copy['_all_options']) + dict_copy['_all_options']['impersonate_service_account'] = None + return dict_copy + else: + return self.__dict__ + @classmethod def _add_argparse_args(cls, parser): # type: (_BeamArgumentParser) -> None diff --git a/sdks/python/apache_beam/runners/dataflow/dataflow_metrics.py b/sdks/python/apache_beam/runners/dataflow/dataflow_metrics.py index 7e6a11c4abf8..78c3b64595b0 100644 --- a/sdks/python/apache_beam/runners/dataflow/dataflow_metrics.py +++ b/sdks/python/apache_beam/runners/dataflow/dataflow_metrics.py @@ -90,6 +90,10 @@ def _is_counter(metric_result): def _is_distribution(metric_result): return isinstance(metric_result.attempted, DistributionResult) + @staticmethod + def _is_string_set(metric_result): + return isinstance(metric_result.attempted, set) + def _translate_step_name(self, internal_name): """Translate between internal step names (e.g.
"s1") and user step names.""" if not self._job_graph: @@ -233,6 +237,8 @@ def _get_metric_value(self, metric): lambda x: x.key == 'sum').value.double_value) return DistributionResult( DistributionData(dist_sum, dist_count, dist_min, dist_max)) + #TODO(https://github.com/apache/beam/issues/31788) support StringSet after + # re-generate apiclient else: return None @@ -277,8 +283,13 @@ def query(self, filter=None): elm for elm in metric_results if self.matches(filter, elm.key) and DataflowMetrics._is_distribution(elm) ], - self.GAUGES: [] - } # TODO(pabloem): Add Gauge support for dataflow. + # TODO(pabloem): Add Gauge support for dataflow. + self.GAUGES: [], + self.STRINGSETS: [ + elm for elm in metric_results if self.matches(filter, elm.key) and + DataflowMetrics._is_string_set(elm) + ] + } def main(argv): diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py index badc3683bb28..20cae582f320 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py @@ -733,6 +733,12 @@ def _update_container_image_for_dataflow(beam_container_image_url): # By default Dataflow pipelines use containers hosted in Dataflow GCR # instead of Docker Hub. image_suffix = beam_container_image_url.rsplit('/', 1)[1] + + # trim "RCX" as release candidate tag exists on Docker Hub but not GCR + check_rc = image_suffix.lower().split('rc') + if len(check_rc) == 2: + image_suffix = image_suffix[:-2 - len(check_rc[1])] + return names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/' + image_suffix @staticmethod diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py index a4e1a5253a73..8331d9cf3919 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py @@ -321,6 +321,43 @@ def test_dataflow_container_image_override_prime(self): self._verify_dataflow_container_image_override(pipeline_options) + def _verify_dataflow_container_image_override_rc(self, pipeline_options): + pipeline = Pipeline(options=pipeline_options) + pipeline | Create([1, 2, 3]) | ParDo(DoFn()) # pylint:disable=expression-not-assigned + + dummy_env = DockerEnvironment( + container_image='apache/beam_dummy_name:2.00.0RC10') + proto_pipeline, _ = pipeline.to_runner_api( + return_context=True, default_environment=dummy_env) + + # Accessing non-public method for testing. 
+ apiclient.DataflowApplicationClient._apply_sdk_environment_overrides( + proto_pipeline, {}, pipeline_options) + + from apache_beam.utils import proto_utils + found_override = False + trimed_rc = True + for env in proto_pipeline.components.environments.values(): + docker_payload = proto_utils.parse_Bytes( + env.payload, beam_runner_api_pb2.DockerPayload) + if docker_payload.container_image.startswith( + names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY): + found_override = True + if docker_payload.container_image.split(':')[-1] != '2.00.0': + trimed_rc = False + + self.assertTrue(found_override) + self.assertTrue(trimed_rc) + + def test_dataflow_container_image_override_rc(self): + pipeline_options = PipelineOptions([ + '--experiments=use_runner_v2', + '--temp_location', + 'gs://any-location/temp' + ]) + + self._verify_dataflow_container_image_override_rc(pipeline_options) + def _verify_non_apache_container_not_overridden(self, pipeline_options): pipeline = Pipeline(options=pipeline_options) pipeline | Create([1, 2, 3]) | ParDo(DoFn()) # pylint:disable=expression-not-assigned diff --git a/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py b/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py index cc982098797b..e42b180bbecd 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py @@ -1,8 +1,5 @@ """Generated client library for dataflow version v1b3.""" # NOTE: This file is autogenerated and should not be edited by hand. - -from __future__ import absolute_import - from apitools.base.py import base_api from . import dataflow_v1b3_messages as messages @@ -17,9 +14,7 @@ class DataflowV1b3(base_api.BaseApiClient): _PACKAGE = 'dataflow' _SCOPES = [ 'https://www.googleapis.com/auth/cloud-platform', - 'https://www.googleapis.com/auth/compute', - 'https://www.googleapis.com/auth/compute.readonly', - 'https://www.googleapis.com/auth/userinfo.email' + 'https://www.googleapis.com/auth/compute' ] _VERSION = 'v1b3' _CLIENT_ID = '1042881264118.apps.googleusercontent.com' @@ -75,7 +70,6 @@ def __init__( self.projects_locations_jobs = self.ProjectsLocationsJobsService(self) self.projects_locations_snapshots = self.ProjectsLocationsSnapshotsService( self) - self.projects_locations_sql = self.ProjectsLocationsSqlService(self) self.projects_locations_templates = self.ProjectsLocationsTemplatesService( self) self.projects_locations = self.ProjectsLocationsService(self) @@ -254,7 +248,7 @@ def __init__(self, client): self._upload_configs = {} def Aggregated(self, request, global_params=None): - r"""List the jobs of a project across all regions. + r"""List the jobs of a project across all regions. **Note:** This method doesn't support filtering the list of jobs by name. 
Args: request: (DataflowProjectsJobsAggregatedRequest) input message @@ -270,7 +264,8 @@ def Aggregated(self, request, global_params=None): method_id='dataflow.projects.jobs.aggregated', ordered_params=['projectId'], path_params=['projectId'], - query_params=['filter', 'location', 'pageSize', 'pageToken', 'view'], + query_params= + ['filter', 'location', 'name', 'pageSize', 'pageToken', 'view'], relative_path='v1b3/projects/{projectId}/jobs:aggregated', request_field='', request_type_name='DataflowProjectsJobsAggregatedRequest', @@ -279,7 +274,7 @@ def Aggregated(self, request, global_params=None): ) def Create(self, request, global_params=None): - r"""Creates a Cloud Dataflow job. To create a job, we recommend using `projects.locations.jobs.create` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.jobs.create` is not recommended, as your job will always start in `us-central1`. + r"""Creates a Cloud Dataflow job. To create a job, we recommend using `projects.locations.jobs.create` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.jobs.create` is not recommended, as your job will always start in `us-central1`. Do not enter confidential information when you supply string values using the API. Args: request: (DataflowProjectsJobsCreateRequest) input message @@ -354,7 +349,7 @@ def GetMetrics(self, request, global_params=None): ) def List(self, request, global_params=None): - r"""List the jobs of a project. To list the jobs of a project in a region, we recommend using `projects.locations.jobs.list` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). To list the all jobs across all regions, use `projects.jobs.aggregated`. Using `projects.jobs.list` is not recommended, as you can only get the list of jobs that are running in `us-central1`. + r"""List the jobs of a project. To list the jobs of a project in a region, we recommend using `projects.locations.jobs.list` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). To list the all jobs across all regions, use `projects.jobs.aggregated`. Using `projects.jobs.list` is not recommended, because you can only get the list of jobs that are running in `us-central1`. `projects.locations.jobs.list` and `projects.jobs.list` support filtering the list of jobs by name. Filtering by name isn't supported by `projects.jobs.aggregated`. 
Args: request: (DataflowProjectsJobsListRequest) input message @@ -370,7 +365,8 @@ def List(self, request, global_params=None): method_id='dataflow.projects.jobs.list', ordered_params=['projectId'], path_params=['projectId'], - query_params=['filter', 'location', 'pageSize', 'pageToken', 'view'], + query_params= + ['filter', 'location', 'name', 'pageSize', 'pageToken', 'view'], relative_path='v1b3/projects/{projectId}/jobs', request_field='', request_type_name='DataflowProjectsJobsListRequest', @@ -420,7 +416,7 @@ def Update(self, request, global_params=None): method_id='dataflow.projects.jobs.update', ordered_params=['projectId', 'jobId'], path_params=['jobId', 'projectId'], - query_params=['location'], + query_params=['location', 'updateMask'], relative_path='v1b3/projects/{projectId}/jobs/{jobId}', request_field='job', request_type_name='DataflowProjectsJobsUpdateRequest', @@ -611,7 +607,7 @@ def __init__(self, client): self._upload_configs = {} def GetExecutionDetails(self, request, global_params=None): - r"""Request detailed information about the execution status of a stage of the job. + r"""Request detailed information about the execution status of a stage of the job. EXPERIMENTAL. This API is subject to change or removal without notice. Args: request: (DataflowProjectsLocationsJobsStagesGetExecutionDetailsRequest) input message @@ -710,7 +706,7 @@ def __init__(self, client): self._upload_configs = {} def Create(self, request, global_params=None): - r"""Creates a Cloud Dataflow job. To create a job, we recommend using `projects.locations.jobs.create` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.jobs.create` is not recommended, as your job will always start in `us-central1`. + r"""Creates a Cloud Dataflow job. To create a job, we recommend using `projects.locations.jobs.create` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.jobs.create` is not recommended, as your job will always start in `us-central1`. Do not enter confidential information when you supply string values using the API. Args: request: (DataflowProjectsLocationsJobsCreateRequest) input message @@ -761,7 +757,7 @@ def Get(self, request, global_params=None): ) def GetExecutionDetails(self, request, global_params=None): - r"""Request detailed information about the execution status of the job. + r"""Request detailed information about the execution status of the job. EXPERIMENTAL. This API is subject to change or removal without notice. Args: request: (DataflowProjectsLocationsJobsGetExecutionDetailsRequest) input message @@ -814,7 +810,7 @@ def GetMetrics(self, request, global_params=None): ) def List(self, request, global_params=None): - r"""List the jobs of a project. To list the jobs of a project in a region, we recommend using `projects.locations.jobs.list` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). To list the all jobs across all regions, use `projects.jobs.aggregated`. Using `projects.jobs.list` is not recommended, as you can only get the list of jobs that are running in `us-central1`. + r"""List the jobs of a project. To list the jobs of a project in a region, we recommend using `projects.locations.jobs.list` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). To list the all jobs across all regions, use `projects.jobs.aggregated`. 
Using `projects.jobs.list` is not recommended, because you can only get the list of jobs that are running in `us-central1`. `projects.locations.jobs.list` and `projects.jobs.list` support filtering the list of jobs by name. Filtering by name isn't supported by `projects.jobs.aggregated`. Args: request: (DataflowProjectsLocationsJobsListRequest) input message @@ -830,7 +826,7 @@ def List(self, request, global_params=None): method_id='dataflow.projects.locations.jobs.list', ordered_params=['projectId', 'location'], path_params=['location', 'projectId'], - query_params=['filter', 'pageSize', 'pageToken', 'view'], + query_params=['filter', 'name', 'pageSize', 'pageToken', 'view'], relative_path='v1b3/projects/{projectId}/locations/{location}/jobs', request_field='', request_type_name='DataflowProjectsLocationsJobsListRequest', @@ -881,7 +877,7 @@ def Update(self, request, global_params=None): method_id='dataflow.projects.locations.jobs.update', ordered_params=['projectId', 'location', 'jobId'], path_params=['jobId', 'location', 'projectId'], - query_params=[], + query_params=['updateMask'], relative_path= 'v1b3/projects/{projectId}/locations/{location}/jobs/{jobId}', request_field='job', @@ -978,41 +974,6 @@ def List(self, request, global_params=None): supports_download=False, ) - class ProjectsLocationsSqlService(base_api.BaseApiService): - """Service class for the projects_locations_sql resource.""" - - _NAME = 'projects_locations_sql' - - def __init__(self, client): - super(DataflowV1b3.ProjectsLocationsSqlService, self).__init__(client) - self._upload_configs = {} - - def Validate(self, request, global_params=None): - r"""Validates a GoogleSQL query for Cloud Dataflow syntax. Will always confirm the given query parses correctly, and if able to look up schema information from DataCatalog, will validate that the query analyzes properly as well. - - Args: - request: (DataflowProjectsLocationsSqlValidateRequest) input message - global_params: (StandardQueryParameters, default: None) global arguments - Returns: - (ValidateResponse) The response message. - """ - config = self.GetMethodConfig('Validate') - return self._RunMethod(config, request, global_params=global_params) - - Validate.method_config = lambda: base_api.ApiMethodInfo( - http_method='GET', - method_id='dataflow.projects.locations.sql.validate', - ordered_params=['projectId', 'location'], - path_params=['location', 'projectId'], - query_params=['query'], - relative_path= - 'v1b3/projects/{projectId}/locations/{location}/sql:validate', - request_field='', - request_type_name='DataflowProjectsLocationsSqlValidateRequest', - response_type_name='ValidateResponse', - supports_download=False, - ) - class ProjectsLocationsTemplatesService(base_api.BaseApiService): """Service class for the projects_locations_templates resource.""" @@ -1024,7 +985,7 @@ def __init__(self, client): self._upload_configs = {} def Create(self, request, global_params=None): - r"""Creates a Cloud Dataflow job from a template. + r"""Creates a Cloud Dataflow job from a template. Do not enter confidential information when you supply string values using the API. To create a job, we recommend using `projects.locations.templates.create` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.templates.create` is not recommended, because your job will always start in `us-central1`. 
Args: request: (DataflowProjectsLocationsTemplatesCreateRequest) input message @@ -1050,7 +1011,7 @@ def Create(self, request, global_params=None): ) def Get(self, request, global_params=None): - r"""Get the template associated with a template. + r"""Get the template associated with a template. To get the template, we recommend using `projects.locations.templates.get` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.templates.get` is not recommended, because only templates that are running in `us-central1` are retrieved. Args: request: (DataflowProjectsLocationsTemplatesGetRequest) input message @@ -1076,7 +1037,7 @@ def Get(self, request, global_params=None): ) def Launch(self, request, global_params=None): - r"""Launch a template. + r"""Launches a template. To launch a template, we recommend using `projects.locations.templates.launch` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.templates.launch` is not recommended, because jobs launched from the template will always start in `us-central1`. Args: request: (DataflowProjectsLocationsTemplatesLaunchRequest) input message @@ -1210,7 +1171,7 @@ def __init__(self, client): self._upload_configs = {} def Create(self, request, global_params=None): - r"""Creates a Cloud Dataflow job from a template. + r"""Creates a Cloud Dataflow job from a template. Do not enter confidential information when you supply string values using the API. To create a job, we recommend using `projects.locations.templates.create` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.templates.create` is not recommended, because your job will always start in `us-central1`. Args: request: (DataflowProjectsTemplatesCreateRequest) input message @@ -1235,7 +1196,7 @@ def Create(self, request, global_params=None): ) def Get(self, request, global_params=None): - r"""Get the template associated with a template. + r"""Get the template associated with a template. To get the template, we recommend using `projects.locations.templates.get` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.templates.get` is not recommended, because only templates that are running in `us-central1` are retrieved. Args: request: (DataflowProjectsTemplatesGetRequest) input message @@ -1260,7 +1221,7 @@ def Get(self, request, global_params=None): ) def Launch(self, request, global_params=None): - r"""Launch a template. + r"""Launches a template. To launch a template, we recommend using `projects.locations.templates.launch` with a [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints). Using `projects.templates.launch` is not recommended, because jobs launched from the template will always start in `us-central1`. Args: request: (DataflowProjectsTemplatesLaunchRequest) input message diff --git a/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_messages.py b/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_messages.py index e7cf625250d2..c0bbfa74ac1e 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_messages.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_messages.py @@ -4,8 +4,6 @@ """ # NOTE: This file is autogenerated and should not be edited by hand. 
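As a rough illustration of the two query-parameter additions in the client changes above (`name` on the job list methods and `updateMask` on the job update methods), the sketch below builds the corresponding request messages with the generated apitools classes. It assumes the messages are importable through the usual `apache_beam.runners.dataflow.internal.clients.dataflow` package and that an authenticated `DataflowV1b3` client instance is already available as `client`; the project, region, and job identifiers are placeholders, not values taken from this change.

# Sketch only: `client` is an already-constructed, authenticated
# dataflow.DataflowV1b3 instance (credentials/endpoint setup elided).
from apache_beam.runners.dataflow.internal.clients import dataflow

def list_jobs_by_name(client, project_id, region, job_name):
  # New in this revision: the `name` query parameter lets
  # projects.locations.jobs.list filter the returned jobs by name
  # server-side instead of filtering the full listing in the caller.
  request = dataflow.DataflowProjectsLocationsJobsListRequest(
      projectId=project_id, location=region, name=job_name)
  return client.projects_locations_jobs.List(request)

def update_job_fields(client, project_id, region, job_id, job, field_mask):
  # New in this revision: `updateMask` restricts the update to the listed
  # Job fields. Per the updated docstring, the mask should not be combined
  # with a requested_state change in the same request.
  request = dataflow.DataflowProjectsLocationsJobsUpdateRequest(
      projectId=project_id,
      location=region,
      jobId=job_id,
      job=job,
      updateMask=field_mask)
  return client.projects_locations_jobs.Update(request)

The regional `projects.locations.jobs` service is used here rather than `projects.jobs`, following the recommendation in the docstrings above.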
-from __future__ import absolute_import - from apitools.base.protorpclite import messages as _messages from apitools.base.py import encoding from apitools.base.py import extra_types @@ -169,6 +167,22 @@ class AlgorithmValueValuesEnum(_messages.Enum): maxNumWorkers = _messages.IntegerField(2, variant=_messages.Variant.INT32) +class Base2Exponent(_messages.Message): + r"""Exponential buckets where the growth factor between buckets is + `2**(2**-scale)`. e.g. for `scale=1` growth factor is + `2**(2**(-1))=sqrt(2)`. `n` buckets will have the following boundaries. - + 0th: [0, gf) - i in [1, n-1]: [gf^(i), gf^(i+1)) + + Fields: + numberOfBuckets: Must be greater than 0. + scale: Must be between -3 and 3. This forces the growth factor of the + bucket boundaries to be between `2^(1/8)` and `256`. + """ + + numberOfBuckets = _messages.IntegerField(1, variant=_messages.Variant.INT32) + scale = _messages.IntegerField(2, variant=_messages.Variant.INT32) + + class BigQueryIODetails(_messages.Message): r"""Metadata for a BigQuery connector used by the job. @@ -199,6 +213,18 @@ class BigTableIODetails(_messages.Message): tableId = _messages.StringField(3) +class BucketOptions(_messages.Message): + r"""`BucketOptions` describes the bucket boundaries used in the histogram. + + Fields: + exponential: Bucket boundaries grow exponentially. + linear: Bucket boundaries grow linearly. + """ + + exponential = _messages.MessageField('Base2Exponent', 1) + linear = _messages.MessageField('Linear', 2) + + class CPUTime(_messages.Message): r"""Modeled after information exposed by /proc/stat. @@ -288,6 +314,12 @@ class ContainerSpec(_messages.Message): Fields: defaultEnvironment: Default runtime environment for the job. image: Name of the docker container image. E.g., gcr.io/project/some-image + imageRepositoryCertPath: Cloud Storage path to self-signed certificate of + private registry. + imageRepositoryPasswordSecretId: Secret Manager secret id for password to + authenticate to private registry. + imageRepositoryUsernameSecretId: Secret Manager secret id for username to + authenticate to private registry. metadata: Metadata describing a template including description and validation rules. sdkInfo: Required. SDK info of the Flex Template. @@ -296,8 +328,11 @@ class ContainerSpec(_messages.Message): defaultEnvironment = _messages.MessageField( 'FlexTemplateRuntimeEnvironment', 1) image = _messages.StringField(2) - metadata = _messages.MessageField('TemplateMetadata', 3) - sdkInfo = _messages.MessageField('SDKInfo', 4) + imageRepositoryCertPath = _messages.StringField(3) + imageRepositoryPasswordSecretId = _messages.StringField(4) + imageRepositoryUsernameSecretId = _messages.StringField(5) + metadata = _messages.MessageField('TemplateMetadata', 6) + sdkInfo = _messages.MessageField('SDKInfo', 7) class CounterMetadata(_messages.Message): @@ -568,6 +603,94 @@ class DataDiskAssignment(_messages.Message): vmInstance = _messages.StringField(2) +class DataSamplingConfig(_messages.Message): + r"""Configuration options for sampling elements. + + Enums: + BehaviorsValueListEntryValuesEnum: + + Fields: + behaviors: List of given sampling behaviors to enable. For example, + specifying behaviors = [ALWAYS_ON] samples in-flight elements but does + not sample exceptions. Can be used to specify multiple behaviors like, + behaviors = [ALWAYS_ON, EXCEPTIONS] for specifying periodic sampling and + exception sampling. If DISABLED is in the list, then sampling will be + disabled and ignore the other given behaviors. 
Ordering does not matter. + """ + class BehaviorsValueListEntryValuesEnum(_messages.Enum): + r"""BehaviorsValueListEntryValuesEnum enum type. + + Values: + DATA_SAMPLING_BEHAVIOR_UNSPECIFIED: If given, has no effect on sampling + behavior. Used as an unknown or unset sentinel value. + DISABLED: When given, disables element sampling. Has same behavior as + not setting the behavior. + ALWAYS_ON: When given, enables sampling in-flight from all PCollections. + EXCEPTIONS: When given, enables sampling input elements when a user- + defined DoFn causes an exception. + """ + DATA_SAMPLING_BEHAVIOR_UNSPECIFIED = 0 + DISABLED = 1 + ALWAYS_ON = 2 + EXCEPTIONS = 3 + + behaviors = _messages.EnumField( + 'BehaviorsValueListEntryValuesEnum', 1, repeated=True) + + +class DataSamplingReport(_messages.Message): + r"""Contains per-worker telemetry about the data sampling feature. + + Fields: + bytesWrittenDelta: Optional. Delta of bytes written to file from previous + report. + elementsSampledBytes: Optional. Delta of bytes sampled from previous + report. + elementsSampledCount: Optional. Delta of number of elements sampled from + previous report. + exceptionsSampledCount: Optional. Delta of number of samples taken from + user code exceptions from previous report. + pcollectionsSampledCount: Optional. Delta of number of PCollections + sampled from previous report. + persistenceErrorsCount: Optional. Delta of errors counts from persisting + the samples from previous report. + translationErrorsCount: Optional. Delta of errors counts from retrieving, + or translating the samples from previous report. + """ + + bytesWrittenDelta = _messages.IntegerField(1) + elementsSampledBytes = _messages.IntegerField(2) + elementsSampledCount = _messages.IntegerField(3) + exceptionsSampledCount = _messages.IntegerField(4) + pcollectionsSampledCount = _messages.IntegerField(5) + persistenceErrorsCount = _messages.IntegerField(6) + translationErrorsCount = _messages.IntegerField(7) + + +class DataflowHistogramValue(_messages.Message): + r"""Summary statistics for a population of values. HistogramValue contains a + sequence of buckets and gives a count of values that fall into each bucket. + Bucket boundares are defined by a formula and bucket widths are either fixed + or exponentially increasing. + + Fields: + bucketCounts: Optional. The number of values in each bucket of the + histogram, as described in `bucket_options`. `bucket_counts` should + contain N values, where N is the number of buckets specified in + `bucket_options`. If `bucket_counts` has fewer than N values, the + remaining values are assumed to be 0. + bucketOptions: Describes the bucket boundaries used in the histogram. + count: Number of values recorded in this histogram. + outlierStats: Statistics on the values recorded in the histogram that fall + out of the bucket boundaries. + """ + + bucketCounts = _messages.IntegerField(1, repeated=True) + bucketOptions = _messages.MessageField('BucketOptions', 2) + count = _messages.IntegerField(3) + outlierStats = _messages.MessageField('OutlierStats', 4) + + class DataflowProjectsDeleteSnapshotsRequest(_messages.Message): r"""A DataflowProjectsDeleteSnapshotsRequest object. @@ -596,6 +719,7 @@ class DataflowProjectsJobsAggregatedRequest(_messages.Message): location: The [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) that contains this job. + name: Optional. The job name. pageSize: If there are many jobs, limit response to at most this many. 
The actual number of jobs returned will be the lesser of max_responses and an unspecified server-defined limit. @@ -635,7 +759,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. """ @@ -646,10 +775,11 @@ class ViewValueValuesEnum(_messages.Enum): filter = _messages.EnumField('FilterValueValuesEnum', 1) location = _messages.StringField(2) - pageSize = _messages.IntegerField(3, variant=_messages.Variant.INT32) - pageToken = _messages.StringField(4) - projectId = _messages.StringField(5, required=True) - view = _messages.EnumField('ViewValueValuesEnum', 6) + name = _messages.StringField(3) + pageSize = _messages.IntegerField(4, variant=_messages.Variant.INT32) + pageToken = _messages.StringField(5) + projectId = _messages.StringField(6, required=True) + view = _messages.EnumField('ViewValueValuesEnum', 7) class DataflowProjectsJobsCreateRequest(_messages.Message): @@ -677,7 +807,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. """ @@ -766,7 +901,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. """ @@ -794,6 +934,7 @@ class DataflowProjectsJobsListRequest(_messages.Message): location: The [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) that contains this job. + name: Optional. The job name. pageSize: If there are many jobs, limit response to at most this many. 
The actual number of jobs returned will be the lesser of max_responses and an unspecified server-defined limit. @@ -833,7 +974,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. """ @@ -844,10 +990,11 @@ class ViewValueValuesEnum(_messages.Enum): filter = _messages.EnumField('FilterValueValuesEnum', 1) location = _messages.StringField(2) - pageSize = _messages.IntegerField(3, variant=_messages.Variant.INT32) - pageToken = _messages.StringField(4) - projectId = _messages.StringField(5, required=True) - view = _messages.EnumField('ViewValueValuesEnum', 6) + name = _messages.StringField(3) + pageSize = _messages.IntegerField(4, variant=_messages.Variant.INT32) + pageToken = _messages.StringField(5) + projectId = _messages.StringField(6, required=True) + view = _messages.EnumField('ViewValueValuesEnum', 7) class DataflowProjectsJobsMessagesListRequest(_messages.Message): @@ -947,12 +1094,19 @@ class DataflowProjectsJobsUpdateRequest(_messages.Message): (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) that contains this job. projectId: The ID of the Cloud Platform project that the job belongs to. + updateMask: The list of fields to update relative to Job. If empty, only + RequestedJobState will be considered for update. If the FieldMask is not + empty and RequestedJobState is none/empty, The fields specified in the + update mask will be the only ones considered for update. If both + RequestedJobState and update_mask are specified, an error will be + returned as we cannot update both state and mask. """ job = _messages.MessageField('Job', 1) jobId = _messages.StringField(2, required=True) location = _messages.StringField(3) projectId = _messages.StringField(4, required=True) + updateMask = _messages.StringField(5) class DataflowProjectsJobsWorkItemsLeaseRequest(_messages.Message): @@ -1030,7 +1184,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. 
""" @@ -1152,7 +1311,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. """ @@ -1180,6 +1344,7 @@ class DataflowProjectsLocationsJobsListRequest(_messages.Message): location: The [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) that contains this job. + name: Optional. The job name. pageSize: If there are many jobs, limit response to at most this many. The actual number of jobs returned will be the lesser of max_responses and an unspecified server-defined limit. @@ -1219,7 +1384,12 @@ class ViewValueValuesEnum(_messages.Enum): JOB_VIEW_SUMMARY: Request summary information only: Project ID, Job ID, job name, job type, job status, start/end time, and Cloud SDK version details. - JOB_VIEW_ALL: Request all information available for this job. + JOB_VIEW_ALL: Request all information available for this job. When the + job is in `JOB_STATE_PENDING`, the job has been created but is not yet + running, and not all job information is available. For complete job + information, wait until the job in is `JOB_STATE_RUNNING`. For more + information, see [JobState](https://cloud.google.com/dataflow/docs/ref + erence/rest/v1b3/projects.jobs#jobstate). JOB_VIEW_DESCRIPTION: Request summary info and limited job description data for steps, labels and environment. """ @@ -1230,10 +1400,11 @@ class ViewValueValuesEnum(_messages.Enum): filter = _messages.EnumField('FilterValueValuesEnum', 1) location = _messages.StringField(2, required=True) - pageSize = _messages.IntegerField(3, variant=_messages.Variant.INT32) - pageToken = _messages.StringField(4) - projectId = _messages.StringField(5, required=True) - view = _messages.EnumField('ViewValueValuesEnum', 6) + name = _messages.StringField(3) + pageSize = _messages.IntegerField(4, variant=_messages.Variant.INT32) + pageToken = _messages.StringField(5) + projectId = _messages.StringField(6, required=True) + view = _messages.EnumField('ViewValueValuesEnum', 7) class DataflowProjectsLocationsJobsMessagesListRequest(_messages.Message): @@ -1380,12 +1551,19 @@ class DataflowProjectsLocationsJobsUpdateRequest(_messages.Message): (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) that contains this job. projectId: The ID of the Cloud Platform project that the job belongs to. + updateMask: The list of fields to update relative to Job. If empty, only + RequestedJobState will be considered for update. If the FieldMask is not + empty and RequestedJobState is none/empty, The fields specified in the + update mask will be the only ones considered for update. If both + RequestedJobState and update_mask are specified, an error will be + returned as we cannot update both state and mask. 
""" job = _messages.MessageField('Job', 1) jobId = _messages.StringField(2, required=True) location = _messages.StringField(3, required=True) projectId = _messages.StringField(4, required=True) + updateMask = _messages.StringField(5) class DataflowProjectsLocationsJobsWorkItemsLeaseRequest(_messages.Message): @@ -1472,23 +1650,6 @@ class DataflowProjectsLocationsSnapshotsListRequest(_messages.Message): projectId = _messages.StringField(3, required=True) -class DataflowProjectsLocationsSqlValidateRequest(_messages.Message): - r"""A DataflowProjectsLocationsSqlValidateRequest object. - - Fields: - location: The [regional endpoint] - (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) to - which to direct the request. - projectId: Required. The ID of the Cloud Platform project that the job - belongs to. - query: The sql query to validate. - """ - - location = _messages.StringField(1, required=True) - projectId = _messages.StringField(2, required=True) - query = _messages.StringField(3) - - class DataflowProjectsLocationsTemplatesCreateRequest(_messages.Message): r"""A DataflowProjectsLocationsTemplatesCreateRequest object. @@ -1543,13 +1704,13 @@ class DataflowProjectsLocationsTemplatesLaunchRequest(_messages.Message): r"""A DataflowProjectsLocationsTemplatesLaunchRequest object. Fields: - dynamicTemplate_gcsPath: Path to dynamic template spec file on Cloud - Storage. The file must be a Json serialized DynamicTemplateFieSpec - object. + dynamicTemplate_gcsPath: Path to the dynamic template specification file + on Cloud Storage. The file must be a JSON serialized + `DynamicTemplateFileSpec` object. dynamicTemplate_stagingLocation: Cloud Storage path for staging dependencies. Must be a valid Cloud Storage URL, beginning with `gs://`. - gcsPath: A Cloud Storage path to the template from which to create the - job. Must be valid Cloud Storage URL, beginning with 'gs://'. + gcsPath: A Cloud Storage path to the template to use to create the job. + Must be valid Cloud Storage URL, beginning with `gs://`. launchTemplateParameters: A LaunchTemplateParameters resource to be passed as the request body. location: The [regional endpoint] @@ -1668,13 +1829,13 @@ class DataflowProjectsTemplatesLaunchRequest(_messages.Message): r"""A DataflowProjectsTemplatesLaunchRequest object. Fields: - dynamicTemplate_gcsPath: Path to dynamic template spec file on Cloud - Storage. The file must be a Json serialized DynamicTemplateFieSpec - object. + dynamicTemplate_gcsPath: Path to the dynamic template specification file + on Cloud Storage. The file must be a JSON serialized + `DynamicTemplateFileSpec` object. dynamicTemplate_stagingLocation: Cloud Storage path for staging dependencies. Must be a valid Cloud Storage URL, beginning with `gs://`. - gcsPath: A Cloud Storage path to the template from which to create the - job. Must be valid Cloud Storage URL, beginning with 'gs://'. + gcsPath: A Cloud Storage path to the template to use to create the job. + Must be valid Cloud Storage URL, beginning with `gs://`. launchTemplateParameters: A LaunchTemplateParameters resource to be passed as the request body. location: The [regional endpoint] @@ -1726,11 +1887,14 @@ class DebugOptions(_messages.Message): r"""Describes any options that have an effect on the debugging of pipelines. Fields: - enableHotKeyLogging: When true, enables the logging of the literal hot key - to the user's Cloud Logging. + dataSampling: Configuration options for sampling elements from a running + pipeline. + enableHotKeyLogging: Optional. 
When true, enables the logging of the + literal hot key to the user's Cloud Logging. """ - enableHotKeyLogging = _messages.BooleanField(1) + dataSampling = _messages.MessageField('DataSamplingConfig', 1) + enableHotKeyLogging = _messages.BooleanField(2) class DeleteSnapshotResponse(_messages.Message): @@ -1883,10 +2047,17 @@ class Environment(_messages.Message): r"""Describes the environment in which a Dataflow Job runs. Enums: - FlexResourceSchedulingGoalValueValuesEnum: Which Flexible Resource - Scheduling mode to run in. + FlexResourceSchedulingGoalValueValuesEnum: Optional. Which Flexible + Resource Scheduling mode to run in. ShuffleModeValueValuesEnum: Output only. The shuffle mode used for the job. + StreamingModeValueValuesEnum: Optional. Specifies the Streaming Engine + message processing guarantees. Reduces cost and latency but might result + in duplicate messages committed to storage. Designed to run simple + mapping streaming ETL jobs at the lowest cost. For example, Change Data + Capture (CDC) to BigQuery is a canonical use case. For more information, + see [Set the pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). Messages: InternalExperimentsValue: Experimental settings. @@ -1903,31 +2074,38 @@ class Environment(_messages.Message): unknown or unspecified, the service will attempt to choose a reasonable default. This should be in the form of the API service name, e.g. "compute.googleapis.com". - dataset: The dataset for the current project where various workflow - related tables are stored. The supported resource type is: Google - BigQuery: bigquery.googleapis.com/{dataset} - debugOptions: Any debugging options to be supplied to the job. + dataset: Optional. The dataset for the current project where various + workflow related tables are stored. The supported resource type is: + Google BigQuery: bigquery.googleapis.com/{dataset} + debugOptions: Optional. Any debugging options to be supplied to the job. experiments: The list of experiments to enable. This field should be used for SDK related experiments and not for service related experiments. The proper field for service related experiments is service_options. - flexResourceSchedulingGoal: Which Flexible Resource Scheduling mode to run - in. + flexResourceSchedulingGoal: Optional. Which Flexible Resource Scheduling + mode to run in. internalExperiments: Experimental settings. sdkPipelineOptions: The Cloud Dataflow SDK pipeline options specified by the user. These options are passed through the service and are used to recreate the SDK pipeline options on the worker in a language agnostic and platform independent way. - serviceAccountEmail: Identity to run virtual machines as. Defaults to the - default account. - serviceKmsKeyName: If set, contains the Cloud KMS key identifier used to - encrypt data at rest, AKA a Customer Managed Encryption Key (CMEK). - Format: + serviceAccountEmail: Optional. Identity to run virtual machines as. + Defaults to the default account. + serviceKmsKeyName: Optional. If set, contains the Cloud KMS key identifier + used to encrypt data at rest, AKA a Customer Managed Encryption Key + (CMEK). Format: projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY - serviceOptions: The list of service options to enable. This field should - be used for service related experiments only. These experiments, when - graduating to GA, should be replaced by dedicated fields or become - default (i.e. always on). + serviceOptions: Optional. 
The list of service options to enable. This + field should be used for service related experiments only. These + experiments, when graduating to GA, should be replaced by dedicated + fields or become default (i.e. always on). shuffleMode: Output only. The shuffle mode used for the job. + streamingMode: Optional. Specifies the Streaming Engine message processing + guarantees. Reduces cost and latency but might result in duplicate + messages committed to storage. Designed to run simple mapping streaming + ETL jobs at the lowest cost. For example, Change Data Capture (CDC) to + BigQuery is a canonical use case. For more information, see [Set the + pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). tempStoragePrefix: The prefix of the resources the system should use for temporary storage. The system will append the suffix "/temp-{JOBNAME} to this resource prefix, where {JOBNAME} is the value of the job_name @@ -1937,17 +2115,19 @@ class Environment(_messages.Message): The supported resource type is: Google Cloud Storage: storage.googleapis.com/{bucket}/{object} bucket.storage.googleapis.com/{object} + useStreamingEngineResourceBasedBilling: Output only. Whether the job uses + the Streaming Engine resource-based billing model. userAgent: A description of the process that generated the request. version: A structure describing which components and their versions of the service are required in order to run the job. workerPools: The worker pools. At least one "harness" worker pool must be specified in order for the job to have workers. - workerRegion: The Compute Engine region + workerRegion: Optional. The Compute Engine region (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in which worker processing should occur, e.g. "us-west1". Mutually exclusive with worker_zone. If neither worker_region nor worker_zone is specified, default to the control plane's region. - workerZone: The Compute Engine zone + workerZone: Optional. The Compute Engine zone (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in which worker processing should occur, e.g. "us-west1-a". Mutually exclusive with worker_region. If neither worker_region nor worker_zone @@ -1955,7 +2135,7 @@ class Environment(_messages.Message): available capacity. """ class FlexResourceSchedulingGoalValueValuesEnum(_messages.Enum): - r"""Which Flexible Resource Scheduling mode to run in. + r"""Optional. Which Flexible Resource Scheduling mode to run in. Values: FLEXRS_UNSPECIFIED: Run in the default mode. @@ -1978,6 +2158,29 @@ class ShuffleModeValueValuesEnum(_messages.Enum): VM_BASED = 1 SERVICE_BASED = 2 + class StreamingModeValueValuesEnum(_messages.Enum): + r"""Optional. Specifies the Streaming Engine message processing + guarantees. Reduces cost and latency but might result in duplicate + messages committed to storage. Designed to run simple mapping streaming + ETL jobs at the lowest cost. For example, Change Data Capture (CDC) to + BigQuery is a canonical use case. For more information, see [Set the + pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). + + Values: + STREAMING_MODE_UNSPECIFIED: Run in the default mode. + STREAMING_MODE_EXACTLY_ONCE: In this mode, message deduplication is + performed against persistent state to make sure each message is + processed and committed to storage exactly once. + STREAMING_MODE_AT_LEAST_ONCE: Message deduplication is not performed. 
+ Messages might be processed multiple times, and the results are + applied multiple times. Note: Setting this value also enables + Streaming Engine and Streaming Engine resource-based billing. + """ + STREAMING_MODE_UNSPECIFIED = 0 + STREAMING_MODE_EXACTLY_ONCE = 1 + STREAMING_MODE_AT_LEAST_ONCE = 2 + @encoding.MapUnrecognizedFields('additionalProperties') class InternalExperimentsValue(_messages.Message): r"""Experimental settings. @@ -2093,12 +2296,14 @@ class AdditionalProperty(_messages.Message): serviceKmsKeyName = _messages.StringField(9) serviceOptions = _messages.StringField(10, repeated=True) shuffleMode = _messages.EnumField('ShuffleModeValueValuesEnum', 11) - tempStoragePrefix = _messages.StringField(12) - userAgent = _messages.MessageField('UserAgentValue', 13) - version = _messages.MessageField('VersionValue', 14) - workerPools = _messages.MessageField('WorkerPool', 15, repeated=True) - workerRegion = _messages.StringField(16) - workerZone = _messages.StringField(17) + streamingMode = _messages.EnumField('StreamingModeValueValuesEnum', 12) + tempStoragePrefix = _messages.StringField(13) + useStreamingEngineResourceBasedBilling = _messages.BooleanField(14) + userAgent = _messages.MessageField('UserAgentValue', 15) + version = _messages.MessageField('VersionValue', 16) + workerPools = _messages.MessageField('WorkerPool', 17, repeated=True) + workerRegion = _messages.StringField(18) + workerZone = _messages.StringField(19) class ExecutionStageState(_messages.Message): @@ -2281,12 +2486,20 @@ class FlattenInstruction(_messages.Message): class FlexTemplateRuntimeEnvironment(_messages.Message): r"""The environment values to be set at runtime for flex template. + LINT.IfChange Enums: AutoscalingAlgorithmValueValuesEnum: The algorithm to use for autoscaling FlexrsGoalValueValuesEnum: Set FlexRS goal for the job. https://cloud.google.com/dataflow/docs/guides/flexrs IpConfigurationValueValuesEnum: Configuration for VM IPs. + StreamingModeValueValuesEnum: Optional. Specifies the Streaming Engine + message processing guarantees. Reduces cost and latency but might result + in duplicate messages committed to storage. Designed to run simple + mapping streaming ETL jobs at the lowest cost. For example, Change Data + Capture (CDC) to BigQuery is a canonical use case. For more information, + see [Set the pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). Messages: AdditionalUserLabelsValue: Additional user labels to be specified for the @@ -2304,10 +2517,15 @@ class FlexTemplateRuntimeEnvironment(_messages.Message): value pairs. Example: { "name": "wrench", "mass": "1kg", "count": "3" }. autoscalingAlgorithm: The algorithm to use for autoscaling diskSizeGb: Worker disk size, in gigabytes. - dumpHeapOnOom: If true, save a heap dump before killing a thread or - process which is GC thrashing or out of memory. The location of the heap - file will either be echoed back to the user, or the user will be given - the opportunity to download the heap file. + dumpHeapOnOom: If true, when processing time is spent almost entirely on + garbage collection (GC), saves a heap dump before ending the thread or + process. If false, ends the thread or process without saving a heap + dump. Does not save a heap dump when the Java Virtual Machine (JVM) has + an out of memory error during processing. The location of the heap file + is either echoed back to the user, or the user is given the opportunity + to download the heap file. 
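The new `DataSamplingConfig`, `DebugOptions.dataSampling`, and `Environment.streamingMode` messages described above compose when building a job `Environment`. The following sketch only shows how the new fields fit together; it assumes the same `dataflow` import as in the earlier sketch, and whether a given pipeline or region honors these options is determined by the Dataflow service.

# Sketch only: constructs the new message fields; values are illustrative.
from apache_beam.runners.dataflow.internal.clients import dataflow

# Sample in-flight elements and also capture elements that caused
# user-code exceptions (ordering of behaviors does not matter).
sampling = dataflow.DataSamplingConfig(behaviors=[
    dataflow.DataSamplingConfig.BehaviorsValueListEntryValuesEnum.ALWAYS_ON,
    dataflow.DataSamplingConfig.BehaviorsValueListEntryValuesEnum.EXCEPTIONS,
])

environment = dataflow.Environment(
    debugOptions=dataflow.DebugOptions(
        dataSampling=sampling, enableHotKeyLogging=True),
    # At-least-once mode trades exactly-once commits for lower cost and
    # latency; per the enum docstring it also enables Streaming Engine and
    # resource-based billing.
    streamingMode=dataflow.Environment.StreamingModeValueValuesEnum
    .STREAMING_MODE_AT_LEAST_ONCE)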
+ enableLauncherVmSerialPortLogging: If true serial port logging will be + enabled for the launcher VM. enableStreamingEngine: Whether to enable Streaming Engine for the job. flexrsGoal: Set FlexRS goal for the job. https://cloud.google.com/dataflow/docs/guides/flexrs @@ -2325,8 +2543,8 @@ class FlexTemplateRuntimeEnvironment(_messages.Message): numWorkers: The initial number of Google Compute Engine instances for the job. saveHeapDumpsToGcsPath: Cloud Storage bucket (directory) to upload heap - dumps to the given location. Enabling this implies that heap dumps - should be generated on OOM (dump_heap_on_oom is set to true). + dumps to. Enabling this field implies that `dump_heap_on_oom` is set to + true. sdkContainerImage: Docker registry location of container image to use for the 'worker harness. Default is the container for the version of the SDK. Note this field is only valid for portable pipelines. @@ -2334,6 +2552,13 @@ class FlexTemplateRuntimeEnvironment(_messages.Message): job as. stagingLocation: The Cloud Storage path for staging local files. Must be a valid Cloud Storage URL, beginning with `gs://`. + streamingMode: Optional. Specifies the Streaming Engine message processing + guarantees. Reduces cost and latency but might result in duplicate + messages committed to storage. Designed to run simple mapping streaming + ETL jobs at the lowest cost. For example, Change Data Capture (CDC) to + BigQuery is a canonical use case. For more information, see [Set the + pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). subnetwork: Subnetwork to which VMs will be assigned, if desired. You can specify a subnetwork using either a complete URL or an abbreviated path. Expected to be of the form "https://www.googleapis.com/compute/v1/projec @@ -2397,6 +2622,29 @@ class IpConfigurationValueValuesEnum(_messages.Enum): WORKER_IP_PUBLIC = 1 WORKER_IP_PRIVATE = 2 + class StreamingModeValueValuesEnum(_messages.Enum): + r"""Optional. Specifies the Streaming Engine message processing + guarantees. Reduces cost and latency but might result in duplicate + messages committed to storage. Designed to run simple mapping streaming + ETL jobs at the lowest cost. For example, Change Data Capture (CDC) to + BigQuery is a canonical use case. For more information, see [Set the + pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). + + Values: + STREAMING_MODE_UNSPECIFIED: Run in the default mode. + STREAMING_MODE_EXACTLY_ONCE: In this mode, message deduplication is + performed against persistent state to make sure each message is + processed and committed to storage exactly once. + STREAMING_MODE_AT_LEAST_ONCE: Message deduplication is not performed. + Messages might be processed multiple times, and the results are + applied multiple times. Note: Setting this value also enables + Streaming Engine and Streaming Engine resource-based billing. + """ + STREAMING_MODE_UNSPECIFIED = 0 + STREAMING_MODE_EXACTLY_ONCE = 1 + STREAMING_MODE_AT_LEAST_ONCE = 2 + @encoding.MapUnrecognizedFields('additionalProperties') class AdditionalUserLabelsValue(_messages.Message): r"""Additional user labels to be specified for the job. 
Keys and values @@ -2433,24 +2681,26 @@ class AdditionalProperty(_messages.Message): 'AutoscalingAlgorithmValueValuesEnum', 3) diskSizeGb = _messages.IntegerField(4, variant=_messages.Variant.INT32) dumpHeapOnOom = _messages.BooleanField(5) - enableStreamingEngine = _messages.BooleanField(6) - flexrsGoal = _messages.EnumField('FlexrsGoalValueValuesEnum', 7) - ipConfiguration = _messages.EnumField('IpConfigurationValueValuesEnum', 8) - kmsKeyName = _messages.StringField(9) - launcherMachineType = _messages.StringField(10) - machineType = _messages.StringField(11) - maxWorkers = _messages.IntegerField(12, variant=_messages.Variant.INT32) - network = _messages.StringField(13) - numWorkers = _messages.IntegerField(14, variant=_messages.Variant.INT32) - saveHeapDumpsToGcsPath = _messages.StringField(15) - sdkContainerImage = _messages.StringField(16) - serviceAccountEmail = _messages.StringField(17) - stagingLocation = _messages.StringField(18) - subnetwork = _messages.StringField(19) - tempLocation = _messages.StringField(20) - workerRegion = _messages.StringField(21) - workerZone = _messages.StringField(22) - zone = _messages.StringField(23) + enableLauncherVmSerialPortLogging = _messages.BooleanField(6) + enableStreamingEngine = _messages.BooleanField(7) + flexrsGoal = _messages.EnumField('FlexrsGoalValueValuesEnum', 8) + ipConfiguration = _messages.EnumField('IpConfigurationValueValuesEnum', 9) + kmsKeyName = _messages.StringField(10) + launcherMachineType = _messages.StringField(11) + machineType = _messages.StringField(12) + maxWorkers = _messages.IntegerField(13, variant=_messages.Variant.INT32) + network = _messages.StringField(14) + numWorkers = _messages.IntegerField(15, variant=_messages.Variant.INT32) + saveHeapDumpsToGcsPath = _messages.StringField(16) + sdkContainerImage = _messages.StringField(17) + serviceAccountEmail = _messages.StringField(18) + stagingLocation = _messages.StringField(19) + streamingMode = _messages.EnumField('StreamingModeValueValuesEnum', 20) + subnetwork = _messages.StringField(21) + tempLocation = _messages.StringField(22) + workerRegion = _messages.StringField(23) + workerZone = _messages.StringField(24) + zone = _messages.StringField(25) class FloatingPointList(_messages.Message): @@ -2556,6 +2806,46 @@ class Histogram(_messages.Message): firstBucketOffset = _messages.IntegerField(2, variant=_messages.Variant.INT32) +class HotKeyDebuggingInfo(_messages.Message): + r"""Information useful for debugging a hot key detection. + + Messages: + DetectedHotKeysValue: Debugging information for each detected hot key. + Keyed by a hash of the key. + + Fields: + detectedHotKeys: Debugging information for each detected hot key. Keyed by + a hash of the key. + """ + @encoding.MapUnrecognizedFields('additionalProperties') + class DetectedHotKeysValue(_messages.Message): + r"""Debugging information for each detected hot key. Keyed by a hash of + the key. + + Messages: + AdditionalProperty: An additional property for a DetectedHotKeysValue + object. + + Fields: + additionalProperties: Additional properties of type DetectedHotKeysValue + """ + class AdditionalProperty(_messages.Message): + r"""An additional property for a DetectedHotKeysValue object. + + Fields: + key: Name of the additional property. + value: A HotKeyInfo attribute. 
+ """ + + key = _messages.StringField(1) + value = _messages.MessageField('HotKeyInfo', 2) + + additionalProperties = _messages.MessageField( + 'AdditionalProperty', 1, repeated=True) + + detectedHotKeys = _messages.MessageField('DetectedHotKeysValue', 1) + + class HotKeyDetection(_messages.Message): r"""Proto describing a hot key detected on a given WorkItem. @@ -2572,6 +2862,25 @@ class HotKeyDetection(_messages.Message): userStepName = _messages.StringField(3) +class HotKeyInfo(_messages.Message): + r"""Information about a hot key. + + Fields: + hotKeyAge: The age of the hot key measured from when it was first + detected. + key: A detected hot key that is causing limited parallelism. This field + will be populated only if the following flag is set to true: "-- + enable_hot_key_logging". + keyTruncated: If true, then the above key is truncated and cannot be + deserialized. This occurs if the key above is populated and the key size + is >5MB. + """ + + hotKeyAge = _messages.StringField(1) + key = _messages.StringField(2) + keyTruncated = _messages.BooleanField(3) + + class InstructionInput(_messages.Message): r"""An input of an instruction, as a reference to an output of a producer instruction. @@ -2676,22 +2985,25 @@ class IntegerMean(_messages.Message): class Job(_messages.Message): - r"""Defines a job to be run by the Cloud Dataflow service. + r"""Defines a job to be run by the Cloud Dataflow service. Do not enter + confidential information when you supply string values using the API. Enums: CurrentStateValueValuesEnum: The current state of the job. Jobs are created in the `JOB_STATE_STOPPED` state unless otherwise specified. A job in the `JOB_STATE_RUNNING` state may asynchronously enter a terminal state. After a job has reached a terminal state, no further state - updates may be made. This field may be mutated by the Cloud Dataflow + updates may be made. This field might be mutated by the Dataflow service; callers cannot mutate it. - RequestedStateValueValuesEnum: The job's requested state. `UpdateJob` may - be used to switch between the `JOB_STATE_STOPPED` and - `JOB_STATE_RUNNING` states, by setting requested_state. `UpdateJob` may - also be used to directly set a job's requested state to - `JOB_STATE_CANCELLED` or `JOB_STATE_DONE`, irrevocably terminating the - job if it has not already reached a terminal state. - TypeValueValuesEnum: The type of Cloud Dataflow job. + RequestedStateValueValuesEnum: The job's requested state. Applies to + `UpdateJob` requests. Set `requested_state` with `UpdateJob` requests to + switch between the states `JOB_STATE_STOPPED` and `JOB_STATE_RUNNING`. + You can also use `UpdateJob` requests to change a job's state from + `JOB_STATE_RUNNING` to `JOB_STATE_CANCELLED`, `JOB_STATE_DONE`, or + `JOB_STATE_DRAINED`. These states irrevocably terminate the job if it + hasn't already reached a terminal state. This field has no effect on + `CreateJob` requests. + TypeValueValuesEnum: Optional. The type of Dataflow job. Messages: LabelsValue: User-defined labels for this job. The labels map can contain @@ -2700,8 +3012,9 @@ class Job(_messages.Message): \p{Ll}\p{Lo}{0,62} * Values must conform to regexp: [\p{Ll}\p{Lo}\p{N}_-]{0,63} * Both keys and values are additionally constrained to be <= 128 bytes in size. - TransformNameMappingValue: The map of transform name prefixes of the job - to be replaced to the corresponding name prefixes of the new job. + TransformNameMappingValue: Optional. 
The map of transform name prefixes of + the job to be replaced to the corresponding name prefixes of the new + job. Fields: clientRequestId: The client's unique identifier of the job, re-used across @@ -2719,14 +3032,13 @@ class Job(_messages.Message): `JOB_STATE_STOPPED` state unless otherwise specified. A job in the `JOB_STATE_RUNNING` state may asynchronously enter a terminal state. After a job has reached a terminal state, no further state updates may - be made. This field may be mutated by the Cloud Dataflow service; - callers cannot mutate it. + be made. This field might be mutated by the Dataflow service; callers + cannot mutate it. currentStateTime: The timestamp associated with the current state. - environment: The environment for the job. + environment: Optional. The environment for the job. executionInfo: Deprecated. - id: The unique ID of this job. This field is set by the Cloud Dataflow - service when the Job is created, and is immutable for the life of the - job. + id: The unique ID of this job. This field is set by the Dataflow service + when the job is created, and is immutable for the life of the job. jobMetadata: This field is populated by the Dataflow service to support filtering jobs by the metadata values provided here. Populated for ListJobs and all GetJob views SUMMARY and higher. @@ -2736,33 +3048,44 @@ class Job(_messages.Message): \p{Ll}\p{Lo}{0,62} * Values must conform to regexp: [\p{Ll}\p{Lo}\p{N}_-]{0,63} * Both keys and values are additionally constrained to be <= 128 bytes in size. - location: The [regional endpoint] + location: Optional. The [regional endpoint] (https://cloud.google.com/dataflow/docs/concepts/regional-endpoints) that contains this job. - name: The user-specified Cloud Dataflow job name. Only one Job with a - given name may exist in a project at any given time. If a caller - attempts to create a Job with the same name as an already-existing Job, - the attempt returns the existing Job. The name must match the regular - expression `[a-z]([-a-z0-9]{0,1022}[a-z0-9])?` + name: Optional. The user-specified Dataflow job name. Only one active job + with a given name can exist in a project within one region at any given + time. Jobs in different regions can have the same name. If a caller + attempts to create a job with the same name as an active job that + already exists, the attempt returns the existing job. The name must + match the regular expression `[a-z]([-a-z0-9]{0,1022}[a-z0-9])?` pipelineDescription: Preliminary field: The format of this data may change at any time. A description of the user pipeline and stages through which it is executed. Created by Cloud Dataflow service. Only retrieved with JOB_VIEW_DESCRIPTION or JOB_VIEW_ALL. - projectId: The ID of the Cloud Platform project that the job belongs to. + projectId: The ID of the Google Cloud project that the job belongs to. replaceJobId: If this job is an update of an existing job, this field is the job ID of the job it replaced. When sending a `CreateJobRequest`, you can update a job by specifying it here. The job named here is stopped, and its intermediate state is transferred to this job. replacedByJobId: If another job is an update of this job (and thus, this job is in `JOB_STATE_UPDATED`), this field contains the ID of that job. - requestedState: The job's requested state. `UpdateJob` may be used to - switch between the `JOB_STATE_STOPPED` and `JOB_STATE_RUNNING` states, - by setting requested_state. 
`UpdateJob` may also be used to directly set - a job's requested state to `JOB_STATE_CANCELLED` or `JOB_STATE_DONE`, - irrevocably terminating the job if it has not already reached a terminal - state. + requestedState: The job's requested state. Applies to `UpdateJob` + requests. Set `requested_state` with `UpdateJob` requests to switch + between the states `JOB_STATE_STOPPED` and `JOB_STATE_RUNNING`. You can + also use `UpdateJob` requests to change a job's state from + `JOB_STATE_RUNNING` to `JOB_STATE_CANCELLED`, `JOB_STATE_DONE`, or + `JOB_STATE_DRAINED`. These states irrevocably terminate the job if it + hasn't already reached a terminal state. This field has no effect on + `CreateJob` requests. + runtimeUpdatableParams: This field may ONLY be modified at runtime using + the projects.jobs.update method to adjust job behavior. This field has + no effect when specified at job creation. + satisfiesPzi: Output only. Reserved for future use. This field is set only + in responses from the server; it is ignored if it is set in any + requests. satisfiesPzs: Reserved for future use. This field is set only in responses from the server; it is ignored if it is set in any requests. + serviceResources: Output only. Resources used by the Dataflow Service to + run the job. stageStates: This field may be mutated by the Cloud Dataflow service; callers cannot mutate it. startTime: The timestamp when the job was started (transitioned to @@ -2781,17 +3104,17 @@ class Job(_messages.Message): The supported files are: Google Cloud Storage: storage.googleapis.com/{bucket}/{object} bucket.storage.googleapis.com/{object} - transformNameMapping: The map of transform name prefixes of the job to be - replaced to the corresponding name prefixes of the new job. - type: The type of Cloud Dataflow job. + transformNameMapping: Optional. The map of transform name prefixes of the + job to be replaced to the corresponding name prefixes of the new job. + type: Optional. The type of Dataflow job. """ class CurrentStateValueValuesEnum(_messages.Enum): r"""The current state of the job. Jobs are created in the `JOB_STATE_STOPPED` state unless otherwise specified. A job in the `JOB_STATE_RUNNING` state may asynchronously enter a terminal state. After a job has reached a terminal state, no further state updates may be made. - This field may be mutated by the Cloud Dataflow service; callers cannot - mutate it. + This field might be mutated by the Dataflow service; callers cannot mutate + it. Values: JOB_STATE_UNKNOWN: The job's run state isn't specified. @@ -2859,11 +3182,13 @@ class CurrentStateValueValuesEnum(_messages.Enum): JOB_STATE_RESOURCE_CLEANING_UP = 12 class RequestedStateValueValuesEnum(_messages.Enum): - r"""The job's requested state. `UpdateJob` may be used to switch between - the `JOB_STATE_STOPPED` and `JOB_STATE_RUNNING` states, by setting - requested_state. `UpdateJob` may also be used to directly set a job's - requested state to `JOB_STATE_CANCELLED` or `JOB_STATE_DONE`, irrevocably - terminating the job if it has not already reached a terminal state. + r"""The job's requested state. Applies to `UpdateJob` requests. Set + `requested_state` with `UpdateJob` requests to switch between the states + `JOB_STATE_STOPPED` and `JOB_STATE_RUNNING`. You can also use `UpdateJob` + requests to change a job's state from `JOB_STATE_RUNNING` to + `JOB_STATE_CANCELLED`, `JOB_STATE_DONE`, or `JOB_STATE_DRAINED`. These + states irrevocably terminate the job if it hasn't already reached a + terminal state. 
This field has no effect on `CreateJob` requests. Values: JOB_STATE_UNKNOWN: The job's run state isn't specified. @@ -2931,7 +3256,7 @@ class RequestedStateValueValuesEnum(_messages.Enum): JOB_STATE_RESOURCE_CLEANING_UP = 12 class TypeValueValuesEnum(_messages.Enum): - r"""The type of Cloud Dataflow job. + r"""Optional. The type of Dataflow job. Values: JOB_TYPE_UNKNOWN: The type of the job is unspecified, or unknown. @@ -2975,8 +3300,8 @@ class AdditionalProperty(_messages.Message): @encoding.MapUnrecognizedFields('additionalProperties') class TransformNameMappingValue(_messages.Message): - r"""The map of transform name prefixes of the job to be replaced to the - corresponding name prefixes of the new job. + r"""Optional. The map of transform name prefixes of the job to be replaced + to the corresponding name prefixes of the new job. Messages: AdditionalProperty: An additional property for a @@ -3017,14 +3342,17 @@ class AdditionalProperty(_messages.Message): replaceJobId = _messages.StringField(15) replacedByJobId = _messages.StringField(16) requestedState = _messages.EnumField('RequestedStateValueValuesEnum', 17) - satisfiesPzs = _messages.BooleanField(18) - stageStates = _messages.MessageField('ExecutionStageState', 19, repeated=True) - startTime = _messages.StringField(20) - steps = _messages.MessageField('Step', 21, repeated=True) - stepsLocation = _messages.StringField(22) - tempFiles = _messages.StringField(23, repeated=True) - transformNameMapping = _messages.MessageField('TransformNameMappingValue', 24) - type = _messages.EnumField('TypeValueValuesEnum', 25) + runtimeUpdatableParams = _messages.MessageField('RuntimeUpdatableParams', 18) + satisfiesPzi = _messages.BooleanField(19) + satisfiesPzs = _messages.BooleanField(20) + serviceResources = _messages.MessageField('ServiceResources', 21) + stageStates = _messages.MessageField('ExecutionStageState', 22, repeated=True) + startTime = _messages.StringField(23) + steps = _messages.MessageField('Step', 24, repeated=True) + stepsLocation = _messages.StringField(25) + tempFiles = _messages.StringField(26, repeated=True) + transformNameMapping = _messages.MessageField('TransformNameMappingValue', 27) + type = _messages.EnumField('TypeValueValuesEnum', 28) class JobExecutionDetails(_messages.Message): @@ -3150,6 +3478,10 @@ class JobMetadata(_messages.Message): r"""Metadata available primarily for filtering jobs. Will be included in the ListJob response and Job SUMMARY view. + Messages: + UserDisplayPropertiesValue: List of display properties to help UI filter + jobs. + Fields: bigTableDetails: Identification of a Cloud Bigtable source used in the Dataflow job. @@ -3163,7 +3495,33 @@ class JobMetadata(_messages.Message): sdkVersion: The SDK version used to run the job. spannerDetails: Identification of a Spanner source used in the Dataflow job. + userDisplayProperties: List of display properties to help UI filter jobs. """ + @encoding.MapUnrecognizedFields('additionalProperties') + class UserDisplayPropertiesValue(_messages.Message): + r"""List of display properties to help UI filter jobs. + + Messages: + AdditionalProperty: An additional property for a + UserDisplayPropertiesValue object. + + Fields: + additionalProperties: Additional properties of type + UserDisplayPropertiesValue + """ + class AdditionalProperty(_messages.Message): + r"""An additional property for a UserDisplayPropertiesValue object. + + Fields: + key: Name of the additional property. + value: A string attribute. 
+ """ + + key = _messages.StringField(1) + value = _messages.StringField(2) + + additionalProperties = _messages.MessageField( + 'AdditionalProperty', 1, repeated=True) bigTableDetails = _messages.MessageField( 'BigTableIODetails', 1, repeated=True) @@ -3175,14 +3533,18 @@ class JobMetadata(_messages.Message): pubsubDetails = _messages.MessageField('PubSubIODetails', 5, repeated=True) sdkVersion = _messages.MessageField('SdkVersion', 6) spannerDetails = _messages.MessageField('SpannerIODetails', 7, repeated=True) + userDisplayProperties = _messages.MessageField( + 'UserDisplayPropertiesValue', 8) class JobMetrics(_messages.Message): r"""JobMetrics contains a collection of metrics describing the detailed progress of a Dataflow job. Metrics correspond to user-defined and system- - defined metrics in the job. This resource captures only the most recent - values of each metric; time-series data can be queried for them (under the - same metric names) from Cloud Monitoring. + defined metrics in the job. For more information, see [Dataflow job metrics] + (https://cloud.google.com/dataflow/docs/guides/using-monitoring-intf). This + resource captures only the most recent values of each metric; time-series + data can be queried for them (under the same metric names) from Cloud + Monitoring. Fields: metricTime: Timestamp as of which metric values are current. @@ -3380,7 +3742,10 @@ class LaunchFlexTemplateResponse(_messages.Message): class LaunchTemplateParameters(_messages.Message): - r"""Parameters to provide to the template being launched. + r"""Parameters to provide to the template being launched. Note that the + [metadata in the pipeline code] + (https://cloud.google.com/dataflow/docs/guides/templates/creating- + templates#metadata) determines which runtime parameters are valid. Messages: ParametersValue: The runtime parameters to pass to the job. @@ -3390,7 +3755,8 @@ class LaunchTemplateParameters(_messages.Message): Fields: environment: The runtime environment for the job. - jobName: Required. The job name to use for the created job. + jobName: Required. The job name to use for the created job. The name must + match the regular expression `[a-z]([-a-z0-9]{0,1022}[a-z0-9])?` parameters: The runtime parameters to pass to the job. transformNameMapping: Only applicable when updating a pipeline. Map of transform name prefixes of the job to be replaced to the corresponding @@ -3567,6 +3933,21 @@ class AdditionalProperty(_messages.Message): workItems = _messages.MessageField('WorkItem', 2, repeated=True) +class Linear(_messages.Message): + r"""Linear buckets with the following boundaries for indices in 0 to n-1. - + i in [0, n-1]: [start + (i)*width, start + (i+1)*width) + + Fields: + numberOfBuckets: Must be greater than 0. + start: Lower bound of the first bucket. + width: Distance between bucket boundaries. Must be greater than 0. + """ + + numberOfBuckets = _messages.IntegerField(1, variant=_messages.Variant.INT32) + start = _messages.FloatField(2) + width = _messages.FloatField(3) + + class ListJobMessagesResponse(_messages.Message): r"""Response to a request to list job messages. @@ -3778,6 +4159,49 @@ class MetricUpdate(_messages.Message): updateTime = _messages.StringField(11) +class MetricValue(_messages.Message): + r"""The value of a metric along with its name and labels. + + Messages: + MetricLabelsValue: Optional. Set of metric labels for this metric. + + Fields: + metric: Base name for this metric. + metricLabels: Optional. Set of metric labels for this metric. 
+ valueHistogram: Histogram value of this metric. + valueInt64: Integer value of this metric. + """ + @encoding.MapUnrecognizedFields('additionalProperties') + class MetricLabelsValue(_messages.Message): + r"""Optional. Set of metric labels for this metric. + + Messages: + AdditionalProperty: An additional property for a MetricLabelsValue + object. + + Fields: + additionalProperties: Additional properties of type MetricLabelsValue + """ + class AdditionalProperty(_messages.Message): + r"""An additional property for a MetricLabelsValue object. + + Fields: + key: Name of the additional property. + value: A string attribute. + """ + + key = _messages.StringField(1) + value = _messages.StringField(2) + + additionalProperties = _messages.MessageField( + 'AdditionalProperty', 1, repeated=True) + + metric = _messages.StringField(1) + metricLabels = _messages.MessageField('MetricLabelsValue', 2) + valueHistogram = _messages.MessageField('DataflowHistogramValue', 3) + valueInt64 = _messages.IntegerField(4) + + class MountedDataDisk(_messages.Message): r"""Describes mounted data disk. @@ -3843,6 +4267,24 @@ class KindValueValuesEnum(_messages.Enum): name = _messages.StringField(2) +class OutlierStats(_messages.Message): + r"""Statistics for the underflow and overflow bucket. + + Fields: + overflowCount: Number of values that are larger than the upper bound of + the largest bucket. + overflowMean: Mean of values in the overflow bucket. + underflowCount: Number of values that are smaller than the lower bound of + the smallest bucket. + underflowMean: Mean of values in the undeflow bucket. + """ + + overflowCount = _messages.IntegerField(1) + overflowMean = _messages.FloatField(2) + underflowCount = _messages.IntegerField(3) + underflowMean = _messages.FloatField(4) + + class Package(_messages.Message): r"""The packages that must be installed in order for a worker to run the steps of the Cloud Dataflow job that will be assigned to its worker pool. @@ -3964,13 +4406,32 @@ class ParameterMetadata(_messages.Message): Fields: customMetadata: Optional. Additional metadata for describing this parameter. + defaultValue: Optional. The default values will pre-populate the parameter + with the given value from the proto. If default_value is left empty, the + parameter will be populated with a default of the relevant type, e.g. + false for a boolean. + enumOptions: Optional. The options shown when ENUM ParameterType is + specified. + groupName: Optional. Specifies a group name for this parameter to be + rendered under. Group header text will be rendered exactly as specified + in this field. Only considered when parent_name is NOT provided. helpText: Required. The help text to display for the parameter. + hiddenUi: Optional. Whether the parameter should be hidden in the UI. isOptional: Optional. Whether the parameter is optional. Defaults to false. label: Required. The label to display for the parameter. name: Required. The name of the parameter. paramType: Optional. The type of the parameter. Used for selecting input picker. + parentName: Optional. Specifies the name of the parent parameter. Used in + conjunction with 'parent_trigger_values' to make this parameter + conditional (will only be rendered conditionally). Should be mappable to + a ParameterMetadata.name field. + parentTriggerValues: Optional. The value(s) of the 'parent_name' parameter + which will trigger this parameter to be shown. If left empty, ANY non- + empty value in parent_name will trigger this parameter to be shown. 
Only + considered when this parameter is conditional (when 'parent_name' has + been provided). regexes: Optional. Regexes that the parameter must match. """ class ParamTypeValueValuesEnum(_messages.Enum): @@ -3993,6 +4454,25 @@ class ParamTypeValueValuesEnum(_messages.Enum): write to. PUBSUB_TOPIC: The parameter specifies a Pub/Sub Topic. PUBSUB_SUBSCRIPTION: The parameter specifies a Pub/Sub Subscription. + BIGQUERY_TABLE: The parameter specifies a BigQuery table. + JAVASCRIPT_UDF_FILE: The parameter specifies a JavaScript UDF in Cloud + Storage. + SERVICE_ACCOUNT: The parameter specifies a Service Account email. + MACHINE_TYPE: The parameter specifies a Machine Type. + KMS_KEY_NAME: The parameter specifies a KMS Key name. + WORKER_REGION: The parameter specifies a Worker Region. + WORKER_ZONE: The parameter specifies a Worker Zone. + BOOLEAN: The parameter specifies a boolean input. + ENUM: The parameter specifies an enum input. + NUMBER: The parameter specifies a number input. + KAFKA_TOPIC: Deprecated. Please use KAFKA_READ_TOPIC instead. + KAFKA_READ_TOPIC: The parameter specifies the fully-qualified name of an + Apache Kafka topic. This can be either a Google Managed Kafka topic or + a non-managed Kafka topic. + KAFKA_WRITE_TOPIC: The parameter specifies the fully-qualified name of + an Apache Kafka topic. This can be an existing Google Managed Kafka + topic, the name for a new Google Managed Kafka topic, or an existing + non-managed Kafka topic. """ DEFAULT = 0 TEXT = 1 @@ -4004,6 +4484,19 @@ class ParamTypeValueValuesEnum(_messages.Enum): GCS_WRITE_FOLDER = 7 PUBSUB_TOPIC = 8 PUBSUB_SUBSCRIPTION = 9 + BIGQUERY_TABLE = 10 + JAVASCRIPT_UDF_FILE = 11 + SERVICE_ACCOUNT = 12 + MACHINE_TYPE = 13 + KMS_KEY_NAME = 14 + WORKER_REGION = 15 + WORKER_ZONE = 16 + BOOLEAN = 17 + ENUM = 18 + NUMBER = 19 + KAFKA_TOPIC = 20 + KAFKA_READ_TOPIC = 21 + KAFKA_WRITE_TOPIC = 22 @encoding.MapUnrecognizedFields('additionalProperties') class CustomMetadataValue(_messages.Message): @@ -4031,12 +4524,33 @@ class AdditionalProperty(_messages.Message): 'AdditionalProperty', 1, repeated=True) customMetadata = _messages.MessageField('CustomMetadataValue', 1) - helpText = _messages.StringField(2) - isOptional = _messages.BooleanField(3) - label = _messages.StringField(4) - name = _messages.StringField(5) - paramType = _messages.EnumField('ParamTypeValueValuesEnum', 6) - regexes = _messages.StringField(7, repeated=True) + defaultValue = _messages.StringField(2) + enumOptions = _messages.MessageField( + 'ParameterMetadataEnumOption', 3, repeated=True) + groupName = _messages.StringField(4) + helpText = _messages.StringField(5) + hiddenUi = _messages.BooleanField(6) + isOptional = _messages.BooleanField(7) + label = _messages.StringField(8) + name = _messages.StringField(9) + paramType = _messages.EnumField('ParamTypeValueValuesEnum', 10) + parentName = _messages.StringField(11) + parentTriggerValues = _messages.StringField(12, repeated=True) + regexes = _messages.StringField(13, repeated=True) + + +class ParameterMetadataEnumOption(_messages.Message): + r"""ParameterMetadataEnumOption specifies the option shown in the enum form. + + Fields: + description: Optional. The description to display for the enum option. + label: Optional. The label to display for the enum option. + value: Required. The value of the enum option. 
+ """ + + description = _messages.StringField(1) + label = _messages.StringField(2) + value = _messages.StringField(3) class PartialGroupByKeyInstruction(_messages.Message): @@ -4119,6 +4633,36 @@ class AdditionalProperty(_messages.Message): valueCombiningFn = _messages.MessageField('ValueCombiningFnValue', 6) +class PerStepNamespaceMetrics(_messages.Message): + r"""Metrics for a particular unfused step and namespace. A metric is + uniquely identified by the `metrics_namespace`, `original_step`, `metric + name` and `metric_labels`. + + Fields: + metricValues: Optional. Metrics that are recorded for this namespace and + unfused step. + metricsNamespace: The namespace of these metrics on the worker. + originalStep: The original system name of the unfused step that these + metrics are reported from. + """ + + metricValues = _messages.MessageField('MetricValue', 1, repeated=True) + metricsNamespace = _messages.StringField(2) + originalStep = _messages.StringField(3) + + +class PerWorkerMetrics(_messages.Message): + r"""Per worker metrics. + + Fields: + perStepNamespaceMetrics: Optional. Metrics for a particular unfused step + and namespace. + """ + + perStepNamespaceMetrics = _messages.MessageField( + 'PerStepNamespaceMetrics', 1, repeated=True) + + class PipelineDescription(_messages.Message): r"""A descriptive representation of submitted pipeline as well as the executed form. This data is provided by the Dataflow service for ease of @@ -4130,6 +4674,8 @@ class PipelineDescription(_messages.Message): pipeline. originalPipelineTransform: Description of each transform in the pipeline and collections between them. + stepNamesHash: A hash value of the submitted pipeline portable graph step + names if exists. """ displayData = _messages.MessageField('DisplayData', 1, repeated=True) @@ -4137,6 +4683,7 @@ class PipelineDescription(_messages.Message): 'ExecutionStageSummary', 2, repeated=True) originalPipelineTransform = _messages.MessageField( 'TransformSummary', 3, repeated=True) + stepNamesHash = _messages.StringField(4) class Point(_messages.Message): @@ -4207,6 +4754,8 @@ class PubsubLocation(_messages.Message): Fields: dropLateData: Indicates whether the pipeline allows late-arriving data. + dynamicDestinations: If true, then this location represents dynamic + topics. idLabel: If set, contains a pubsub label from which to extract record ids. If left empty, record deduplication will be strictly best effort. subscription: A pubsub subscription, in the form of @@ -4222,12 +4771,13 @@ class PubsubLocation(_messages.Message): """ dropLateData = _messages.BooleanField(1) - idLabel = _messages.StringField(2) - subscription = _messages.StringField(3) - timestampLabel = _messages.StringField(4) - topic = _messages.StringField(5) - trackingSubscription = _messages.StringField(6) - withAttributes = _messages.BooleanField(7) + dynamicDestinations = _messages.BooleanField(2) + idLabel = _messages.StringField(3) + subscription = _messages.StringField(4) + timestampLabel = _messages.StringField(5) + topic = _messages.StringField(6) + trackingSubscription = _messages.StringField(7) + withAttributes = _messages.BooleanField(8) class PubsubSnapshotMetadata(_messages.Message): @@ -4244,31 +4794,6 @@ class PubsubSnapshotMetadata(_messages.Message): topicName = _messages.StringField(3) -class QueryInfo(_messages.Message): - r"""Information about a validated query. - - Enums: - QueryPropertyValueListEntryValuesEnum: - - Fields: - queryProperty: Includes an entry for each satisfied QueryProperty. 
- """ - class QueryPropertyValueListEntryValuesEnum(_messages.Enum): - r"""QueryPropertyValueListEntryValuesEnum enum type. - - Values: - QUERY_PROPERTY_UNSPECIFIED: The query property is unknown or - unspecified. - HAS_UNBOUNDED_SOURCE: Indicates this query reads from >= 1 unbounded - source. - """ - QUERY_PROPERTY_UNSPECIFIED = 0 - HAS_UNBOUNDED_SOURCE = 1 - - queryProperty = _messages.EnumField( - 'QueryPropertyValueListEntryValuesEnum', 1, repeated=True) - - class ReadInstruction(_messages.Message): r"""An instruction that reads records. Takes no inputs, produces one output. @@ -4448,69 +4973,88 @@ class ResourceUtilizationReportResponse(_messages.Message): class RuntimeEnvironment(_messages.Message): - r"""The environment values to set at runtime. + r"""The environment values to set at runtime. LINT.IfChange Enums: - IpConfigurationValueValuesEnum: Configuration for VM IPs. + IpConfigurationValueValuesEnum: Optional. Configuration for VM IPs. + StreamingModeValueValuesEnum: Optional. Specifies the Streaming Engine + message processing guarantees. Reduces cost and latency but might result + in duplicate messages committed to storage. Designed to run simple + mapping streaming ETL jobs at the lowest cost. For example, Change Data + Capture (CDC) to BigQuery is a canonical use case. For more information, + see [Set the pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). Messages: - AdditionalUserLabelsValue: Additional user labels to be specified for the - job. Keys and values should follow the restrictions specified in the - [labeling restrictions](https://cloud.google.com/compute/docs/labeling- + AdditionalUserLabelsValue: Optional. Additional user labels to be + specified for the job. Keys and values should follow the restrictions + specified in the [labeling + restrictions](https://cloud.google.com/compute/docs/labeling- resources#restrictions) page. An object containing a list of "key": value pairs. Example: { "name": "wrench", "mass": "1kg", "count": "3" }. Fields: - additionalExperiments: Additional experiment flags for the job, specified - with the `--experiments` option. - additionalUserLabels: Additional user labels to be specified for the job. - Keys and values should follow the restrictions specified in the + additionalExperiments: Optional. Additional experiment flags for the job, + specified with the `--experiments` option. + additionalUserLabels: Optional. Additional user labels to be specified for + the job. Keys and values should follow the restrictions specified in the [labeling restrictions](https://cloud.google.com/compute/docs/labeling- resources#restrictions) page. An object containing a list of "key": value pairs. Example: { "name": "wrench", "mass": "1kg", "count": "3" }. - bypassTempDirValidation: Whether to bypass the safety checks for the job's - temporary directory. Use with caution. - enableStreamingEngine: Whether to enable Streaming Engine for the job. - ipConfiguration: Configuration for VM IPs. - kmsKeyName: Name for the Cloud KMS key for the job. Key format is: - projects//locations//keyRings//cryptoKeys/ - machineType: The machine type to use for the job. Defaults to the value - from the template if not specified. - maxWorkers: The maximum number of Google Compute Engine instances to be - made available to your pipeline during execution, from 1 to 1000. - network: Network to which VMs will be assigned. If empty or unspecified, - the service will use the network "default". 
- numWorkers: The initial number of Google Compute Engine instnaces for the - job. - serviceAccountEmail: The email address of the service account to run the - job as. - subnetwork: Subnetwork to which VMs will be assigned, if desired. You can - specify a subnetwork using either a complete URL or an abbreviated path. - Expected to be of the form "https://www.googleapis.com/compute/v1/projec - ts/HOST_PROJECT_ID/regions/REGION/subnetworks/SUBNETWORK" or - "regions/REGION/subnetworks/SUBNETWORK". If the subnetwork is located in - a Shared VPC network, you must use the complete URL. - tempLocation: The Cloud Storage path to use for temporary files. Must be a - valid Cloud Storage URL, beginning with `gs://`. - workerRegion: The Compute Engine region + bypassTempDirValidation: Optional. Whether to bypass the safety checks for + the job's temporary directory. Use with caution. + diskSizeGb: Optional. The disk size, in gigabytes, to use on each remote + Compute Engine worker instance. + enableStreamingEngine: Optional. Whether to enable Streaming Engine for + the job. + ipConfiguration: Optional. Configuration for VM IPs. + kmsKeyName: Optional. Name for the Cloud KMS key for the job. Key format + is: projects//locations//keyRings//cryptoKeys/ + machineType: Optional. The machine type to use for the job. Defaults to + the value from the template if not specified. + maxWorkers: Optional. The maximum number of Google Compute Engine + instances to be made available to your pipeline during execution, from 1 + to 1000. The default value is 1. + network: Optional. Network to which VMs will be assigned. If empty or + unspecified, the service will use the network "default". + numWorkers: Optional. The initial number of Google Compute Engine + instances for the job. The default value is 11. + serviceAccountEmail: Optional. The email address of the service account to + run the job as. + streamingMode: Optional. Specifies the Streaming Engine message processing + guarantees. Reduces cost and latency but might result in duplicate + messages committed to storage. Designed to run simple mapping streaming + ETL jobs at the lowest cost. For example, Change Data Capture (CDC) to + BigQuery is a canonical use case. For more information, see [Set the + pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). + subnetwork: Optional. Subnetwork to which VMs will be assigned, if + desired. You can specify a subnetwork using either a complete URL or an + abbreviated path. Expected to be of the form "https://www.googleapis.com + /compute/v1/projects/HOST_PROJECT_ID/regions/REGION/subnetworks/SUBNETWO + RK" or "regions/REGION/subnetworks/SUBNETWORK". If the subnetwork is + located in a Shared VPC network, you must use the complete URL. + tempLocation: Required. The Cloud Storage path to use for temporary files. + Must be a valid Cloud Storage URL, beginning with `gs://`. + workerRegion: Required. The Compute Engine region (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in which worker processing should occur, e.g. "us-west1". Mutually exclusive with worker_zone. If neither worker_region nor worker_zone is specified, default to the control plane's region. - workerZone: The Compute Engine zone + workerZone: Optional. The Compute Engine zone (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in which worker processing should occur, e.g. "us-west1-a". Mutually exclusive with worker_region. 
If neither worker_region nor worker_zone is specified, a zone in the control plane's region is chosen based on available capacity. If both `worker_zone` and `zone` are set, `worker_zone` takes precedence. - zone: The Compute Engine [availability + zone: Optional. The Compute Engine [availability zone](https://cloud.google.com/compute/docs/regions-zones/regions-zones) for launching worker instances to run your pipeline. In the future, worker_zone will take precedence. """ class IpConfigurationValueValuesEnum(_messages.Enum): - r"""Configuration for VM IPs. + r"""Optional. Configuration for VM IPs. Values: WORKER_IP_UNSPECIFIED: The configuration is unknown, or unspecified. @@ -4521,10 +5065,33 @@ class IpConfigurationValueValuesEnum(_messages.Enum): WORKER_IP_PUBLIC = 1 WORKER_IP_PRIVATE = 2 + class StreamingModeValueValuesEnum(_messages.Enum): + r"""Optional. Specifies the Streaming Engine message processing + guarantees. Reduces cost and latency but might result in duplicate + messages committed to storage. Designed to run simple mapping streaming + ETL jobs at the lowest cost. For example, Change Data Capture (CDC) to + BigQuery is a canonical use case. For more information, see [Set the + pipeline streaming + mode](https://cloud.google.com/dataflow/docs/guides/streaming-modes). + + Values: + STREAMING_MODE_UNSPECIFIED: Run in the default mode. + STREAMING_MODE_EXACTLY_ONCE: In this mode, message deduplication is + performed against persistent state to make sure each message is + processed and committed to storage exactly once. + STREAMING_MODE_AT_LEAST_ONCE: Message deduplication is not performed. + Messages might be processed multiple times, and the results are + applied multiple times. Note: Setting this value also enables + Streaming Engine and Streaming Engine resource-based billing. + """ + STREAMING_MODE_UNSPECIFIED = 0 + STREAMING_MODE_EXACTLY_ONCE = 1 + STREAMING_MODE_AT_LEAST_ONCE = 2 + @encoding.MapUnrecognizedFields('additionalProperties') class AdditionalUserLabelsValue(_messages.Message): - r"""Additional user labels to be specified for the job. Keys and values - should follow the restrictions specified in the [labeling + r"""Optional. Additional user labels to be specified for the job. Keys and + values should follow the restrictions specified in the [labeling restrictions](https://cloud.google.com/compute/docs/labeling- resources#restrictions) page. An object containing a list of "key": value pairs. Example: { "name": "wrench", "mass": "1kg", "count": "3" }. 
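# --- Illustrative example (not part of the generated diff above) -----------
# A minimal sketch of how the new optional RuntimeEnvironment fields described
# above (diskSizeGb, streamingMode) could be populated when launching a classic
# template through these generated apitools messages. The import path assumes
# the vendored Beam Dataflow client module; the bucket, job name, and label
# values are hypothetical placeholders.
from apache_beam.runners.dataflow.internal.clients import dataflow as df

env = df.RuntimeEnvironment(
    tempLocation='gs://my-bucket/tmp',  # required Cloud Storage temp path
    maxWorkers=10,
    diskSizeGb=50,  # new optional worker disk size field
    streamingMode=(
        df.RuntimeEnvironment.StreamingModeValueValuesEnum
        .STREAMING_MODE_AT_LEAST_ONCE),  # new optional streaming mode enum
    additionalUserLabels=df.RuntimeEnvironment.AdditionalUserLabelsValue(
        additionalProperties=[
            df.RuntimeEnvironment.AdditionalUserLabelsValue.AdditionalProperty(
                key='team', value='data-eng'),
        ]),
)

# The environment is then attached to the launch request parameters; the job
# name must match the regular expression noted above.
launch_params = df.LaunchTemplateParameters(
    jobName='example-job', environment=env)
# ----------------------------------------------------------------------------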
@@ -4554,19 +5121,21 @@ class AdditionalProperty(_messages.Message): additionalExperiments = _messages.StringField(1, repeated=True) additionalUserLabels = _messages.MessageField('AdditionalUserLabelsValue', 2) bypassTempDirValidation = _messages.BooleanField(3) - enableStreamingEngine = _messages.BooleanField(4) - ipConfiguration = _messages.EnumField('IpConfigurationValueValuesEnum', 5) - kmsKeyName = _messages.StringField(6) - machineType = _messages.StringField(7) - maxWorkers = _messages.IntegerField(8, variant=_messages.Variant.INT32) - network = _messages.StringField(9) - numWorkers = _messages.IntegerField(10, variant=_messages.Variant.INT32) - serviceAccountEmail = _messages.StringField(11) - subnetwork = _messages.StringField(12) - tempLocation = _messages.StringField(13) - workerRegion = _messages.StringField(14) - workerZone = _messages.StringField(15) - zone = _messages.StringField(16) + diskSizeGb = _messages.IntegerField(4, variant=_messages.Variant.INT32) + enableStreamingEngine = _messages.BooleanField(5) + ipConfiguration = _messages.EnumField('IpConfigurationValueValuesEnum', 6) + kmsKeyName = _messages.StringField(7) + machineType = _messages.StringField(8) + maxWorkers = _messages.IntegerField(9, variant=_messages.Variant.INT32) + network = _messages.StringField(10) + numWorkers = _messages.IntegerField(11, variant=_messages.Variant.INT32) + serviceAccountEmail = _messages.StringField(12) + streamingMode = _messages.EnumField('StreamingModeValueValuesEnum', 13) + subnetwork = _messages.StringField(14) + tempLocation = _messages.StringField(15) + workerRegion = _messages.StringField(16) + workerZone = _messages.StringField(17) + zone = _messages.StringField(18) class RuntimeMetadata(_messages.Message): @@ -4581,6 +5150,29 @@ class RuntimeMetadata(_messages.Message): sdkInfo = _messages.MessageField('SDKInfo', 2) +class RuntimeUpdatableParams(_messages.Message): + r"""Additional job parameters that can only be updated during runtime using + the projects.jobs.update method. These fields have no effect when specified + during job creation. + + Fields: + maxNumWorkers: The maximum number of workers to cap autoscaling at. This + field is currently only supported for Streaming Engine jobs. + minNumWorkers: The minimum number of workers to scale down to. This field + is currently only supported for Streaming Engine jobs. + workerUtilizationHint: Target worker utilization, compared against the + aggregate utilization of the worker pool by autoscaler, to determine + upscaling and downscaling when absent other constraints such as backlog. + For more information, see [Update an existing + pipeline](https://cloud.google.com/dataflow/docs/guides/updating-a- + pipeline). + """ + + maxNumWorkers = _messages.IntegerField(1, variant=_messages.Variant.INT32) + minNumWorkers = _messages.IntegerField(2, variant=_messages.Variant.INT32) + workerUtilizationHint = _messages.FloatField(3) + + class SDKInfo(_messages.Message): r"""SDK Information. @@ -4598,22 +5190,75 @@ class LanguageValueValuesEnum(_messages.Enum): UNKNOWN: UNKNOWN Language. JAVA: Java. PYTHON: Python. + GO: Go. """ UNKNOWN = 0 JAVA = 1 PYTHON = 2 + GO = 3 language = _messages.EnumField('LanguageValueValuesEnum', 1) version = _messages.StringField(2) +class SdkBug(_messages.Message): + r"""A bug found in the Dataflow SDK. + + Enums: + SeverityValueValuesEnum: Output only. How severe the SDK bug is. + TypeValueValuesEnum: Output only. Describes the impact of this SDK bug. + + Fields: + severity: Output only. 
How severe the SDK bug is. + type: Output only. Describes the impact of this SDK bug. + uri: Output only. Link to more information on the bug. + """ + class SeverityValueValuesEnum(_messages.Enum): + r"""Output only. How severe the SDK bug is. + + Values: + SEVERITY_UNSPECIFIED: A bug of unknown severity. + NOTICE: A minor bug that that may reduce reliability or performance for + some jobs. Impact will be minimal or non-existent for most jobs. + WARNING: A bug that has some likelihood of causing performance + degradation, data loss, or job failures. + SEVERE: A bug with extremely significant impact. Jobs may fail + erroneously, performance may be severely degraded, and data loss may + be very likely. + """ + SEVERITY_UNSPECIFIED = 0 + NOTICE = 1 + WARNING = 2 + SEVERE = 3 + + class TypeValueValuesEnum(_messages.Enum): + r"""Output only. Describes the impact of this SDK bug. + + Values: + TYPE_UNSPECIFIED: Unknown issue with this SDK. + GENERAL: Catch-all for SDK bugs that don't fit in the below categories. + PERFORMANCE: Using this version of the SDK may result in degraded + performance. + DATALOSS: Using this version of the SDK may cause data loss. + """ + TYPE_UNSPECIFIED = 0 + GENERAL = 1 + PERFORMANCE = 2 + DATALOSS = 3 + + severity = _messages.EnumField('SeverityValueValuesEnum', 1) + type = _messages.EnumField('TypeValueValuesEnum', 2) + uri = _messages.StringField(3) + + class SdkHarnessContainerImage(_messages.Message): - r"""Defines a SDK harness container for executing Dataflow pipelines. + r"""Defines an SDK harness container for executing Dataflow pipelines. Fields: capabilities: The set of capabilities enumerated in the above Environment - proto. See also https://github.com/apache/beam/blob/master/model/pipelin - e/src/main/proto/beam_runner_api.proto + proto. See also [beam_runner_api.proto](https://github.com/apache/beam/b + lob/master/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/ + v1/beam_runner_api.proto) containerImage: A docker container image that resides in Google Container Registry. environmentId: Environment ID for the Beam runner API proto Environment @@ -4638,6 +5283,7 @@ class SdkVersion(_messages.Message): SdkSupportStatusValueValuesEnum: The support status for this SDK version. Fields: + bugs: Output only. Known bugs found in this SDK version. sdkSupportStatus: The support status for this SDK version. version: The version of the SDK used to run the job. versionDisplayName: A readable string describing the version of the SDK. @@ -4661,9 +5307,10 @@ class SdkSupportStatusValueValuesEnum(_messages.Enum): DEPRECATED = 3 UNSUPPORTED = 4 - sdkSupportStatus = _messages.EnumField('SdkSupportStatusValueValuesEnum', 1) - version = _messages.StringField(2) - versionDisplayName = _messages.StringField(3) + bugs = _messages.MessageField('SdkBug', 1, repeated=True) + sdkSupportStatus = _messages.EnumField('SdkSupportStatusValueValuesEnum', 2) + version = _messages.StringField(3) + versionDisplayName = _messages.StringField(4) class SendDebugCaptureRequest(_messages.Message): @@ -4796,6 +5443,17 @@ class SeqMapTaskOutputInfo(_messages.Message): tag = _messages.StringField(2) +class ServiceResources(_messages.Message): + r"""Resources used by the Dataflow Service to run the job. + + Fields: + zones: Output only. List of Cloud Zones being used by the Dataflow Service + for this job. 
Example: us-central1-c + """ + + zones = _messages.StringField(1, repeated=True) + + class ShellTask(_messages.Message): r"""A task which consists of a shell command for the worker to execute. @@ -5381,6 +6039,7 @@ class StageSummary(_messages.Message): stageId: ID of this stage startTime: Start time of this stage. state: State of this stage. + stragglerSummary: Straggler summary for this stage. """ class StateValueValuesEnum(_messages.Enum): r"""State of this stage. @@ -5406,6 +6065,7 @@ class StateValueValuesEnum(_messages.Enum): stageId = _messages.StringField(4) startTime = _messages.StringField(5) state = _messages.EnumField('StateValueValuesEnum', 6) + stragglerSummary = _messages.MessageField('StragglerSummary', 7) class StandardQueryParameters(_messages.Message): @@ -5536,15 +6196,16 @@ class Step(_messages.Message): r"""Defines a particular step within a Cloud Dataflow job. A job consists of multiple steps, each of which performs some specific operation as part of the overall job. Data is typically passed from one step to another as part - of the job. Here's an example of a sequence of steps which together - implement a Map-Reduce job: * Read a collection of data from some source, - parsing the collection's elements. * Validate the elements. * Apply a user- - defined function to map each element to some value and extract an element- - specific key value. * Group elements with the same key into a single element - with that key, transforming a multiply-keyed collection into a uniquely- - keyed collection. * Write the elements out to some data sink. Note that the - Cloud Dataflow service may be used to run many different types of jobs, not - just Map-Reduce. + of the job. **Note:** The properties of this object are not stable and might + change. Here's an example of a sequence of steps which together implement a + Map-Reduce job: * Read a collection of data from some source, parsing the + collection's elements. * Validate the elements. * Apply a user-defined + function to map each element to some value and extract an element-specific + key value. * Group elements with the same key into a single element with + that key, transforming a multiply-keyed collection into a uniquely-keyed + collection. * Write the elements out to some data sink. Note that the Cloud + Dataflow service may be used to run many different types of jobs, not just + Map-Reduce. Messages: PropertiesValue: Named properties associated with the step. Each kind of @@ -5590,6 +6251,120 @@ class AdditionalProperty(_messages.Message): properties = _messages.MessageField('PropertiesValue', 3) +class Straggler(_messages.Message): + r"""Information for a straggler. + + Fields: + batchStraggler: Batch straggler identification and debugging information. + streamingStraggler: Streaming straggler identification and debugging + information. + """ + + batchStraggler = _messages.MessageField('StragglerInfo', 1) + streamingStraggler = _messages.MessageField('StreamingStragglerInfo', 2) + + +class StragglerDebuggingInfo(_messages.Message): + r"""Information useful for debugging a straggler. Each type will provide + specialized debugging information relevant for a particular cause. The + StragglerDebuggingInfo will be 1:1 mapping to the StragglerCause enum. + + Fields: + hotKey: Hot key debugging details. + """ + + hotKey = _messages.MessageField('HotKeyDebuggingInfo', 1) + + +class StragglerInfo(_messages.Message): + r"""Information useful for straggler identification and debugging. 
+ + Messages: + CausesValue: The straggler causes, keyed by the string representation of + the StragglerCause enum and contains specialized debugging information + for each straggler cause. + + Fields: + causes: The straggler causes, keyed by the string representation of the + StragglerCause enum and contains specialized debugging information for + each straggler cause. + startTime: The time when the work item attempt became a straggler. + """ + @encoding.MapUnrecognizedFields('additionalProperties') + class CausesValue(_messages.Message): + r"""The straggler causes, keyed by the string representation of the + StragglerCause enum and contains specialized debugging information for + each straggler cause. + + Messages: + AdditionalProperty: An additional property for a CausesValue object. + + Fields: + additionalProperties: Additional properties of type CausesValue + """ + class AdditionalProperty(_messages.Message): + r"""An additional property for a CausesValue object. + + Fields: + key: Name of the additional property. + value: A StragglerDebuggingInfo attribute. + """ + + key = _messages.StringField(1) + value = _messages.MessageField('StragglerDebuggingInfo', 2) + + additionalProperties = _messages.MessageField( + 'AdditionalProperty', 1, repeated=True) + + causes = _messages.MessageField('CausesValue', 1) + startTime = _messages.StringField(2) + + +class StragglerSummary(_messages.Message): + r"""Summarized straggler identification details. + + Messages: + StragglerCauseCountValue: Aggregated counts of straggler causes, keyed by + the string representation of the StragglerCause enum. + + Fields: + recentStragglers: The most recent stragglers. + stragglerCauseCount: Aggregated counts of straggler causes, keyed by the + string representation of the StragglerCause enum. + totalStragglerCount: The total count of stragglers. + """ + @encoding.MapUnrecognizedFields('additionalProperties') + class StragglerCauseCountValue(_messages.Message): + r"""Aggregated counts of straggler causes, keyed by the string + representation of the StragglerCause enum. + + Messages: + AdditionalProperty: An additional property for a + StragglerCauseCountValue object. + + Fields: + additionalProperties: Additional properties of type + StragglerCauseCountValue + """ + class AdditionalProperty(_messages.Message): + r"""An additional property for a StragglerCauseCountValue object. + + Fields: + key: Name of the additional property. + value: A string attribute. + """ + + key = _messages.StringField(1) + value = _messages.IntegerField(2) + + additionalProperties = _messages.MessageField( + 'AdditionalProperty', 1, repeated=True) + + recentStragglers = _messages.MessageField('Straggler', 1, repeated=True) + stragglerCauseCount = _messages.MessageField('StragglerCauseCountValue', 2) + totalStragglerCount = _messages.IntegerField(3) + + class StreamLocation(_messages.Message): r"""Describes a stream of data, either as input to be processed or as output of a streaming Dataflow job. @@ -5736,6 +6511,8 @@ class StreamingConfigTask(_messages.Message): harness to windmill. maxWorkItemCommitBytes: Maximum size for work item commit supported windmill storage layer. + operationalLimits: Operational limits for the streaming job. Can be used + by the worker to validate outputs sent to the backend. streamingComputationConfigs: Set of computation configuration information. userStepToStateFamilyNameMap: Map from user step names to state families. 
windmillServiceEndpoint: If present, the worker must use this endpoint to @@ -5775,12 +6552,80 @@ class AdditionalProperty(_messages.Message): commitStreamChunkSizeBytes = _messages.IntegerField(1) getDataStreamChunkSizeBytes = _messages.IntegerField(2) maxWorkItemCommitBytes = _messages.IntegerField(3) + operationalLimits = _messages.MessageField('StreamingOperationalLimits', 4) streamingComputationConfigs = _messages.MessageField( - 'StreamingComputationConfig', 4, repeated=True) + 'StreamingComputationConfig', 5, repeated=True) userStepToStateFamilyNameMap = _messages.MessageField( - 'UserStepToStateFamilyNameMapValue', 5) - windmillServiceEndpoint = _messages.StringField(6) - windmillServicePort = _messages.IntegerField(7) + 'UserStepToStateFamilyNameMapValue', 6) + windmillServiceEndpoint = _messages.StringField(7) + windmillServicePort = _messages.IntegerField(8) + + +class StreamingOperationalLimits(_messages.Message): + r"""Operational limits imposed on streaming jobs by the backend. + + Fields: + maxBagElementBytes: The maximum size for an element in bag state. + maxGlobalDataBytes: The maximum size for an element in global data. + maxKeyBytes: The maximum size allowed for a key. + maxProductionOutputBytes: The maximum size for a single output element. + maxSortedListElementBytes: The maximum size for an element in sorted list + state. + maxSourceStateBytes: The maximum size for a source state update. + maxTagBytes: The maximum size for a state tag. + maxValueBytes: The maximum size for a value state field. + """ + + maxBagElementBytes = _messages.IntegerField(1) + maxGlobalDataBytes = _messages.IntegerField(2) + maxKeyBytes = _messages.IntegerField(3) + maxProductionOutputBytes = _messages.IntegerField(4) + maxSortedListElementBytes = _messages.IntegerField(5) + maxSourceStateBytes = _messages.IntegerField(6) + maxTagBytes = _messages.IntegerField(7) + maxValueBytes = _messages.IntegerField(8) + + +class StreamingScalingReport(_messages.Message): + r"""Contains per-user worker telemetry used in streaming autoscaling. + + Fields: + activeBundleCount: A integer attribute. + activeThreadCount: Current acive thread count. + maximumBundleCount: Maximum bundle count. + maximumBytes: Maximum bytes. + maximumBytesCount: A integer attribute. + maximumThreadCount: Maximum thread count limit. + outstandingBundleCount: Current outstanding bundle count. + outstandingBytes: Current outstanding bytes. + outstandingBytesCount: A integer attribute. + """ + + activeBundleCount = _messages.IntegerField(1, variant=_messages.Variant.INT32) + activeThreadCount = _messages.IntegerField(2, variant=_messages.Variant.INT32) + maximumBundleCount = _messages.IntegerField( + 3, variant=_messages.Variant.INT32) + maximumBytes = _messages.IntegerField(4) + maximumBytesCount = _messages.IntegerField(5, variant=_messages.Variant.INT32) + maximumThreadCount = _messages.IntegerField( + 6, variant=_messages.Variant.INT32) + outstandingBundleCount = _messages.IntegerField( + 7, variant=_messages.Variant.INT32) + outstandingBytes = _messages.IntegerField(8) + outstandingBytesCount = _messages.IntegerField( + 9, variant=_messages.Variant.INT32) + + +class StreamingScalingReportResponse(_messages.Message): + r"""Contains per-user-worker streaming scaling recommendation from the + backend. 
+ + Fields: + maximumThreadCount: Maximum thread count limit; + """ + + maximumThreadCount = _messages.IntegerField( + 1, variant=_messages.Variant.INT32) class StreamingSetupTask(_messages.Message): @@ -5829,6 +6674,26 @@ class StreamingStageLocation(_messages.Message): streamId = _messages.StringField(1) +class StreamingStragglerInfo(_messages.Message): + r"""Information useful for streaming straggler identification and debugging. + + Fields: + dataWatermarkLag: The event-time watermark lag at the time of the + straggler detection. + endTime: End time of this straggler. + startTime: Start time of this straggler. + systemWatermarkLag: The system watermark lag at the time of the straggler + detection. + workerName: Name of the worker where the straggler was detected. + """ + + dataWatermarkLag = _messages.StringField(1) + endTime = _messages.StringField(2) + startTime = _messages.StringField(3) + systemWatermarkLag = _messages.StringField(4) + workerName = _messages.StringField(5) + + class StringList(_messages.Message): r"""A metric value representing a list of strings. @@ -5923,14 +6788,27 @@ class TemplateMetadata(_messages.Message): r"""Metadata describing a template. Fields: + defaultStreamingMode: Optional. Indicates the default streaming mode for a + streaming template. Only valid if both supports_at_least_once and + supports_exactly_once are true. Possible values: UNSPECIFIED, + EXACTLY_ONCE and AT_LEAST_ONCE description: Optional. A description of the template. name: Required. The name of the template. parameters: The parameters for the template. + streaming: Optional. Indicates if the template is streaming or not. + supportsAtLeastOnce: Optional. Indicates if the streaming template + supports at least once mode. + supportsExactlyOnce: Optional. Indicates if the streaming template + supports exactly once mode. """ - description = _messages.StringField(1) - name = _messages.StringField(2) - parameters = _messages.MessageField('ParameterMetadata', 3, repeated=True) + defaultStreamingMode = _messages.StringField(1) + description = _messages.StringField(2) + name = _messages.StringField(3) + parameters = _messages.MessageField('ParameterMetadata', 4, repeated=True) + streaming = _messages.BooleanField(5) + supportsAtLeastOnce = _messages.BooleanField(6) + supportsExactlyOnce = _messages.BooleanField(7) class TopologyConfig(_messages.Message): @@ -6036,19 +6914,6 @@ class KindValueValuesEnum(_messages.Enum): outputCollectionName = _messages.StringField(6, repeated=True) -class ValidateResponse(_messages.Message): - r"""Response to the validation request. - - Fields: - errorMessage: Will be empty if validation succeeds. - queryInfo: Information about the validated query. Not defined if - validation fails. - """ - - errorMessage = _messages.StringField(1) - queryInfo = _messages.MessageField('QueryInfo', 2) - - class WorkItem(_messages.Message): r"""WorkItem represents basic information about a WorkItem to be executed in the cloud. @@ -6110,6 +6975,7 @@ class WorkItemDetails(_messages.Message): progress: Progress of this work item. startTime: Start time of this work item attempt. state: State of this work item. + stragglerInfo: Information about straggler detections for this work item. taskId: Name of this work item. 
""" class StateValueValuesEnum(_messages.Enum): @@ -6136,7 +7002,8 @@ class StateValueValuesEnum(_messages.Enum): progress = _messages.MessageField('ProgressTimeseries', 4) startTime = _messages.StringField(5) state = _messages.EnumField('StateValueValuesEnum', 6) - taskId = _messages.StringField(7) + stragglerInfo = _messages.MessageField('StragglerInfo', 7) + taskId = _messages.StringField(8) class WorkItemServiceState(_messages.Message): @@ -6459,6 +7326,8 @@ class WorkerMessage(_messages.Message): not be used here. Fields: + dataSamplingReport: Optional. Contains metrics related to go/dataflow- + data-sampling-telemetry. labels: Labels are used to group WorkerMessages. For example, a worker_message about a particular container might have the labels: { "JOB_ID": "2015-04-22", "WORKER_ID": "wordcount-vm-2015..." @@ -6466,12 +7335,16 @@ class WorkerMessage(_messages.Message): typically correspond to Label enum values. However, for ease of development other strings can be used as tags. LABEL_UNSPECIFIED should not be used here. + perWorkerMetrics: System defined metrics for this worker. + streamingScalingReport: Contains per-user worker telemetry used in + streaming autoscaling. time: The timestamp of the worker_message. workerHealthReport: The health of a worker. workerLifecycleEvent: Record of worker lifecycle events. workerMessageCode: A worker message code. workerMetrics: Resource metrics reported by workers. workerShutdownNotice: Shutdown notice by workers. + workerThreadScalingReport: Thread scaling information reported by workers. """ @encoding.MapUnrecognizedFields('additionalProperties') class LabelsValue(_messages.Message): @@ -6502,13 +7375,18 @@ class AdditionalProperty(_messages.Message): additionalProperties = _messages.MessageField( 'AdditionalProperty', 1, repeated=True) - labels = _messages.MessageField('LabelsValue', 1) - time = _messages.StringField(2) - workerHealthReport = _messages.MessageField('WorkerHealthReport', 3) - workerLifecycleEvent = _messages.MessageField('WorkerLifecycleEvent', 4) - workerMessageCode = _messages.MessageField('WorkerMessageCode', 5) - workerMetrics = _messages.MessageField('ResourceUtilizationReport', 6) - workerShutdownNotice = _messages.MessageField('WorkerShutdownNotice', 7) + dataSamplingReport = _messages.MessageField('DataSamplingReport', 1) + labels = _messages.MessageField('LabelsValue', 2) + perWorkerMetrics = _messages.MessageField('PerWorkerMetrics', 3) + streamingScalingReport = _messages.MessageField('StreamingScalingReport', 4) + time = _messages.StringField(5) + workerHealthReport = _messages.MessageField('WorkerHealthReport', 6) + workerLifecycleEvent = _messages.MessageField('WorkerLifecycleEvent', 7) + workerMessageCode = _messages.MessageField('WorkerMessageCode', 8) + workerMetrics = _messages.MessageField('ResourceUtilizationReport', 9) + workerShutdownNotice = _messages.MessageField('WorkerShutdownNotice', 10) + workerThreadScalingReport = _messages.MessageField( + 'WorkerThreadScalingReport', 11) class WorkerMessageCode(_messages.Message): @@ -6600,20 +7478,28 @@ class WorkerMessageResponse(_messages.Message): sender. Fields: + streamingScalingReportResponse: Service's streaming scaling response for + workers. workerHealthReportResponse: The service's response to a worker's health report. workerMetricsResponse: Service's response to reporting worker metrics (currently empty). workerShutdownNoticeResponse: Service's response to shutdown notice (currently empty). 
+ workerThreadScalingReportResponse: Service's thread scaling recommendation + for workers. """ + streamingScalingReportResponse = _messages.MessageField( + 'StreamingScalingReportResponse', 1) workerHealthReportResponse = _messages.MessageField( - 'WorkerHealthReportResponse', 1) + 'WorkerHealthReportResponse', 2) workerMetricsResponse = _messages.MessageField( - 'ResourceUtilizationReportResponse', 2) + 'ResourceUtilizationReportResponse', 3) workerShutdownNoticeResponse = _messages.MessageField( - 'WorkerShutdownNoticeResponse', 3) + 'WorkerShutdownNoticeResponse', 4) + workerThreadScalingReportResponse = _messages.MessageField( + 'WorkerThreadScalingReportResponse', 5) class WorkerPool(_messages.Message): @@ -6884,6 +7770,29 @@ class WorkerShutdownNoticeResponse(_messages.Message): r"""Service-side response to WorkerMessage issuing shutdown notice.""" +class WorkerThreadScalingReport(_messages.Message): + r"""Contains information about the thread scaling information of a worker. + + Fields: + currentThreadCount: Current number of active threads in a worker. + """ + + currentThreadCount = _messages.IntegerField( + 1, variant=_messages.Variant.INT32) + + +class WorkerThreadScalingReportResponse(_messages.Message): + r"""Contains the thread scaling recommendation for a worker from the + backend. + + Fields: + recommendedThreadCount: Recommended number of threads for a worker. + """ + + recommendedThreadCount = _messages.IntegerField( + 1, variant=_messages.Variant.INT32) + + class WriteInstruction(_messages.Message): r"""An instruction that writes records. Takes one input, produces no outputs. diff --git a/sdks/python/apache_beam/runners/direct/direct_metrics.py b/sdks/python/apache_beam/runners/direct/direct_metrics.py index e4fd44053119..f715ce3bf521 100644 --- a/sdks/python/apache_beam/runners/direct/direct_metrics.py +++ b/sdks/python/apache_beam/runners/direct/direct_metrics.py @@ -28,6 +28,7 @@ from apache_beam.metrics.cells import CounterAggregator from apache_beam.metrics.cells import DistributionAggregator from apache_beam.metrics.cells import GaugeAggregator +from apache_beam.metrics.cells import StringSetAggregator from apache_beam.metrics.execution import MetricKey from apache_beam.metrics.execution import MetricResult from apache_beam.metrics.metric import MetricResults @@ -39,6 +40,7 @@ def __init__(self): self._distributions = defaultdict( lambda: DirectMetric(DistributionAggregator())) self._gauges = defaultdict(lambda: DirectMetric(GaugeAggregator())) + self._string_sets = defaultdict(lambda: DirectMetric(StringSetAggregator())) def _apply_operation(self, bundle, updates, op): for k, v in updates.counters.items(): @@ -50,6 +52,9 @@ def _apply_operation(self, bundle, updates, op): for k, v in updates.gauges.items(): op(self._gauges[k], bundle, v) + for k, v in updates.string_sets.items(): + op(self._string_sets[k], bundle, v) + def commit_logical(self, bundle, updates): op = lambda obj, bundle, update: obj.commit_logical(bundle, update) self._apply_operation(bundle, updates, op) @@ -84,11 +89,19 @@ def query(self, filter=None): v.extract_latest_attempted()) for k, v in self._gauges.items() if self.matches(filter, k) ] + string_sets = [ + MetricResult( + MetricKey(k.step, k.metric), + v.extract_committed(), + v.extract_latest_attempted()) for k, + v in self._string_sets.items() if self.matches(filter, k) + ] return { self.COUNTERS: counters, self.DISTRIBUTIONS: distributions, - self.GAUGES: gauges + self.GAUGES: gauges, + self.STRINGSETS: string_sets } diff --git 
a/sdks/python/apache_beam/runners/direct/direct_runner_test.py b/sdks/python/apache_beam/runners/direct/direct_runner_test.py index 58cec732d3fa..d8f1ea097b88 100644 --- a/sdks/python/apache_beam/runners/direct/direct_runner_test.py +++ b/sdks/python/apache_beam/runners/direct/direct_runner_test.py @@ -76,6 +76,8 @@ def process(self, element): count.inc() distro = Metrics.distribution(self.__class__, 'element_dist') distro.update(element) + str_set = Metrics.string_set(self.__class__, 'element_str_set') + str_set.add(str(element % 4)) return [element] p = Pipeline(DirectRunner()) @@ -115,6 +117,13 @@ def process(self, element): hc.assert_that(gauge_result.committed.value, hc.equal_to(5)) hc.assert_that(gauge_result.attempted.value, hc.equal_to(5)) + str_set_result = metrics['string_sets'][0] + hc.assert_that( + str_set_result.key, + hc.equal_to(MetricKey('Do', MetricName(namespace, 'element_str_set')))) + hc.assert_that(len(str_set_result.committed), hc.equal_to(4)) + hc.assert_that(len(str_set_result.attempted), hc.equal_to(4)) + def test_create_runner(self): self.assertTrue(isinstance(create_runner('DirectRunner'), DirectRunner)) self.assertTrue( diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py index 8b313d624a52..1ed21942d28f 100644 --- a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py +++ b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py @@ -1535,15 +1535,17 @@ def __init__(self, step_monitoring_infos, user_metrics_only=True): self._counters = {} self._distributions = {} self._gauges = {} + self._string_sets = {} self._user_metrics_only = user_metrics_only self._monitoring_infos = step_monitoring_infos for smi in step_monitoring_infos.values(): - counters, distributions, gauges = \ + counters, distributions, gauges, string_sets = \ portable_metrics.from_monitoring_infos(smi, user_metrics_only) self._counters.update(counters) self._distributions.update(distributions) self._gauges.update(gauges) + self._string_sets.update(string_sets) def query(self, filter=None): counters = [ @@ -1558,11 +1560,16 @@ def query(self, filter=None): MetricResult(k, v, v) for k, v in self._gauges.items() if self.matches(filter, k) ] + string_sets = [ + MetricResult(k, v, v) for k, + v in self._string_sets.items() if self.matches(filter, k) + ] return { self.COUNTERS: counters, self.DISTRIBUTIONS: distributions, - self.GAUGES: gauges + self.GAUGES: gauges, + self.STRINGSETS: string_sets } def monitoring_infos(self) -> List[metrics_pb2.MonitoringInfo]: diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py index 97b10b83e051..4a737feaf288 100644 --- a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py +++ b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py @@ -1212,13 +1212,16 @@ def test_metrics(self, check_gauge=True): counter = beam.metrics.Metrics.counter('ns', 'counter') distribution = beam.metrics.Metrics.distribution('ns', 'distribution') gauge = beam.metrics.Metrics.gauge('ns', 'gauge') + string_set = beam.metrics.Metrics.string_set('ns', 'string_set') - pcoll = p | beam.Create(['a', 'zzz']) + elements = ['a', 'zzz'] + pcoll = p | beam.Create(elements) # pylint: disable=expression-not-assigned pcoll | 'count1' >> beam.FlatMap(lambda x: counter.inc()) pcoll | 'count2' >> beam.FlatMap(lambda 
x: counter.inc(len(x))) pcoll | 'dist' >> beam.FlatMap(lambda x: distribution.update(len(x))) pcoll | 'gauge' >> beam.FlatMap(lambda x: gauge.set(3)) + pcoll | 'string_set' >> beam.FlatMap(lambda x: string_set.add(x)) res = p.run() res.wait_until_finish() @@ -1238,6 +1241,10 @@ def test_metrics(self, check_gauge=True): .with_name('gauge'))['gauges'] self.assertEqual(gaug.committed.value, 3) + str_set, = res.metrics().query(beam.metrics.MetricsFilter() + .with_name('string_set'))['string_sets'] + self.assertEqual(str_set.committed, set(elements)) + def test_callbacks_with_exception(self): elements_list = ['1', '2'] diff --git a/sdks/python/apache_beam/runners/portability/portable_metrics.py b/sdks/python/apache_beam/runners/portability/portable_metrics.py index d7d330dd7e77..5bc3e0539181 100644 --- a/sdks/python/apache_beam/runners/portability/portable_metrics.py +++ b/sdks/python/apache_beam/runners/portability/portable_metrics.py @@ -27,18 +27,21 @@ def from_monitoring_infos(monitoring_info_list, user_metrics_only=False): - """Groups MonitoringInfo objects into counters, distributions and gauges. + """Groups MonitoringInfo objects into counters, distributions, gauges and + string sets Args: monitoring_info_list: An iterable of MonitoringInfo objects. user_metrics_only: If true, includes user metrics only. Returns: - A tuple containing three dictionaries: counters, distributions and gauges, - respectively. Each dictionary contains (MetricKey, metric result) pairs. + A tuple containing three dictionaries: counters, distributions, gauges and + string set, respectively. Each dictionary contains (MetricKey, metric + result) pairs. """ counters = {} distributions = {} gauges = {} + string_sets = {} for mi in monitoring_info_list: if (user_metrics_only and not monitoring_infos.is_user_monitoring_info(mi)): @@ -57,8 +60,10 @@ def from_monitoring_infos(monitoring_info_list, user_metrics_only=False): distributions[key] = metric_result elif monitoring_infos.is_gauge(mi): gauges[key] = metric_result + elif monitoring_infos.is_string_set(mi): + string_sets[key] = metric_result - return counters, distributions, gauges + return counters, distributions, gauges, string_sets def _create_metric_key(monitoring_info): diff --git a/sdks/python/apache_beam/runners/portability/portable_runner.py b/sdks/python/apache_beam/runners/portability/portable_runner.py index 92f123697a9d..ba48bbec6d3a 100644 --- a/sdks/python/apache_beam/runners/portability/portable_runner.py +++ b/sdks/python/apache_beam/runners/portability/portable_runner.py @@ -437,7 +437,7 @@ def _combine(committed, attempted, filter): ] def query(self, filter=None): - counters, distributions, gauges = [ + counters, distributions, gauges, stringsets = [ self._combine(x, y, filter) for x, y in zip(self.committed, self.attempted) ] @@ -445,7 +445,8 @@ def query(self, filter=None): return { self.COUNTERS: counters, self.DISTRIBUTIONS: distributions, - self.GAUGES: gauges + self.GAUGES: gauges, + self.STRINGSETS: stringsets } diff --git a/sdks/python/apache_beam/runners/portability/prism_runner_test.py b/sdks/python/apache_beam/runners/portability/prism_runner_test.py index f1ccf66a2289..324fe5a17b54 100644 --- a/sdks/python/apache_beam/runners/portability/prism_runner_test.py +++ b/sdks/python/apache_beam/runners/portability/prism_runner_test.py @@ -40,7 +40,9 @@ from apache_beam.runners.portability import portable_runner_test from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +from apache_beam.transforms 
import window from apache_beam.transforms.sql import SqlTransform +from apache_beam.utils import timestamp # Run as # @@ -178,6 +180,26 @@ def create_options(self): return options + # Slightly more robust session window test: + # Validates that an inner grouping doesn't duplicate data either. + # Copied also because the timestamp in fn_runner_test.py isn't being + # inferred correctly as seconds for some reason, but as micros. + # The belabored specification is validating the timestamp type works at least. + # See https://github.com/apache/beam/issues/32085 + def test_windowing(self): + with self.create_pipeline() as p: + res = ( + p + | beam.Create([1, 2, 100, 101, 102, 123]) + | beam.Map( + lambda t: window.TimestampedValue( + ('k', t), timestamp.Timestamp.of(t).micros)) + | beam.WindowInto(beam.transforms.window.Sessions(10)) + | beam.GroupByKey() + | beam.Map(lambda k_vs1: (k_vs1[0], sorted(k_vs1[1])))) + assert_that( + res, equal_to([('k', [1, 2]), ('k', [100, 101, 102]), ('k', [123])])) + # Can't read host files from within docker, read a "local" file there. def test_read(self): print('name:', __name__) diff --git a/sdks/python/apache_beam/runners/portability/stager.py b/sdks/python/apache_beam/runners/portability/stager.py index 48dabe18aa36..98c0e3176f75 100644 --- a/sdks/python/apache_beam/runners/portability/stager.py +++ b/sdks/python/apache_beam/runners/portability/stager.py @@ -214,7 +214,8 @@ def create_job_resources(options, # type: PipelineOptions os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache') if (setup_options.requirements_cache is None) else setup_options.requirements_cache) - if not os.path.exists(requirements_cache_path): + if (setup_options.requirements_cache != SKIP_REQUIREMENTS_CACHE and + not os.path.exists(requirements_cache_path)): os.makedirs(requirements_cache_path) # Stage a requirements file if present. 
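# --- Illustrative example (not part of the diff above) ---------------------
# A small sketch of the user-facing StringSet metric that the direct and
# portable runner changes above plumb through: record values in a DoFn, then
# read them back under the new 'string_sets' key of the metrics query result.
# The namespace and metric names are illustrative; the pipeline uses the
# default (direct) runner.
import apache_beam as beam
from apache_beam.metrics.metric import Metrics
from apache_beam.metrics.metric import MetricsFilter


class TagValues(beam.DoFn):
  def __init__(self):
    # StringSet metric, as exercised by the runner tests above.
    self.seen = Metrics.string_set('example_ns', 'seen_values')

  def process(self, element):
    self.seen.add(str(element))
    yield element


p = beam.Pipeline()
_ = p | beam.Create([1, 2, 2, 3]) | beam.ParDo(TagValues())
result = p.run()
result.wait_until_finish()

# Query the committed metric values; string sets are returned under the
# 'string_sets' key introduced by this change.
string_sets = result.metrics().query(
    MetricsFilter().with_name('seen_values'))['string_sets']
print(string_sets[0].committed)  # expected: {'1', '2', '3'}
# ----------------------------------------------------------------------------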
diff --git a/sdks/python/apache_beam/yaml/pipeline.schema.yaml b/sdks/python/apache_beam/yaml/pipeline.schema.yaml index f68a7306d941..c3937e611317 100644 --- a/sdks/python/apache_beam/yaml/pipeline.schema.yaml +++ b/sdks/python/apache_beam/yaml/pipeline.schema.yaml @@ -168,8 +168,10 @@ $defs: providerOrProviderInclude: if: - properties: - include {} + allOf: [ + { properties: { include: { type: string }}}, + { required: [ "include" ] } + ] then: $ref: '#/$defs/providerInclude' else: diff --git a/sdks/python/apache_beam/yaml/yaml_transform.py b/sdks/python/apache_beam/yaml/yaml_transform.py index c1c509ebde2c..ffef9bbcd8f0 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform.py +++ b/sdks/python/apache_beam/yaml/yaml_transform.py @@ -1064,5 +1064,5 @@ def expand_pipeline( return YamlTransform( pipeline_as_composite(pipeline_spec['pipeline']), yaml_provider.merge_providers( - pipeline_spec.get('providers', []), providers or - {})).expand(beam.pvalue.PBegin(pipeline)) + yaml_provider.parse_providers(pipeline_spec.get('providers', [])), + providers or {})).expand(beam.pvalue.PBegin(pipeline)) diff --git a/sdks/python/build.gradle b/sdks/python/build.gradle index 72c696d30bf5..832f567c66fd 100644 --- a/sdks/python/build.gradle +++ b/sdks/python/build.gradle @@ -95,7 +95,7 @@ tasks.register("generateYamlDocs") { dependsOn ":sdks:java:extensions:sql:expansion-service:shadowJar" dependsOn ":sdks:java:io:expansion-service:build" dependsOn ":sdks:java:io:google-cloud-platform:expansion-service:build" - def extraPackages = "pyyaml markdown docstring_parser pandas pygments" + def extraPackages = "pyyaml markdown docstring_parser pandas pygments Jinja2" doLast { exec { diff --git a/sdks/python/container/boot.go b/sdks/python/container/boot.go index 710041e0f041..696604c64886 100644 --- a/sdks/python/container/boot.go +++ b/sdks/python/container/boot.go @@ -41,8 +41,8 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/jsonpb" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" ) var ( @@ -217,12 +217,12 @@ func launchSDKProcess() error { os.Setenv("PIPELINE_OPTIONS", options) os.Setenv("SEMI_PERSISTENT_DIRECTORY", *semiPersistDir) - os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint})) - os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *controlEndpoint})) + os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint}).String()) + os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *controlEndpoint}).String()) os.Setenv("RUNNER_CAPABILITIES", strings.Join(info.GetRunnerCapabilities(), " ")) if info.GetStatusEndpoint() != nil { - os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(info.GetStatusEndpoint())) + os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", info.GetStatusEndpoint().String()) } if metadata := info.GetMetadata(); metadata != nil { @@ -441,7 +441,7 @@ func processArtifactsInSetupOnlyMode() { files := make([]string, len(infoJsons)) for i, info := range infoJsons { var artifactInformation pipepb.ArtifactInformation - if err := jsonpb.UnmarshalString(info, &artifactInformation); err != nil { + if err := 
protojson.Unmarshal([]byte(info), &artifactInformation); err != nil { log.Fatalf("Unable to unmarshal artifact information from json string %v", info) } diff --git a/sdks/python/container/license_scripts/dep_urls_py.yaml b/sdks/python/container/license_scripts/dep_urls_py.yaml index 6fc5129e35c2..0fe830b7ab6e 100644 --- a/sdks/python/container/license_scripts/dep_urls_py.yaml +++ b/sdks/python/container/license_scripts/dep_urls_py.yaml @@ -141,6 +141,8 @@ pip_dependencies: license: "https://raw.githubusercontent.com/jamescasbon/PyVCF/master/LICENSE" singledispatch: license: "file:///tmp/license_scripts/manual_licenses/singledispatch/LICENSE" + scikit-learn: + license: "https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/COPYING" scipy: license: "https://raw.githubusercontent.com/scipy/scipy/master/LICENSE.txt" soupsieve: diff --git a/sdks/python/container/piputil.go b/sdks/python/container/piputil.go index 113bf4054167..d6250ad2fdcd 100644 --- a/sdks/python/container/piputil.go +++ b/sdks/python/container/piputil.go @@ -32,6 +32,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx" ) +const pipLogFlushInterval time.Duration = 15 * time.Second const unrecoverableURL string = "https://beam.apache.org/documentation/sdks/python-unrecoverable-errors/index.html#pip-dependency-resolution-failures" // pipInstallRequirements installs the given requirement, if present. @@ -40,7 +41,7 @@ func pipInstallRequirements(ctx context.Context, logger *tools.Logger, files []s if err != nil { return err } - bufLogger := tools.NewBufferedLogger(logger) + bufLogger := tools.NewBufferedLoggerWithFlushInterval(ctx, logger, pipLogFlushInterval) for _, file := range files { if file == name { // We run the install process in two rounds in order to avoid as much @@ -48,7 +49,7 @@ func pipInstallRequirements(ctx context.Context, logger *tools.Logger, files []s // option will make sure that only things staged in the worker will be // used without following their dependencies. args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} - if err := execx.Execute(pythonVersion, args...); err != nil { + if err := execx.ExecuteEnvWithIO(nil, os.Stdin, bufLogger, bufLogger, pythonVersion, args...); err != nil { bufLogger.Printf(ctx, "Some packages could not be installed solely from the requirements cache. Installing packages from PyPI.") } // The second install round opens up the search for packages on PyPI and @@ -79,8 +80,6 @@ func isPackageInstalled(pkgName string) bool { return true } -const pipLogFlushInterval time.Duration = 15 * time.Second - // pipInstallPackage installs the given package, if present. func pipInstallPackage(ctx context.Context, logger *tools.Logger, files []string, dir, name string, force, optional bool, extras []string) error { pythonVersion, err := expansionx.GetPythonVersion() @@ -150,7 +149,7 @@ func pipInstallPackage(ctx context.Context, logger *tools.Logger, files []string // installExtraPackages installs all the packages declared in the extra // packages manifest file. func installExtraPackages(ctx context.Context, logger *tools.Logger, files []string, extraPackagesFile, dir string) error { - bufLogger := tools.NewBufferedLogger(logger) + bufLogger := tools.NewBufferedLoggerWithFlushInterval(ctx, logger, pipLogFlushInterval) // First check that extra packages manifest file is present. 
for _, file := range files { if file != extraPackagesFile { @@ -179,7 +178,7 @@ func installExtraPackages(ctx context.Context, logger *tools.Logger, files []str } func findBeamSdkWhl(ctx context.Context, logger *tools.Logger, files []string, acceptableWhlSpecs []string) string { - bufLogger := tools.NewBufferedLogger(logger) + bufLogger := tools.NewBufferedLoggerWithFlushInterval(ctx, logger, pipLogFlushInterval) for _, file := range files { if strings.HasPrefix(file, "apache_beam") { for _, s := range acceptableWhlSpecs { @@ -200,7 +199,7 @@ func findBeamSdkWhl(ctx context.Context, logger *tools.Logger, files []string, a // SDK from source tarball provided in sdkSrcFile. func installSdk(ctx context.Context, logger *tools.Logger, files []string, workDir string, sdkSrcFile string, acceptableWhlSpecs []string, required bool) error { sdkWhlFile := findBeamSdkWhl(ctx, logger, files, acceptableWhlSpecs) - bufLogger := tools.NewBufferedLogger(logger) + bufLogger := tools.NewBufferedLoggerWithFlushInterval(ctx, logger, pipLogFlushInterval) if sdkWhlFile != "" { // by default, pip rejects to install wheel if same version already installed isDev := strings.Contains(sdkWhlFile, ".dev") diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index 35eea227888b..31cf194fbbc4 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -21,76 +21,77 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -annotated-types==0.6.0 +annotated-types==0.7.0 async-timeout==4.0.3 -attrs==23.2.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==3.0.10 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 
google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -98,60 +99,60 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.14.0 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 +threadpoolctl==3.5.0 tomli==2.0.1 -tqdm==4.66.2 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zstandard==0.22.0 +zstandard==0.23.0 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index d6d523689fa7..44b8a1edc8d7 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -21,74 +21,75 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
-annotated-types==0.6.0 -attrs==23.2.0 +annotated-types==0.7.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==3.0.10 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -96,59 +97,59 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 
requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.14.0 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 -tqdm==4.66.2 -typing_extensions==4.11.0 +threadpoolctl==3.5.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zstandard==0.22.0 +zstandard==0.23.0 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index b7a5f3687166..76553ad6ab64 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ b/sdks/python/container/py312/base_image_requirements.txt @@ -21,72 +21,74 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -annotated-types==0.6.0 -attrs==23.2.0 +annotated-types==0.7.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.7 +cryptography==43.0.0 Cython==3.0.10 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.5.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.19.0 -google-api-python-client==2.128.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.50.0 -google-cloud-bigquery==3.22.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 google-cloud-bigquery-storage==2.25.0 -google-cloud-bigtable==2.23.1 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.46.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.63.0 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.5 +hypothesis==6.110.1 idna==3.7 iniconfig==2.0.0 +Jinja2==3.0.3 joblib==1.4.2 -jsonpickle==3.0.4 -jsonschema==4.22.0 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -94,59 +96,59 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 
objsize==0.7.0 -orjson==3.10.3 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 pluggy==1.5.0 -proto-plus==1.23.0 -protobuf==4.25.3 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.1 -pydantic_core==2.18.2 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 -pymongo==4.7.1 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.4 +PyYAML==6.0.2 +redis==5.0.8 referencing==0.35.1 -regex==2024.4.28 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.1 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -setuptools==69.5.1 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.14.0 +setuptools==72.1.0 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.30 -sqlparse==0.5.0 -tenacity==8.3.0 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 threadpoolctl==3.5.0 -tqdm==4.66.4 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 uritemplate==4.1.1 -urllib3==2.2.1 -wheel==0.43.0 +urllib3==2.2.2 +wheel==0.44.0 wrapt==1.16.0 -zstandard==0.22.0 +zstandard==0.23.0 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index c92761473362..8e6d6f8d059a 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -21,79 +21,80 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
-annotated-types==0.6.0 +annotated-types==0.7.0 async-timeout==4.0.3 -attrs==23.2.0 +attrs==24.2.0 backports.zoneinfo==0.2.1 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 +cachetools==5.4.0 certifi==2024.7.4 -cffi==1.16.0 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==3.0.10 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 -importlib_metadata==7.1.0 +importlib_metadata==8.2.0 importlib_resources==6.4.0 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -101,62 +102,62 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.0.3 parameterized==0.9.0 pkgutil_resolve_name==1.3.10 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 
-python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 scikit-learn==1.3.2 scipy==1.10.1 -shapely==2.0.4 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 +threadpoolctl==3.5.0 tomli==2.0.1 -tqdm==4.66.3 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zipp==3.18.1 -zstandard==0.22.0 +zipp==3.19.2 +zstandard==0.23.0 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index bab94181499c..8e07b087a6c3 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -21,77 +21,78 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -annotated-types==0.6.0 +annotated-types==0.7.0 async-timeout==4.0.3 -attrs==23.2.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==3.0.10 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 
-hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 -importlib_metadata==7.1.0 +importlib_metadata==8.2.0 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -99,61 +100,61 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.13.1 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 +threadpoolctl==3.5.0 tomli==2.0.1 -tqdm==4.66.2 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zipp==3.18.1 -zstandard==0.22.0 +zipp==3.19.2 +zstandard==0.23.0 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 5f631e3dfdab..7802a7133d08 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -155,7 +155,7 @@ def cythonize(*args, **kwargs): # Exclude 1.5.0 and 1.5.1 because of # https://github.com/pandas-dev/pandas/issues/45725 dataframe_dependency = [ - 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.2;python_version>="3.8"', + 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.3;python_version>="3.8"', ] @@ -440,7 +440,7 @@ def get_portability_package_data(): 'google-cloud-datastore>=2.0.0,<3', 'google-cloud-pubsub>=2.1.0,<3', 'google-cloud-pubsublite>=1.2.0,<2', - 'google-cloud-storage>=2.16.0,<3', + 'google-cloud-storage>=2.18.2,<3', # GCP packages required by tests 'google-cloud-bigquery>=2.0.0,<4', 'google-cloud-bigquery-storage>=2.6.3,<3', diff --git a/sdks/python/test-suites/dataflow/common.gradle b/sdks/python/test-suites/dataflow/common.gradle index 8f3402035eb2..e5d301ecbe14 100644 --- a/sdks/python/test-suites/dataflow/common.gradle +++ b/sdks/python/test-suites/dataflow/common.gradle @@ -523,8 +523,8 @@ project.tasks.register("inferencePostCommitIT") { // Create cross-language tasks for running tests against Java expansion service(s) -def dataflowProject = project.findProperty('dataflowProject') ?: 'apache-beam-testing' -def dataflowRegion = project.findProperty('dataflowRegion') ?: 'us-central1' +def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' +def gcpRegion = project.findProperty('gcpRegion') ?: 'us-central1' 
project(":sdks:python:test-suites:xlang").ext.xlangTasks.each { taskMetadata -> createCrossLanguageUsingJavaExpansionTask( @@ -533,8 +533,8 @@ project(":sdks:python:test-suites:xlang").ext.xlangTasks.each { taskMetadata -> collectMarker: taskMetadata.collectMarker, pythonPipelineOptions: [ "--runner=TestDataflowRunner", - "--project=${dataflowProject}", - "--region=${dataflowRegion}", + "--project=${gcpProject}", + "--region=${gcpRegion}", "--sdk_container_image=gcr.io/apache-beam-testing/beam-sdk/beam_python${project.ext.pythonVersion}_sdk:latest", "--sdk_harness_container_image_overrides=.*java.*,gcr.io/apache-beam-testing/beam-sdk/beam_java8_sdk:latest" ], diff --git a/sdks/python/test-suites/direct/common.gradle b/sdks/python/test-suites/direct/common.gradle index c79c5f66abbc..e290e8003b13 100644 --- a/sdks/python/test-suites/direct/common.gradle +++ b/sdks/python/test-suites/direct/common.gradle @@ -436,7 +436,7 @@ project.tasks.register("inferencePostCommitIT") { } // Create cross-language tasks for running tests against Java expansion service(s) -def gcpProject = project.findProperty('dataflowProject') ?: 'apache-beam-testing' +def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' project(":sdks:python:test-suites:xlang").ext.xlangTasks.each { taskMetadata -> createCrossLanguageUsingJavaExpansionTask( diff --git a/sdks/python/test-suites/direct/xlang/build.gradle b/sdks/python/test-suites/direct/xlang/build.gradle index 289f5c8a0e07..3003329aef59 100644 --- a/sdks/python/test-suites/direct/xlang/build.gradle +++ b/sdks/python/test-suites/direct/xlang/build.gradle @@ -44,7 +44,7 @@ def cleanupTask = project.tasks.register("fnApiJobServerCleanup", Exec) { args '-c', ". ${envDir}/bin/activate && python -m apache_beam.runners.portability.local_job_service_main --pid_file ${pidFile} --stop" } -def gcpProject = project.findProperty('dataflowProject') ?: 'apache-beam-testing' +def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' createCrossLanguageValidatesRunnerTask( startJobServer: setupTask, diff --git a/settings.gradle.kts b/settings.gradle.kts index f29f8a70ce0c..4d4b93908a02 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -24,7 +24,7 @@ pluginManagement { } plugins { - id("com.gradle.develocity") version "3.17.5" + id("com.gradle.develocity") version "3.17.6" id("com.gradle.common-custom-user-data-gradle-plugin") version "2.0.1" } diff --git a/website/www/site/config.toml b/website/www/site/config.toml index 7fe6df7a2c7a..6675cf418bdd 100644 --- a/website/www/site/config.toml +++ b/website/www/site/config.toml @@ -104,7 +104,7 @@ github_project_repo = "https://github.com/apache/beam" [params] description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." -release_latest = "2.57.0" +release_latest = "2.58.0" # The repository and branch where the files live in Github or Colab. This is used # to serve and stage from your local branch, but publish to the master branch. # e.g. 
https://github.com/{{< param branch_repo >}}/path/to/notebook.ipynb diff --git a/website/www/site/content/en/blog/beam-2.49.0.md b/website/www/site/content/en/blog/beam-2.49.0.md index a2e7af0e18f8..4dbc08693f5b 100644 --- a/website/www/site/content/en/blog/beam-2.49.0.md +++ b/website/www/site/content/en/blog/beam-2.49.0.md @@ -52,6 +52,7 @@ For more information on changes in 2.49.0, check out the [detailed release notes * Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). * Python SDK's cross-language Bigtable sink mishandles records that don't have an explicit timestamp set: [#28632](https://github.com/apache/beam/issues/28632). To avoid this issue, set explicit timestamps for all records before writing to Bigtable. +* Python pipelines using the `--impersonate_service_account` option with BigQuery IOs might fail on Dataflow ([#32030](https://github.com/apache/beam/issues/32030)). This is fixed in 2.59.0 release. ## List of Contributors diff --git a/website/www/site/content/en/blog/beam-2.58.0.md b/website/www/site/content/en/blog/beam-2.58.0.md new file mode 100644 index 000000000000..603403cd7fdb --- /dev/null +++ b/website/www/site/content/en/blog/beam-2.58.0.md @@ -0,0 +1,130 @@ +--- +title: "Apache Beam 2.58.0" +date: 2024-08-06 13:00:00 -0800 +categories: + - blog + - release +authors: + - jrmccluskey +--- + + +We are happy to present the new 2.58.0 release of Beam. +This release includes both improvements and new functionality. +See the [download page](/get-started/downloads/#2580-2024-08-06) for this release. + + + +For more information about changes in 2.58.0, check out the [detailed release notes](https://github.com/apache/beam/milestone/22). + +## I/Os + +* Support for [Solace](https://solace.com/) source (`SolaceIO.Read`) added (Java) ([#31440](https://github.com/apache/beam/issues/31440)). + +## New Features / Improvements + +* Multiple RunInference instances can now share the same model instance by setting the model_identifier parameter (Python) ([#31665](https://github.com/apache/beam/issues/31665)). +* Added options to control the number of Storage API multiplexing connections ([#31721](https://github.com/apache/beam/pull/31721)) +* [BigQueryIO] Better handling for batch Storage Write API when it hits AppendRows throughput quota ([#31837](https://github.com/apache/beam/pull/31837)) +* [IcebergIO] All specified catalog properties are passed through to the connector ([#31726](https://github.com/apache/beam/pull/31726)) +* Removed a third-party LGPL dependency from the Go SDK ([#31765](https://github.com/apache/beam/issues/31765)). +* Support for `MapState` and `SetState` when using Dataflow Runner v1 with Streaming Engine (Java) ([[#18200](https://github.com/apache/beam/issues/18200)]) + +## Breaking Changes + +* [IcebergIO] `IcebergCatalogConfig` was changed to support specifying catalog properties in a key-store fashion ([#31726](https://github.com/apache/beam/pull/31726)) +* [SpannerIO] Added validation that query and table cannot be specified at the same time for `SpannerIO.read()`. Previously `withQuery` overrides `withTable`, if set ([#24956](https://github.com/apache/beam/issues/24956)). 
+ +## Bug fixes + +* [BigQueryIO] Fixed a bug in batch Storage Write API that frequently exhausted concurrent connections quota ([#31710](https://github.com/apache/beam/pull/31710)) + +## List of Contributors + +According to git shortlog, the following people contributed to the 2.58.0 release. Thank you to all contributors! + +Ahmed Abualsaud + +Ahmet Altay + +Alexandre Moueddene + +Alexey Romanenko + +Andrew Crites + +Bartosz Zablocki + +Celeste Zeng + +Chamikara Jayalath + +Clay Johnson + +Damon Douglass + +Danny McCormick + +Dilnaz Amanzholova + +Florian Bernard + +Francis O'Hara + +George Ma + +Israel Herraiz + +Jack McCluskey + +Jaehyeon Kim + +James Roseman + +Kenneth Knowles + +Maciej Szwaja + +Michel Davit + +Minh Son Nguyen + +Naireen + +Niel Markwick + +Oliver Cardoza + +Robert Bradshaw + +Robert Burke + +Rohit Sinha + +S. Veyrié + +Sam Whittle + +Shunping Huang + +Svetak Sundhar + +TongruiLi + +Tony Tang + +Valentyn Tymofieiev + +Vitaly Terentyev + +Yi Hu \ No newline at end of file diff --git a/website/www/site/content/en/documentation/dsls/sql/extensions/user-defined-functions.md b/website/www/site/content/en/documentation/dsls/sql/extensions/user-defined-functions.md index 9bd05b536bde..9515145dc07d 100644 --- a/website/www/site/content/en/documentation/dsls/sql/extensions/user-defined-functions.md +++ b/website/www/site/content/en/documentation/dsls/sql/extensions/user-defined-functions.md @@ -111,7 +111,7 @@ public static class SquareSum extends CombineFn { String sql = "SELECT f_int1, squaresum(f_int2) " + "FROM PCOLLECTION " - + "GROUP BY f_int2"; + + "GROUP BY f_int1"; // Create and apply the PTransform representing the query. // Register the UDAFs used in the query by calling '.registerUdaf()' by diff --git a/website/www/site/content/en/documentation/io/connectors.md b/website/www/site/content/en/documentation/io/connectors.md index d390a9248cd7..313f72ce622a 100644 --- a/website/www/site/content/en/documentation/io/connectors.md +++ b/website/www/site/content/en/documentation/io/connectors.md @@ -1196,5 +1196,21 @@ This table provides a consolidated, at-a-glance overview of the available built- ✔ ✘ + + + Beam PyIO (Collection of Python IO connectors) + + ✔ + ✔ + Not available + + ✔ + native + + Not available + Not available + ✔ + ✔ + diff --git a/website/www/site/content/en/documentation/ml/large-language-modeling.md b/website/www/site/content/en/documentation/ml/large-language-modeling.md index 79ef58e6de31..90bbd43383c0 100644 --- a/website/www/site/content/en/documentation/ml/large-language-modeling.md +++ b/website/www/site/content/en/documentation/ml/large-language-modeling.md @@ -27,7 +27,7 @@ RunInference has several mechanisms for reducing memory utilization. For example Many Beam runners, however, run multiple Beam processes per machine at once. This can cause problems since the memory footprint of loading large models like LLMs multiple times can be too large to fit into a single machine. For memory-intensive models, RunInference provides a mechanism for more intelligently sharing memory across multiple processes to reduce the overall memory footprint. To enable this mode, users just have -to set the parameter `large_model` to True in their model configuration (see below for an example), and Beam will take care of the memory management. +to set the parameter `large_model` to True in their model configuration (see below for an example), and Beam will take care of the memory management. 
When using a custom model handler, you can override the `share_model_across_processes` function or the `model_copies` function for a similar effect. ### Running an Example Pipeline with T5 @@ -122,3 +122,51 @@ A `ModelHandler` requires parameters like: * `device` – The device on which you wish to run the model. If device = GPU then a GPU device will be used if it is available. Otherwise, it will be CPU. * `inference_fn` - The inference function to use during RunInference. * `large_model` - (see `Memory Management` above). Whether to use memory minimization techniques to lower the memory footprint of your model. + +### Troubleshooting Large Models + +#### Pickling errors + +When sharing a model across processes with `large_model=True` or using a custom model handler, Beam sends the input and output data across a process boundary. +To do this, it uses a serialization method known as [pickling](https://docs.python.org/3/library/pickle.html). +For example, if you call `output=model.my_inference_fn(input_1, input_2)`, `input_1`, `input_2`, and `output` will all need to be pickled. +The model itself does not need to be pickled since it is not passed across process boundaries. + +While most objects can be pickled without issue, if one of these objects is unpickleable you may run into errors like `error: can't pickle fasttext_pybind.fasttext objects`. +To work around this, there are a few options: + +First of all, if possible you can choose not to share your model across processes. This will incur additional memory pressure, but it may be tolerable in some cases. + +Second, using a custom model handler you can wrap your model to take in and return serializable types. For example, if your model handler looks like: + +``` +class MyModelHandler(): + def load_model(self): + return model_loading_logic() + + def run_inference(self, batch: Sequence[str], model, inference_args): + unpickleable_object = Unpickleable(batch) + unpickleable_returned = model.predict(unpickleable_object) + my_output = int(unpickleable_returned[0]) + return my_output +``` + +you could instead wrap the unpickleable pieces in a model wrapper. Since the model wrapper will sit in the inference process, this will work as long as it only takes in/returns pickleable objects. + +``` +class MyWrapper(): + def __init__(self, model): + self._model = model + + def predict(self, batch: Sequence[str]): + unpickleable_object = Unpickleable(batch) + unpickleable_returned = self._model.predict(unpickleable_object) + return int(unpickleable_returned[0]) + +class MyModelHandler(): + def load_model(self): + return MyWrapper(model_loading_logic()) + + def run_inference(self, batch: Sequence[str], model: MyWrapper, inference_args): + return model.predict(batch) +``` diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md index 8f3b92ef9f2a..b7db1ddd65b6 100644 --- a/website/www/site/content/en/get-started/downloads.md +++ b/website/www/site/content/en/get-started/downloads.md @@ -96,10 +96,17 @@ versions denoted `0.x.y`. ## Releases +### 2.58.0 (2024-08-06) +Official [source code download](https://downloads.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip). +[SHA-512](https://downloads.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip.sha512). +[signature](https://downloads.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip.asc).
+ +[Release notes](https://github.com/apache/beam/releases/tag/v2.58.0) + ### 2.57.0 (2024-06-26) -Official [source code download](https://downloads.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip). -[SHA-512](https://downloads.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.sha512). -[signature](https://downloads.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.asc). +Official [source code download](https://archive.apache.org/dist/beam/2.57.0/apache-beam-2.57.0-source-release.zip). +[SHA-512](https://archive.apache.org/dist/beam/2.57.0/apache-beam-2.57.0-source-release.zip.sha512). +[signature](https://archive.apache.org/dist/beam/2.57.0/apache-beam-2.57.0-source-release.zip.asc). [Release notes](https://github.com/apache/beam/releases/tag/v2.57.0) diff --git a/website/www/site/content/en/get-started/quickstart-java.md b/website/www/site/content/en/get-started/quickstart-java.md index 26d758f994db..b3ff77174e0b 100644 --- a/website/www/site/content/en/get-started/quickstart-java.md +++ b/website/www/site/content/en/get-started/quickstart-java.md @@ -139,7 +139,7 @@ if (project.hasProperty("dataflow-runner")) { {{< /highlight >}} 4. At the end of the build script, add the following task: {{< highlight >}} -task("execute", JavaExec::class) { +task execute (type: JavaExec) { classpath = sourceSets["main"].runtimeClasspath mainClass.set(System.getProperty("mainClass")) }
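Relatedly, the `share_model_across_processes` and `model_copies` hooks named in the large-language-modeling.md change above can be overridden on a custom model handler. The sketch below illustrates what that might look like; it is not taken from this diff, `load_my_model` and `_FakeModel` are placeholders rather than Beam APIs, and the overridden methods are assumed to be the `ModelHandler` hooks the documentation refers to:

```
# Hedged sketch: a custom ModelHandler that opts into cross-process model
# sharing by overriding share_model_across_processes() and model_copies().
from typing import Any, Dict, Iterable, Optional, Sequence

from apache_beam.ml.inference.base import ModelHandler, PredictionResult


class _FakeModel:
  """Placeholder model; stands in for real model-loading logic."""
  def predict(self, batch):
    # Stand-in "inference": one number per input element.
    return [len(x) for x in batch]


def load_my_model():
  # Placeholder, not a Beam API.
  return _FakeModel()


class MySharedModelHandler(ModelHandler[str, PredictionResult, Any]):
  def load_model(self) -> Any:
    return load_my_model()

  def run_inference(
      self,
      batch: Sequence[str],
      model: Any,
      inference_args: Optional[Dict[str, Any]] = None
  ) -> Iterable[PredictionResult]:
    predictions = model.predict(batch)
    return [PredictionResult(x, y) for x, y in zip(batch, predictions)]

  def share_model_across_processes(self) -> bool:
    # Comparable in effect to large_model=True on the built-in handlers:
    # a single model service is shared by all SDK worker processes.
    return True

  def model_copies(self) -> int:
    # How many copies of the model the shared service should hold.
    return 2
```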