From cd6bb9569e5f8a0a9c6b55473c13a0b453ee6c8f Mon Sep 17 00:00:00 2001
From: Moritz Mack
Date: Wed, 29 Jun 2022 14:51:50 +0200
Subject: [PATCH] Deprecate runner support for Spark 2.4 (closes #22094)

---
 CHANGES.md                                      |  1 +
 .../spark/translation/SparkContextFactory.java  |  8 +++++++-
 .../content/en/documentation/runners/spark.md   | 15 ++++++++-------
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 9a5873ae940b..53af5dd28b0b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -70,6 +70,7 @@
 
 ## Deprecations
 
+* Support for Spark 2.4.x is deprecated and will be dropped with the release of Beam 2.44.0 or soon after (Spark runner) ([#22094](https://github.com/apache/beam/issues/22094)).
 * X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)).
 
 ## Bugfixes

diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkContextFactory.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkContextFactory.java
index 9f9465ccde8f..4b714b655818 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkContextFactory.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkContextFactory.java
@@ -143,6 +143,12 @@ private static JavaSparkContext createSparkContext(SparkPipelineOptions options)
     conf.setAppName(options.getAppName());
     // register immutable collections serializers because the SDK uses them.
     conf.set("spark.kryo.registrator", SparkRunnerKryoRegistrator.class.getName());
-    return new JavaSparkContext(conf);
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    if (jsc.sc().version().startsWith("2")) {
+      LOG.warn(
+          "Support for Spark 2 is deprecated; this runner will be removed in a few releases.\n"
+              + "Spark 2 is reaching its EOL; consider migrating to Spark 3.");
+    }
+    return jsc;
   }
 }
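
[Review note, illustration only — not part of the patch] The guard above keys off the
version string of the live SparkContext rather than a compile-time constant, so the
warning fires for whichever Spark 2.x distribution the runner is actually launched
against. A minimal standalone sketch of the same prefix check; the class and method
names below are hypothetical:

// Illustration only: mirrors the version guard added to SparkContextFactory above.
public class SparkVersionGuardDemo {
  // Same prefix test the patch applies to jsc.sc().version().
  static boolean isDeprecatedSpark(String version) {
    return version.startsWith("2");
  }

  public static void main(String[] args) {
    for (String v : new String[] {"2.4.8", "3.1.2"}) {
      // Prints: 2.4.8 -> deprecated=true, then 3.1.2 -> deprecated=false
      System.out.println(v + " -> deprecated=" + isDeprecatedSpark(v));
    }
  }
}

The prefix test is cheap but coarse: any version string starting with "2" matches,
which is fine while Spark major versions are single digits.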
diff --git a/website/www/site/content/en/documentation/runners/spark.md b/website/www/site/content/en/documentation/runners/spark.md
index 91b72d542a75..abc1031840ba 100644
--- a/website/www/site/content/en/documentation/runners/spark.md
+++ b/website/www/site/content/en/documentation/runners/spark.md
@@ -67,7 +67,8 @@ the portable Runner. For more information on portability, please visit the
 
 ## Spark Runner prerequisites and setup
 
-The Spark runner currently supports Spark's 2.x branch, and more specifically any version greater than 2.4.0.
+The Spark runner currently supports Spark's 3.1.x branch.
+> **Note:** Support for Spark 2.4.x is deprecated and will be dropped with the release of Beam 2.44.0 (or soon after).
 
 {{< paragraph class="language-java" >}}
 You can add a dependency on the latest version of the Spark runner by adding to your pom.xml the following:
@@ -76,7 +77,7 @@ You can add a dependency on the latest version of the Spark runner by adding to
 {{< highlight java >}}
 <dependency>
   <groupId>org.apache.beam</groupId>
-  <artifactId>beam-runners-spark</artifactId>
+  <artifactId>beam-runners-spark-3</artifactId>
   <version>{{< param release_latest >}}</version>
 </dependency>
 {{< /highlight >}}
@@ -90,13 +91,13 @@ In some cases, such as running in local mode/Standalone, your (self-contained) a
 {{< highlight java >}}
 <dependency>
   <groupId>org.apache.spark</groupId>
-  <artifactId>spark-core_2.11</artifactId>
+  <artifactId>spark-core_2.12</artifactId>
   <version>${spark.version}</version>
 </dependency>
 
 <dependency>
   <groupId>org.apache.spark</groupId>
-  <artifactId>spark-streaming_2.11</artifactId>
+  <artifactId>spark-streaming_2.12</artifactId>
   <version>${spark.version}</version>
 </dependency>
 {{< /highlight >}}
@@ -193,7 +194,7 @@ download it on the [Downloads page](/get-started/downloads/).
 {{< paragraph class="language-py" >}}
 1. Start the JobService endpoint:
     * with Docker (preferred): `docker run --net=host apache/beam_spark_job_server:latest`
-    * or from Beam source code: `./gradlew :runners:spark:2:job-server:runShadow`
+    * or from Beam source code: `./gradlew :runners:spark:3:job-server:runShadow`
 {{< /paragraph >}}
 
 {{< paragraph class="language-py" >}}
@@ -228,7 +229,7 @@ For more details on the different deployment modes see: [Standalone](https://spa
 {{< paragraph class="language-py" >}}
 2. Start the JobService that will connect to the Spark master:
     * with Docker (preferred): `docker run --net=host apache/beam_spark_job_server:latest --spark-master-url=spark://localhost:7077`
-    * or from Beam source code: `./gradlew :runners:spark:2:job-server:runShadow -PsparkMasterUrl=spark://localhost:7077`
+    * or from Beam source code: `./gradlew :runners:spark:3:job-server:runShadow -PsparkMasterUrl=spark://localhost:7077`
 {{< /paragraph >}}
 
 {{< paragraph class="language-py" >}}3. Submit the pipeline as above.
@@ -246,7 +247,7 @@ To run Beam jobs written in Python, Go, and other supported languages, you can u
 
 The following example runs a portable Beam job in Python from the Dataproc cluster's master node, backed by YARN.
 
-> Note: This example executes successfully with Dataproc 2.0, Spark 2.4.8 and 3.1.2 and Beam 2.37.0.
+> Note: This example executes successfully with Dataproc 2.0, Spark 3.1.2 and Beam 2.37.0.
 
 1. Create a Dataproc cluster with [Docker](https://cloud.google.com/dataproc/docs/concepts/components/docker) component enabled.
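
[Review note, illustration only — not part of the patch] For the standalone-master
steps above, the Java equivalent is to point SparkPipelineOptions at the same
spark://localhost:7077 URL. A minimal sketch, assuming the Beam Java SDK and the
beam-runners-spark-3 artifact are on the classpath; the class name is hypothetical:

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptors;

// Illustration only: submits a trivial Beam pipeline to a standalone Spark master.
public class SparkMasterExample {
  public static void main(String[] args) {
    // Point the Spark runner at the standalone master started in the steps above.
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    options.setRunner(SparkRunner.class);
    options.setSparkMaster("spark://localhost:7077");

    // Trivial pipeline: square three integers on the cluster.
    Pipeline p = Pipeline.create(options);
    p.apply(Create.of(1, 2, 3))
        .apply(MapElements.into(TypeDescriptors.integers()).via((Integer x) -> x * x));
    p.run().waitUntilFinish();
  }
}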