From 0ff58147a5fee327e782c840b160737091b5f66a Mon Sep 17 00:00:00 2001 From: anumicrosoftlab <163318412+anumicrosoftlab@users.noreply.github.com> Date: Wed, 26 Jun 2024 12:07:12 -0400 Subject: [PATCH] Create SparklensJAR_for_Spark3.x.md --- SparklensJAR_for_Spark3.x.md | 116 +++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 SparklensJAR_for_Spark3.x.md diff --git a/SparklensJAR_for_Spark3.x.md b/SparklensJAR_for_Spark3.x.md new file mode 100644 index 0000000..a5cd88b --- /dev/null +++ b/SparklensJAR_for_Spark3.x.md @@ -0,0 +1,116 @@ +#### Problem: +The latest JARs in the Maven Central repo support Spark 2.X and don't work with Spark 3.X. Here are the modifications you need to make to run Sparklens on Spark 3.X. + +#### Steps to run Sparklens on Spark 3.X: +1. Install sbt: 0.13.18 +2. Clone from the repo: qubole/sparklens: Qubole Sparklens tool for performance tuning Apache Spark (github.com) +3. Change directory: cd sparklens +4. Change plugins.sbt: +Comment out the sbt-spark-package plugin line (addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4")) + ``` + addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") + + resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" + + // addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4") + ``` +6. In the build.sbt file, comment out spName, sparkVersion and spAppendScalaVersion, as they were set with the ':=' operator, which is used for setting keys in the earlier sbt version. Declare these three as plain variables (val) instead. Also comment out the line that uses 'sparkVersion.version' to get the version of Spark: 'sparkVersion' is now a String and does not have a 'version' property, so it has been replaced with 'sparkVersion'. 
+ ``` + name := "sparklens" + organization := "com.qubole" + + scalaVersion := "2.12.0" + + crossScalaVersions := Seq("2.10.6", "2.12.0") + + // spName := "qubole/sparklens" + + // sparkVersion := "2.0.0" + + // spAppendScalaVersion := true + + val spName = "qubole/sparklens" + + val sparkVersion = "3.0.0" + + val spAppendScalaVersion = true + + + // libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion.version % "provided" + + libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % "provided" + + libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0" + + libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.6.5" % "provided" + + libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.5.6" % "provided" + + libraryDependencies += "org.apache.httpcomponents" % "httpmime" % "4.5.6" % "provided" + + test in assembly := {} + + testOptions in Test += Tests.Argument("-oF") + + scalacOptions ++= Seq("-target:jvm-1.7") + + javacOptions ++= Seq("-source", "1.7", "-target", "1.7") + + publishMavenStyle := true + + + licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")) + + credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials") + + + pomExtra := + https://github.com/qubole/sparklens + + git@github.com:qubole/sparklens.git + scm:git:git@github.com:qubole/sparklens.git + + + + iamrohit + Rohit Karlupia + https://github.com/iamrohit + + + beriaanirudh + Anirudh Beria + https://github.com/beriaanirudh + + + mayurdb + Mayur Bhosale + https://github.com/mayurdb + + + + ``` +7. Change the Scala version to 2.12.0 and the Spark version to 3.0.0 to make Sparklens work on Spark 3.X. Add the spark-sql 3.0.0 library dependency (libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0"). +8. In QuboleJobListener.scala (src\main\scala\com\qubole\sparklens\QuboleJobListener.scala), change attemptId to attemptNumber(). +9. 
In HDFSConfigHelper.scala (src\main\scala\com\qubole\sparklens\helper\HDFSConfigHelper.scala), the SparkHadoopUtil class has been changed to a private class in Spark 3. Modify the file as shown below: + + ``` + import org.apache.hadoop.conf.Configuration + import org.apache.spark.SparkConf + import org.apache.spark.deploy.SparkHadoopUtil + import org.apache.spark.sql.SparkSession + + object HDFSConfigHelper { + def getHadoopConf(sparkConfOptional: Option[SparkConf]): Configuration = { + if (sparkConfOptional.isDefined) { + val spark = SparkSession.builder.config(sparkConfOptional.get).getOrCreate() + spark.sparkContext.hadoopConfiguration + } else { + val spark = SparkSession.builder.getOrCreate() + spark.sparkContext.hadoopConfiguration + } + } + } + ``` + +10. Run "sbt compile" to compile the revised code. +11. Run "sbt package" to package the compiled code into the Sparklens JAR (target\scala-2.12\sparklens_2.12-0.3.2.jar).