From 11fcd2aceda261392482dab72dd666a8e6bcc732 Mon Sep 17 00:00:00 2001 From: skestle Date: Thu, 13 May 2021 16:54:20 +1200 Subject: [PATCH] Updated to Scala 2.13 and Spark 3.2.0-SNAPSHOT Issues with: - StructType string representation changes - Fragile schema tests because Scala's .groupBy function is not deterministic in ordering, and deep schemas don't resolve --- build.sbt | 10 ++++++---- .../github/mrpowers/spark/daria/sql/functions.scala | 5 +++-- .../mrpowers/spark/daria/sql/DataFrameExtTest.scala | 12 ++++++++++-- .../spark/daria/sql/DataFrameSchemaCheckerTest.scala | 2 +- .../spark/daria/sql/SparkSessionTestWrapper.scala | 4 ++++ 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/build.sbt b/build.sbt index fba934fe..336458b6 100644 --- a/build.sbt +++ b/build.sbt @@ -4,14 +4,16 @@ organization := "com.github.mrpowers" name := "spark-daria" version := "1.0.0" -crossScalaVersions := Seq("2.12.12") -scalaVersion := "2.12.12" -val sparkVersion = "3.0.1" +crossScalaVersions := Seq("2.13.5") +scalaVersion := crossScalaVersions.value.head +val sparkVersion = "3.2.0-SNAPSHOT" + +resolvers += "Apache Snapshots" at "https://repository.apache.org/snapshots" libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % "provided" libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided" libraryDependencies += "com.github.mrpowers" %% "spark-fast-tests" % "1.0.0" % "test" -libraryDependencies += "com.lihaoyi" %% "utest" % "0.6.3" % "test" +libraryDependencies += "com.lihaoyi" %% "utest" % "0.7.9" % "test" libraryDependencies += "com.lihaoyi" %% "os-lib" % "0.7.1" % "test" testFrameworks += new TestFramework("com.github.mrpowers.spark.daria.CustomFramework") diff --git a/src/main/scala/com/github/mrpowers/spark/daria/sql/functions.scala b/src/main/scala/com/github/mrpowers/spark/daria/sql/functions.scala index e357146b..54d9b483 100644 --- a/src/main/scala/com/github/mrpowers/spark/daria/sql/functions.scala +++ 
b/src/main/scala/com/github/mrpowers/spark/daria/sql/functions.scala @@ -121,9 +121,9 @@ object functions { val d = Option(delimiters).getOrElse(return None) val c = Option(colName).getOrElse(return None) // initialize the previousLetter to be the null character - the closest representation of the empty character: https://stackoverflow.com/questions/8306060/how-do-i-represent-an-empty-char-in-scala - var previousLetter: Char = '\0' + var previousLetter: Char = '\u0000' Some(c.map { letter: Char => - if (d.contains(previousLetter) || previousLetter.equals('\0')) { + if (d.contains(previousLetter) || previousLetter.equals('\u0000')) { previousLetter = letter letter.toUpper } else { @@ -423,6 +423,7 @@ object functions { arr.view .map(f(_)) .filter(_ != null) + .toSeq ) } } diff --git a/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameExtTest.scala b/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameExtTest.scala index 7ad238ce..304ebd38 100644 --- a/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameExtTest.scala +++ b/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameExtTest.scala @@ -1014,18 +1014,26 @@ object DataFrameExtTest extends TestSuite with DataFrameComparer with SparkSessi StructType(schema) ) + println("DF Schema:") df.printSchema() val delimiter = "_" + println("FlattenedDF Schema:") df.flattenSchema(delimiter).printSchema() val expectedDF = df .flattenSchema(delimiter) .structureSchema(delimiter) .setNullableForAllColumns(true) //for some reason spark changes nullability of struct columns + println("ExpectedDF Schema:") expectedDF.printSchema() + // Work around test fragility that breaks when scala 2.13 applies groupBy in a different order DariaValidator.validateSchema( - expectedDF, - schema + expectedDF.select("z"), + StructType(Seq(schema.apply("z"))) + ) + DariaValidator.validateSchema( + expectedDF.select("foo.baz", "foo.bar"), + schema.apply("foo").dataType.asInstanceOf[StructType] ) } } diff --git 
a/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala b/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala index b7c6823c..caca3cfe 100644 --- a/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala +++ b/src/test/scala/com/github/mrpowers/spark/daria/sql/DataFrameSchemaCheckerTest.scala @@ -220,7 +220,7 @@ object DataFrameSchemaCheckerTest extends TestSuite with SparkSessionTestWrapper ) val expected = - "The [StructField(name,StringType,true)] StructFields are not included in the DataFrame with the following StructFields [StructType(StructField(num1,IntegerType,true), StructField(num2,IntegerType,true))]" + "The [StructField(name,StringType,true)] StructFields are not included in the DataFrame with the following StructFields [Seq(StructField(num1,IntegerType,true), StructField(num2,IntegerType,true))]" assert(c.missingStructFieldsMessage() == expected) diff --git a/src/test/scala/com/github/mrpowers/spark/daria/sql/SparkSessionTestWrapper.scala b/src/test/scala/com/github/mrpowers/spark/daria/sql/SparkSessionTestWrapper.scala index 7e1aff81..3ba23ad9 100644 --- a/src/test/scala/com/github/mrpowers/spark/daria/sql/SparkSessionTestWrapper.scala +++ b/src/test/scala/com/github/mrpowers/spark/daria/sql/SparkSessionTestWrapper.scala @@ -13,6 +13,10 @@ trait SparkSessionTestWrapper { "spark.sql.shuffle.partitions", "1" ) + // Time zone tests + .config("spark.sql.session.timeZone", "GMT") + .config("spark.executor.extraJavaOptions", "-Duser.timezone=GMT") + .config("spark.driver.extraJavaOptions", "-Duser.timezone=GMT") .getOrCreate() }