[SPARK-38094][SQL][FOLLOWUP] Fix exception message and add a test case
### What changes were proposed in this pull request?
Minor follow-ups on #35385:
1. Add a nested schema test.
2. Fix an error message.

### Why are the changes needed?
Better observability.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests, plus the new nested schema test added here.

Closes #35700 from jackierwzhang/SPARK-38094-minor.

Authored-by: jackierwzhang <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
jackierwzhang authored and dongjoon-hyun committed Mar 2, 2022
1 parent 80f25ad commit 5664403
Showing 2 changed files with 31 additions and 2 deletions.
`ParquetReadSupport.scala`:

```diff
@@ -140,7 +140,7 @@ object ParquetReadSupport extends Logging {
         "Spark read schema expects field Ids, " +
           "but Parquet file schema doesn't contain any field Ids.\n" +
           "Please remove the field ids from Spark schema or ignore missing ids by " +
-          "setting `spark.sql.parquet.fieldId.ignoreMissing = true`\n" +
+          s"setting `${SQLConf.IGNORE_MISSING_PARQUET_FIELD_ID.key} = true`\n" +
           s"""
              |Spark read schema:
              |${catalystRequestedSchema.prettyJson}
```
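The corrected message points users at the ignore-missing-ids escape hatch. A minimal usage sketch of toggling that configuration (the local `SparkSession` setup and the path are illustrative assumptions, not part of this change):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("field-id-demo") // hypothetical app name
  .master("local[*]")
  .getOrCreate()

// The escape hatch named in the fixed error message: ignore field ids in the
// Spark read schema when the Parquet file itself carries none.
spark.conf.set("spark.sql.parquet.fieldId.ignoreMissing", "true")

// Hypothetical path to Parquet data written without field ids.
val df = spark.read.parquet("/tmp/parquet-without-field-ids")
```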
`ParquetFieldIdIOSuite.scala`:

```diff
@@ -23,7 +23,7 @@ import org.apache.spark.SparkException
 import org.apache.spark.sql.{QueryTest, Row}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
-import org.apache.spark.sql.types.{IntegerType, Metadata, MetadataBuilder, StringType, StructType}
+import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType, Metadata, MetadataBuilder, StringType, StructType}

 class ParquetFieldIdIOSuite extends QueryTest with ParquetTest with SharedSparkSession {

@@ -107,6 +107,35 @@ class ParquetFieldIdIOSuite extends QueryTest with ParquetTest with SharedSparkSession {
     }
   }

+  test("SPARK-38094: absence of field ids: reading nested schema") {
+    withTempDir { dir =>
+      // now with nested schema/complex type
+      val readSchema =
+        new StructType()
+          .add("a", IntegerType, true, withId(1))
+          .add("b", ArrayType(StringType), true, withId(2))
+          .add("c", new StructType().add("c1", IntegerType, true, withId(6)), true, withId(3))
+          .add("d", MapType(StringType, StringType), true, withId(4))
+          .add("e", IntegerType, true, withId(5))
+
+      val writeSchema =
+        new StructType()
+          .add("a", IntegerType, true, withId(5))
+          .add("randomName", StringType, true)
+
+      val writeData = Seq(Row(100, "text"), Row(200, "more"))
+
+      spark.createDataFrame(writeData.asJava, writeSchema)
+        .write.mode("overwrite").parquet(dir.getCanonicalPath)
+
+      withAllParquetReaders {
+        checkAnswer(spark.read.schema(readSchema).parquet(dir.getCanonicalPath),
+          // a, b, c, d all couldn't be found
+          Row(null, null, null, null, 100) :: Row(null, null, null, null, 200) :: Nil)
+      }
+    }
+  }
+
   test("multiple id matches") {
     withTempDir { dir =>
       val readSchema =
```
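The new test leans on the suite's `withId` helper, which is not part of this diff. A plausible reconstruction, assuming the field id is carried in column metadata under the `parquet.field.id` key:

```scala
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

// Hypothetical sketch of the suite's `withId` helper: attach a Parquet field id
// to a column through its metadata ("parquet.field.id" is an assumed key).
def withId(id: Int): Metadata =
  new MetadataBuilder().putLong("parquet.field.id", id).build()
```

In the test above, this id-based matching is why read column `e` (id 5) resolves to write column `a` (also id 5) despite the differing names, while `a` through `d` come back as nulls.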
