[SPARK-48506][CORE] Compression codec short names are case insensitiv…

…e except for event logging ### What changes were proposed in this pull request? Compression codec short names used for, e.g., map statuses, broadcasts, shuffle, and parquet/orc/avro outputs are case insensitive, except for event logging. Calling `org.apache.spark.io.CompressionCodec.getShortName` causes this issue. In this PR, we make `CompressionCodec.getShortName` handle case sensitivity correctly. ### Why are the changes needed? Feature parity. ### Does this PR introduce _any_ user-facing change? Yes, spark.eventLog.compression.codec now accepts not only the lowercased forms of lz4, lzf, snappy, and zstd, but also forms in which any of the characters are uppercased. ### How was this patch tested? New tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#46847 from yaooqinn/SPARK-48506. Authored-by: Kent Yao <[email protected]> Signed-off-by: yangjie01 <[email protected]>
riyaverm-db · Jun 4, 2024 · f4afa22 · f4afa22
1 parent d273fdf
commit f4afa22
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 2 deletions.
diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -101,8 +101,9 @@ private[spark] object CompressionCodec {
    * If it is already a short name, just return it.
    */
   def getShortName(codecName: String): String = {
-    if (shortCompressionCodecNames.contains(codecName)) {
-      codecName
+    val lowercasedCodec = codecName.toLowerCase(Locale.ROOT)
+    if (shortCompressionCodecNames.contains(lowercasedCodec)) {
+      lowercasedCodec
     } else {
       shortCompressionCodecNames
         .collectFirst { case (k, v) if v == codecName => k }

diff --git a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.io
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.util.Locale
 
 import com.google.common.io.ByteStreams
 
@@ -160,4 +161,18 @@ class CompressionCodecSuite extends SparkFunSuite {
     ByteStreams.readFully(concatenatedBytes, decompressed)
     assert(decompressed.toSeq === (0 to 127))
   }
+
+  test("SPARK-48506: CompressionCodec getShortName is case insensitive for short names") {
+    CompressionCodec.shortCompressionCodecNames.foreach { case (shortName, codecClass) =>
+      assert(CompressionCodec.getShortName(shortName) === shortName)
+      assert(CompressionCodec.getShortName(shortName.toUpperCase(Locale.ROOT)) === shortName)
+      assert(CompressionCodec.getShortName(codecClass) === shortName)
+      checkError(
+        exception = intercept[SparkIllegalArgumentException] {
+          CompressionCodec.getShortName(codecClass.toUpperCase(Locale.ROOT))
+        },
+        errorClass = "CODEC_SHORT_NAME_NOT_FOUND",
+        parameters = Map("codecName" -> codecClass.toUpperCase(Locale.ROOT)))
+    }
+  }
 }