From f4afa2215a1a390d9f099a26155fbefc5beefbe9 Mon Sep 17 00:00:00 2001
From: Kent Yao <yao@apache.org>
Date: Tue, 4 Jun 2024 20:33:51 +0800
Subject: [PATCH] [SPARK-48506][CORE] Compression codec short names are case
 insensitive except for event logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What changes were proposed in this pull request?

The compression codec short names used for e.g. map statuses, broadcasts, shuffle, and parquet/orc/avro outputs are case insensitive, except for event logging. The difference comes from calling `org.apache.spark.io.CompressionCodec.getShortName`, which compares short names exactly.

In this PR, we make `CompressionCodec.getShortName` match short names case-insensitively and return the lowercased short name. Fully qualified codec class names remain case sensitive.

### Why are the changes needed?

Feature parity

### Does this PR introduce _any_ user-facing change?

Yes, `spark.eventLog.compression.codec` now accepts not only the lowercase forms of lz4, lzf, snappy, and zstd, but also forms in which any of the characters are upper case.

### How was this patch tested?

new tests
### Was this patch authored or co-authored using generative AI tooling?
no

Closes #46847 from yaooqinn/SPARK-48506.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: yangjie01 <yangjie01@baidu.com>
---
 .../org/apache/spark/io/CompressionCodec.scala    |  5 +++--
 .../apache/spark/io/CompressionCodecSuite.scala   | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
index 7d5a86d1a81da..233228a9c6d4c 100644
--- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
+++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -101,8 +101,9 @@ private[spark] object CompressionCodec {
    * If it is already a short name, just return it.
    */
   def getShortName(codecName: String): String = {
-    if (shortCompressionCodecNames.contains(codecName)) {
-      codecName
+    val lowercasedCodec = codecName.toLowerCase(Locale.ROOT)
+    if (shortCompressionCodecNames.contains(lowercasedCodec)) {
+      lowercasedCodec
     } else {
       shortCompressionCodecNames
         .collectFirst { case (k, v) if v == codecName => k }
diff --git a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
index 729fcecff1207..5c09a1f965b9e 100644
--- a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
+++ b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.io
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.util.Locale
 
 import com.google.common.io.ByteStreams
 
@@ -160,4 +161,18 @@ class CompressionCodecSuite extends SparkFunSuite {
     ByteStreams.readFully(concatenatedBytes, decompressed)
     assert(decompressed.toSeq === (0 to 127))
   }
+
+  test("SPARK-48506: CompressionCodec getShortName is case insensitive for short names") {
+    CompressionCodec.shortCompressionCodecNames.foreach { case (shortName, codecClass) =>
+      assert(CompressionCodec.getShortName(shortName) === shortName)
+      assert(CompressionCodec.getShortName(shortName.toUpperCase(Locale.ROOT)) === shortName)
+      assert(CompressionCodec.getShortName(codecClass) === shortName)
+      checkError(
+        exception = intercept[SparkIllegalArgumentException] {
+          CompressionCodec.getShortName(codecClass.toUpperCase(Locale.ROOT))
+        },
+        errorClass = "CODEC_SHORT_NAME_NOT_FOUND",
+        parameters = Map("codecName" -> codecClass.toUpperCase(Locale.ROOT)))
+    }
+  }
 }