From f4afa2215a1a390d9f099a26155fbefc5beefbe9 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Tue, 4 Jun 2024 20:33:51 +0800 Subject: [PATCH] [SPARK-48506][CORE] Compression codec short names are case insensitive except for event logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? Compression codec short names are case insensitive wherever they are configured (e.g. for map statuses, broadcasts, shuffle, and parquet/orc/avro outputs), except for event logging. The inconsistency comes from `org.apache.spark.io.CompressionCodec.getShortName`, which matched short names case-sensitively. In this PR, we make `CompressionCodec.getShortName` lowercase its input before looking up short names, so that short names are matched case-insensitively. ### Why are the changes needed? Feature parity: event logging should accept the same spellings of codec short names as the other compression settings. ### Does this PR introduce _any_ user-facing change? Yes, spark.eventLog.compression.codec now accepts not only the lowercased forms of lz4, lzf, snappy, and zstd, but also any upper- or mixed-case spelling of them (e.g. LZ4, Zstd). ### How was this patch tested? New tests in `CompressionCodecSuite`. ### Was this patch authored or co-authored using generative AI tooling? no Closes #46847 from yaooqinn/SPARK-48506. Authored-by: Kent Yao Signed-off-by: yangjie01 --- .../org/apache/spark/io/CompressionCodec.scala | 5 +++-- .../apache/spark/io/CompressionCodecSuite.scala | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 7d5a86d1a81da..233228a9c6d4c 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -101,8 +101,9 @@ private[spark] object CompressionCodec { * If it is already a short name, just return it. 
*/ def getShortName(codecName: String): String = { - if (shortCompressionCodecNames.contains(codecName)) { - codecName + val lowercasedCodec = codecName.toLowerCase(Locale.ROOT) + if (shortCompressionCodecNames.contains(lowercasedCodec)) { + lowercasedCodec } else { shortCompressionCodecNames .collectFirst { case (k, v) if v == codecName => k } diff --git a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala index 729fcecff1207..5c09a1f965b9e 100644 --- a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala +++ b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.io import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.util.Locale import com.google.common.io.ByteStreams @@ -160,4 +161,18 @@ class CompressionCodecSuite extends SparkFunSuite { ByteStreams.readFully(concatenatedBytes, decompressed) assert(decompressed.toSeq === (0 to 127)) } + + test("SPARK-48506: CompressionCodec getShortName is case insensitive for short names") { + CompressionCodec.shortCompressionCodecNames.foreach { case (shortName, codecClass) => + assert(CompressionCodec.getShortName(shortName) === shortName) + assert(CompressionCodec.getShortName(shortName.toUpperCase(Locale.ROOT)) === shortName) + assert(CompressionCodec.getShortName(codecClass) === shortName) + checkError( + exception = intercept[SparkIllegalArgumentException] { + CompressionCodec.getShortName(codecClass.toUpperCase(Locale.ROOT)) + }, + errorClass = "CODEC_SHORT_NAME_NOT_FOUND", + parameters = Map("codecName" -> codecClass.toUpperCase(Locale.ROOT))) + } + } }