NVIDIA · kuhushukla · Nov 22, 2024 · Nov 25, 2024 · Nov 25, 2024 · revans2
diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py
@@ -26,10 +26,17 @@
 pytestmark = pytest.mark.nightly_resource_consuming_test
 
 orc_write_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
-        string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)),
+        string_gen, BooleanGen(nullable=False), DateGen(start=date(1590, 1, 1)),
+        TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc)) ] + \
+        decimal_gens
+# Use every type except boolean , see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+# https://github.com/rapidsai/cudf/issues/6763 .
+# Once the first issue is fixed, we can replace this list with
+# orc_write_basic_gens
+orc_write_basic_gens_for_structs = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
+        string_gen, DateGen(start=date(1590, 1, 1)),
         TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc)) ] + \
         decimal_gens
-
 all_nulls_string_gen = SetValuesGen(StringType(), [None])
 empty_or_null_string_gen = SetValuesGen(StringType(), [None, ""])
 all_empty_string_gen = SetValuesGen(StringType(), [""])
@@ -52,7 +59,8 @@
         all_nulls_map_gen,
         all_empty_map_gen]
 
-orc_write_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(orc_write_basic_gens)])
+orc_write_basic_struct_gen = StructGen(
+    [['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(orc_write_basic_gens_for_structs)])
 
 orc_write_struct_gens_sample = [orc_write_basic_struct_gen,
     StructGen([['child0', byte_gen], ['child1', orc_write_basic_struct_gen]]),
@@ -64,13 +72,14 @@
     ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]
 
 orc_write_basic_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [
-    BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen,
+    ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen,
     # Using timestamps from 1970 to work around a cudf ORC bug
     # https://github.com/NVIDIA/spark-rapids/issues/140.
     lambda nullable=True: TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc), nullable=nullable),
     lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable),
     lambda nullable=True: DecimalGen(precision=15, scale=1, nullable=nullable),
-    lambda nullable=True: DecimalGen(precision=36, scale=5, nullable=nullable)]]
+    lambda nullable=True: DecimalGen(precision=36, scale=5, nullable=nullable)]] + [MapGen(
+    f(nullable=False), f(nullable=False)) for f in [BooleanGen]]
 
 orc_write_gens_list = [orc_write_basic_gens,
         orc_write_struct_gens_sample,
@@ -79,6 +88,8 @@
         pytest.param([date_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/139')),
         pytest.param([timestamp_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/140'))]
 
+nullable_bools_gen = [pytest.param([BooleanGen(nullable=True)],
+                                 marks=pytest.mark.allow_non_gpu('ExecutedCommandExec','DataWritingCommandExec'))]
 @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn)
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 @allow_non_gpu(*non_utc_allow)
@@ -91,6 +102,18 @@ def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
             data_path,
             conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
 
+@pytest.mark.parametrize('orc_gens', nullable_bools_gen, ids=idfn)
+@pytest.mark.parametrize('orc_impl', ["native", "hive"])
+@allow_non_gpu('ExecutedCommandExec', 'DataWritingCommandExec', 'WriteFilesExec')
+def test_write_round_trip_null_bool(spark_tmp_path, orc_gens, orc_impl):
+    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
+    data_path = spark_tmp_path + '/ORC_DATA'
+    assert_gpu_and_cpu_writes_are_equal_collect(
+        lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path),
+        lambda spark, path: spark.read.orc(path),
+        data_path,
+        conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
+
 @pytest.mark.parametrize('orc_gen', orc_write_odd_empty_strings_gens_sample, ids=idfn)
 @pytest.mark.parametrize('orc_impl', ["native", "hive"])
 def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl):
@@ -103,7 +126,7 @@ def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl):
             conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True})
 
 orc_part_write_gens = [
-        byte_gen, short_gen, int_gen, long_gen, boolean_gen,
+        byte_gen, short_gen, int_gen, long_gen, BooleanGen(nullable=False),
         # Some file systems have issues with UTF8 strings so to help the test pass even there
         StringGen('(\\w| ){0,50}'),
         # Once https://github.com/NVIDIA/spark-rapids/issues/139 is fixed replace this with
@@ -345,7 +368,7 @@ def test_orc_write_column_name_with_dots(spark_tmp_path):
                 ("f.g", int_gen),
                 ("h", string_gen)])),
             ("i.j", long_gen)])),
-        ("k", boolean_gen)]
+        ("k", BooleanGen(nullable=False))]
     assert_gpu_and_cpu_writes_are_equal_collect(
         lambda spark, path:  gen_df(spark, gens).coalesce(1).write.orc(path),
         lambda spark, path: spark.read.orc(path),

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -1243,6 +1243,14 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
     .booleanConf
     .createWithDefault(true)
 
+  val ENABLE_ORC_NULLABLE_BOOL = conf("spark.rapids.sql.format.orc.write.boolType.enabled")
+    .doc("When set to false disables nullable boolean columns for ORC writes." +
+      "Set to true if your data does not have null booleans and want tp experiment" +
+      "See https://github.com/NVIDIA/spark-rapids/issues/11736.")
+    .internal()
+    .booleanConf
+    .createWithDefault(false)
+
   val ENABLE_EXPAND_PREPROJECT = conf("spark.rapids.sql.expandPreproject.enabled")
     .doc("When set to false disables the pre-projection for GPU Expand. " +
       "Pre-projection leverages the tiered projection to evaluate expressions that " +
@@ -2964,6 +2972,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
 
   lazy val maxNumOrcFilesParallel: Int = get(ORC_MULTITHREAD_READ_MAX_NUM_FILES_PARALLEL)
 
+  lazy val isOrcBoolNullTypeEnabled: Boolean = get(ENABLE_ORC_NULLABLE_BOOL)
+
   lazy val isCsvEnabled: Boolean = get(ENABLE_CSV)
 
   lazy val isCsvReadEnabled: Boolean = get(ENABLE_CSV_READ)

diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuOrcFileFormat.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuOrcFileFormat.scala
@@ -44,6 +44,27 @@ object GpuOrcFileFormat extends Logging {
     cls == classOf[OrcFileFormat] || cls.getCanonicalName.equals(HIVE_IMPL_CLASS)
   }
 
+  private def checkForBoolNulls(dataType: DataType): Boolean = {
+    dataType match {
+      case ArrayType(elementType, t) => elementType == BooleanType && t
+      case StructType(fields) =>
+        fields.exists { f =>
+          hasBoolNulls(f.dataType, f.nullable)
+        }
+      case MapType(_, valueType, t) => hasBoolNulls(valueType, t)
+    }
+  }
+
+  private def hasBoolNulls(d: DataType, nulls: Boolean) = {
+    if (nulls && d == BooleanType) {
+      true
+    } else if (DataTypeUtils.isNestedType(d)) {
+      checkForBoolNulls(d)
+    } else {
+      false
+    }
+  }
+
   def tagGpuSupport(meta: RapidsMeta[_, _, _],
                     spark: SparkSession,
                     options: Map[String, String],
@@ -83,6 +104,12 @@ object GpuOrcFileFormat extends Logging {
     // [[org.apache.spark.sql.execution.datasources.DaysWritable]] object
     // which is a subclass of [[org.apache.hadoop.hive.serde2.io.DateWritable]].
     val types = schema.map(_.dataType).toSet
+    val res = schema.exists {
+      case field if field.dataType == BooleanType && field.nullable => true
+      case field if DataTypeUtils.isNestedType(field.dataType) => checkForBoolNulls(field.dataType)
+      case _ => false
+    }
+
     if (types.exists(GpuOverrides.isOrContainsDateOrTimestamp(_))) {
       if (!GpuOverrides.isUTCTimezone()) {
         meta.willNotWorkOnGpu("Only UTC timezone is supported for ORC. " +
@@ -91,6 +118,10 @@ object GpuOrcFileFormat extends Logging {
       }
     }
 
+    if (res && !meta.conf.isOrcBoolNullTypeEnabled) {
+      meta.willNotWorkOnGpu("Nullable Booleans can not work in certain cases with ORC writer." +
+        "See https://github.com/rapidsai/cudf/issues/6763")
+    }
     FileFormatChecks.tag(meta, schema, OrcFormatType, WriteFileOp)
 
     val sqlConf = spark.sessionState.conf