From 122370cb3fa497434244929d052ce73a61fe5108 Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 21 Aug 2023 08:56:44 -0700 Subject: [PATCH] Retry with smaller split on CudfColumnSizeOverflowException Depends on https://github.com/rapidsai/cudf/pull/13911. When a CUDF operation causes a column's size to exceed the valid range for CUDF columns (i.e. cudf::size_type), CUDF will throw an exception. Prior to this commit, the `RmmRapidsRetryIterator` did not attempt retries with smaller splits in this case. Instead, the overflow was treated as a generic exception. This commit allows the `RmmRapidsRetryIterator` to recognize the exception specific to the overflow case (i.e. `CudfColumnSizeOverflowException`), and attempt a split-retry. Note: This error condition is difficult to reproduce. The catch/retry is a "best effort" attempt not to fail the entire task. Signed-off-by: MithunR --- .../spark/rapids/RmmRapidsRetryIterator.scala | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala index f25628cb66b3..fc307bd0a755 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala @@ -18,6 +18,8 @@ package com.nvidia.spark.rapids import scala.collection.mutable +import ai.rapids.cudf.CudfColumnSizeOverflowException + import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion @@ -580,9 +582,14 @@ object RmmRapidsRetryIterator extends Logging { lastException = ex if (!topLevelIsRetry && !causedByRetry) { - // we want to throw early here, since we got an exception - // we were not prepared to handle - throw lastException + // If the exception is the 
result of a CUDF column size overflow, attempt split-retry. + ex match { + case _: CudfColumnSizeOverflowException => doSplit = true + case _ => + // we want to throw early here, since we got an exception + // we were not prepared to handle + throw lastException + } } // else another exception wrapped a retry. So we are going to try again }