From 207fbfced356e0fecb0181b647e782086842fb31 Mon Sep 17 00:00:00 2001
From: Thomas Graves
Date: Fri, 1 Apr 2022 12:50:42 -0500
Subject: [PATCH] On task failure catch some CUDA exceptions and kill executor [databricks] (#5118)

* On task failure catch some cuda exceptions and kill executor

Signed-off-by: Thomas Graves

* include other exceptions

* cleanup logs

* fix message

* update other message to debug

* don't call super

* comment out checking all error string

* fix extra space

* Check the entire stack trace

* remove extra comment
---
 .../com/nvidia/spark/rapids/Plugin.scala      | 32 ++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
index 1c10dad637f..36a7ef86540 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
@@ -28,7 +28,7 @@ import scala.util.matching.Regex
 import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
 import com.nvidia.spark.rapids.shims.SparkShimImpl
 
-import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.{ExceptionFailure, SparkConf, SparkContext, TaskFailedReason}
 import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
@@ -275,6 +275,36 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
     GpuDeviceManager.shutdown()
     Option(rapidsShuffleHeartbeatEndpoint).foreach(_.close())
   }
+
+  /**
+   * Invoked when a task on this executor fails. If the failure is an [[ExceptionFailure]]
+   * whose description or error string contains one of the CUDA error names listed below,
+   * the error is logged and the executor JVM is terminated with exit code 20. Failure
+   * reasons that are not an [[ExceptionFailure]] are only logged at debug level.
+   *
+   * @param failureReason the reason reported for the task failure
+   */
+  override def onTaskFailed(failureReason: TaskFailedReason): Unit = {
+    failureReason match {
+      case ef: ExceptionFailure =>
+        val unrecoverableErrors = Seq("cudaErrorIllegalAddress", "cudaErrorLaunchTimeout",
+          "cudaErrorHardwareStackError", "cudaErrorIllegalInstruction",
+          "cudaErrorMisalignedAddress", "cudaErrorInvalidAddressSpace", "cudaErrorInvalidPc",
+          "cudaErrorLaunchFailure", "cudaErrorExternalDevice", "cudaErrorUnknown",
+          "cudaErrorECCUncorrectable")
+        // The description can be null when the failing exception carried no message. Guard
+        // it so a NullPointerException cannot skip the error-string check that follows.
+        val description = Option(ef.description).getOrElse("")
+        val errorString = ef.toErrorString
+        if (unrecoverableErrors.exists(e => description.contains(e) || errorString.contains(e))) {
+          logError("Stopping the Executor based on exception being a fatal CUDA error: " +
+            s"${errorString}")
+          System.exit(20)
+        }
+      case other =>
+        logDebug(s"Executor onTaskFailed not a CUDA fatal error: ${other.toString}")
+    }
+  }
 }
 
 object RapidsExecutorPlugin {