Skip to content

Commit

Permalink
Halt Spark executor when encountering unrecoverable CUDA errors (#5350)
Browse files Browse the repository at this point in the history
Closes #5029

Detects unrecoverable (fatal) CUDA errors through the cuDF utility, which applies a more comprehensive check to determine whether a CUDA error is fatal or not.

Signed-off-by: sperlingxx <[email protected]>

Co-authored-by: Jason Lowe <[email protected]>
  • Loading branch information
sperlingxx and jlowe authored Jun 13, 2022
1 parent a5165ef commit 4f95734
Showing 1 changed file with 14 additions and 11 deletions.
25 changes: 14 additions & 11 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import scala.collection.mutable.{Map => MutableMap}
import scala.util.Try
import scala.util.matching.Regex

import ai.rapids.cudf.{CudaException, CudaFatalException, CudfException}
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore

import org.apache.spark.{ExceptionFailure, SparkConf, SparkContext, TaskFailedReason}
Expand Down Expand Up @@ -288,19 +289,21 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
override def onTaskFailed(failureReason: TaskFailedReason): Unit = {
failureReason match {
case ef: ExceptionFailure =>
val unrecoverableErrors = Seq("cudaErrorIllegalAddress", "cudaErrorLaunchTimeout",
"cudaErrorHardwareStackError", "cudaErrorIllegalInstruction",
"cudaErrorMisalignedAddress", "cudaErrorInvalidAddressSpace", "cudaErrorInvalidPc",
"cudaErrorLaunchFailure", "cudaErrorExternalDevice", "cudaErrorUnknown",
"cudaErrorECCUncorrectable")
if (unrecoverableErrors.exists(ef.description.contains(_)) ||
unrecoverableErrors.exists(ef.toErrorString.contains(_))) {
logError("Stopping the Executor based on exception being a fatal CUDA error: " +
s"${ef.toErrorString}")
System.exit(20)
ef.exception match {
case Some(_: CudaFatalException) =>
logError("Stopping the Executor based on exception being a fatal CUDA error: " +
s"${ef.toErrorString}")
System.exit(20)
case Some(_: CudaException) =>
logDebug(s"Executor onTaskFailed because of a non-fatal CUDA error: " +
s"${ef.toErrorString}")
case Some(_: CudfException) =>
logDebug(s"Executor onTaskFailed because of a CUDF error: ${ef.toErrorString}")
case _ =>
logDebug(s"Executor onTaskFailed: ${ef.toErrorString}")
}
case other =>
logDebug(s"Executor onTaskFailed not a CUDA fatal error: ${other.toString}")
logDebug(s"Executor onTaskFailed: ${other.toString}")
}
}
}
Expand Down

0 comments on commit 4f95734

Please sign in to comment.