Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Halt Spark executor when encountering unrecoverable CUDA errors #5350

Merged
merged 11 commits into from
Jun 13, 2022
25 changes: 14 additions & 11 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import scala.collection.mutable.{Map => MutableMap}
import scala.util.Try
import scala.util.matching.Regex

import ai.rapids.cudf.{CudaException, CudaFatalException, CudfException}
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore

import org.apache.spark.{ExceptionFailure, SparkConf, SparkContext, TaskFailedReason}
Expand Down Expand Up @@ -288,19 +289,21 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
override def onTaskFailed(failureReason: TaskFailedReason): Unit = {
failureReason match {
case ef: ExceptionFailure =>
val unrecoverableErrors = Seq("cudaErrorIllegalAddress", "cudaErrorLaunchTimeout",
"cudaErrorHardwareStackError", "cudaErrorIllegalInstruction",
"cudaErrorMisalignedAddress", "cudaErrorInvalidAddressSpace", "cudaErrorInvalidPc",
"cudaErrorLaunchFailure", "cudaErrorExternalDevice", "cudaErrorUnknown",
"cudaErrorECCUncorrectable")
if (unrecoverableErrors.exists(ef.description.contains(_)) ||
unrecoverableErrors.exists(ef.toErrorString.contains(_))) {
logError("Stopping the Executor based on exception being a fatal CUDA error: " +
s"${ef.toErrorString}")
System.exit(20)
ef.exception match {
case _: CudaFatalException =>
logError("Stopping the Executor based on exception being a fatal CUDA error: " +
s"${ef.toErrorString}")
System.exit(20)
case _: CudaException =>
logDebug(s"Executor onTaskFailed because of a non-fatal CUDA error: " +
s"${ef.toErrorString}")
case _: CudfException =>
logDebug(s"Executor onTaskFailed because of a CUDF error: ${ef.toErrorString}")
sperlingxx marked this conversation as resolved.
Show resolved Hide resolved
case _ =>
logDebug(s"Executor onTaskFailed: ${ef.toErrorString}")
}
case other =>
logDebug(s"Executor onTaskFailed not a CUDA fatal error: ${other.toString}")
logDebug(s"Executor onTaskFailed: ${other.toString}")
}
}
}
Expand Down