Skip to content

Commit

Permalink
On task failure catch some CUDA exceptions and kill executor [databri…
Browse files Browse the repository at this point in the history
…cks] (#5118)

* On task failure catch some cuda exceptions and kill executor

Signed-off-by: Thomas Graves <[email protected]>

* include other exceptions

* cleanup logs

* fix message

* update other message to debug

* don't call super

* comment out checking all error string

* fix extra space

* Check the entire stack trace

* remove extra comment
  • Loading branch information
tgravescs authored Apr 1, 2022
1 parent 679721e commit 207fbfc
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import scala.util.matching.Regex
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
import com.nvidia.spark.rapids.shims.SparkShimImpl

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.{ExceptionFailure, SparkConf, SparkContext, TaskFailedReason}
import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext}
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
Expand Down Expand Up @@ -275,6 +275,25 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
GpuDeviceManager.shutdown()
Option(rapidsShuffleHeartbeatEndpoint).foreach(_.close())
}

/**
 * Task-failure callback: terminates the executor JVM when the failure looks like a fatal
 * CUDA error.
 *
 * For an `ExceptionFailure`, both the failure description and the full error string are
 * searched for any of a fixed list of CUDA error names. On a match the error is logged and
 * the process exits with status 20. Any other kind of failure reason is only logged at
 * debug level.
 *
 * @param failureReason the reason the task failed, as supplied to this callback
 */
override def onTaskFailed(failureReason: TaskFailedReason): Unit = {
  failureReason match {
    case ef: ExceptionFailure =>
      val unrecoverableErrors = Seq("cudaErrorIllegalAddress", "cudaErrorLaunchTimeout",
        "cudaErrorHardwareStackError", "cudaErrorIllegalInstruction",
        "cudaErrorMisalignedAddress", "cudaErrorInvalidAddressSpace", "cudaErrorInvalidPc",
        "cudaErrorLaunchFailure", "cudaErrorExternalDevice", "cudaErrorUnknown",
        "cudaErrorECCUncorrectable")
      // Build the error string once; it is used for both matching and logging.
      val errorString = ef.toErrorString
      // NOTE(review): the description appears to come from the exception's message, which
      // may be null for exceptions created without one. Wrapping in Option keeps a null
      // from aborting the check before the full error string has been examined.
      val textsToSearch = Seq(Option(ef.description), Option(errorString)).flatten
      val isFatalCudaError = textsToSearch.exists { text =>
        unrecoverableErrors.exists(name => text.contains(name))
      }
      if (isFatalCudaError) {
        logError("Stopping the Executor based on exception being a fatal CUDA error: " +
          s"${errorString}")
        System.exit(20)
      }
    case other =>
      logDebug(s"Executor onTaskFailed not a CUDA fatal error: ${other.toString}")
  }
}
}

object RapidsExecutorPlugin {
Expand Down

0 comments on commit 207fbfc

Please sign in to comment.