Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

On task failure catch some CUDA exceptions and kill executor [databricks] #5118

Merged
merged 10 commits into from
Apr 1, 2022
22 changes: 21 additions & 1 deletion sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import scala.util.matching.Regex
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
import com.nvidia.spark.rapids.shims.SparkShimImpl

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.{ExceptionFailure, SparkConf, SparkContext, TaskFailedReason}
import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext}
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
Expand Down Expand Up @@ -275,6 +275,26 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
GpuDeviceManager.shutdown()
Option(rapidsShuffleHeartbeatEndpoint).foreach(_.close())
}

/**
 * Executor-side hook invoked with the reason a task failed.
 *
 * If the failure is an [[ExceptionFailure]] whose description contains the name of one of
 * the CUDA errors listed below, the error is logged and the executor JVM is terminated
 * with exit status 20. Any other `ExceptionFailure` is ignored, and every other failure
 * reason is only logged at debug level.
 *
 * @param failureReason the reason reported for the failed task
 */
override def onTaskFailed(failureReason: TaskFailedReason): Unit = {
  failureReason match {
    case ef: ExceptionFailure =>
      // CUDA error names treated as fatal for this executor.
      // NOTE(review): presumably the device/context is unusable after these - confirm
      // against the CUDA runtime documentation.
      val unrecoverableErrors = Seq("cudaErrorIllegalAddress", "cudaErrorLaunchTimeout",
        "cudaErrorHardwareStackError", "cudaErrorIllegalInstruction",
        "cudaErrorMisalignedAddress", "cudaErrorInvalidAddressSpace", "cudaErrorInvalidPc",
        "cudaErrorLaunchFailure", "cudaErrorExternalDevice", "cudaErrorUnknown",
        "cudaErrorECCUncorrectable")
      // Could check the entire error string, but don't think it's necessary
      // unrecoverableErrors.exists(ef.toErrorString.contains(_))
      //
      // The description comes from the exception message and may be null (an exception
      // created without a message), so wrap it in Option rather than dereferencing it
      // directly; a null description simply means "not a fatal CUDA error".
      val isFatalCudaError = Option(ef.description).exists { desc =>
        unrecoverableErrors.exists(desc.contains(_))
      }
      if (isFatalCudaError) {
        logError("Stopping the Executor based on exception being a fatal CUDA error: " +
          s"${ef.toErrorString}")
        System.exit(20)
      }
    case other =>
      logDebug(s"Executor onTaskFailed not a CUDA fatal error: ${other.toString}")
  }
}
}

object RapidsExecutorPlugin {
Expand Down