From 4f4172fa612c02a5f69ccad2ebbf9d64f989ed34 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 13 Apr 2022 09:48:06 -0500 Subject: [PATCH 1/2] Throw again after logging that RMM could not initialize Signed-off-by: Alessandro Bellina --- .../main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala index 260127e6971..26fda9aaea6 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala @@ -301,7 +301,9 @@ object GpuDeviceManager extends Logging { Rmm.initialize(init, logConf, poolAllocation) RapidsBufferCatalog.init(conf) } catch { - case e: Exception => logError("Could not initialize RMM", e) + case e: CudfException => + logError("Could not initialize RMM, exiting!", e) + throw e } } } From f6545d334ecf1dd7e496e12b5212bbc7afe1fc5f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 13 Apr 2022 12:34:15 -0500 Subject: [PATCH 2/2] Let the exception from RMM propagate as is, and slightly change the wording in the ExecutorPlugin --- .../com/nvidia/spark/rapids/GpuDeviceManager.scala | 12 +++--------- .../main/scala/com/nvidia/spark/rapids/Plugin.scala | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala index 26fda9aaea6..f5e5a3a940a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala @@ -296,15 +296,9 @@ object GpuDeviceManager extends Logging { logInfo("Using legacy default stream") } - try { - Cuda.setDevice(gpuId) - Rmm.initialize(init, logConf, poolAllocation) - 
RapidsBufferCatalog.init(conf) - } catch { - case e: CudfException => - logError("Could not initialize RMM, exiting!", e) - throw e - } + Cuda.setDevice(gpuId) + Rmm.initialize(init, logConf, poolAllocation) + RapidsBufferCatalog.init(conf) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index 36a7ef86540..f91b310eda4 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -238,7 +238,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { // Exceptions in executor plugin can cause a single thread to die but the executor process // sticks around without any useful info until it hearbeat times out. Print what happened // and exit immediately. - logError("Exception in the executor plugin", e) + logError("Exception in the executor plugin, shutting down!", e) System.exit(1) } }