Skip to content

Commit

Permalink
fix: Fix GPU power utilization logging and error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
SiddhantSadangi committed Aug 20, 2024
1 parent 1902e67 commit 49ecb4c
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
5 changes: 4 additions & 1 deletion src/neptune/common/hardware/gauges/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ def name(self):
return str(self.card_index)

def value(self):
    """Return this card's power usage divided by ``MILLIWATTS_IN_ONE_WATT``.

    The monitor is queried exactly once and that single reading is used for
    both the ``None`` check and the division. Querying a second time could
    yield ``None`` after the check has already passed, which would raise
    ``TypeError`` on the division.

    Returns:
        The reading divided by ``MILLIWATTS_IN_ONE_WATT`` using true division
        (presumably watts, judging by the constant's name -- confirm against
        the monitor implementation), or ``None`` when the monitor returns
        ``None``.
    """
    power_usage = self.__gpu_monitor.get_card_power_usage(self.card_index)
    if power_usage is None:
        # No reading available; propagate None instead of failing on the division.
        return None
    return power_usage / MILLIWATTS_IN_ONE_WATT

def __eq__(self, other):
    """Report whether ``other`` has the same class and the same ``card_index``."""
    # Objects of a different class are never equal, whatever their index.
    if not (self.__class__ == other.__class__):
        return False
    return self.card_index == other.card_index
Expand Down
4 changes: 2 additions & 2 deletions src/neptune/common/hardware/gpu/gpu_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ def __nvml_get_or_else(self, getter, default=None):
try:
nvmlInit()
return getter()
except NVMLError:
except NVMLError as e:
if not GPUMonitor.nvml_error_printed:
warning = (
"Info (NVML): %s. GPU usage metrics may not be reported. For more information, "
f"Info (NVML): {e}. GPU usage metrics may not be reported. For more information, "
"see https://docs.neptune.ai/help/nvml_error/"
)
warn_once(message=warning, exception=NeptuneWarning)
Expand Down

0 comments on commit 49ecb4c

Please sign in to comment.