Skip to content

Commit

Permalink
GPU power utilization added to monitoring namespace. (#1854)
Browse files Browse the repository at this point in the history
Co-authored-by: Siddhant Sadangi <[email protected]>
  • Loading branch information
harishankar-gopalan and SiddhantSadangi authored Aug 19, 2024
1 parent 7ddbb03 commit f668ad0
Show file tree
Hide file tree
Showing 18 changed files with 153 additions and 103 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
## [UNRELEASED] neptune 1.10.5
## neptune 1.11.0

### Features
- Added GPU power consumption monitoring ([#1854](https://github.com/neptune-ai/neptune-client/pull/1854))

### Changes
- Added docstring for the `pop()` function ([#1781](https://github.com/neptune-ai/neptune-client/pull/1781))


## neptune 1.10.4

### Fixes
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,6 @@ module = [
"neptune.internal.background_job",
"neptune.internal.container_structure",
"neptune.internal.credentials",
"neptune.internal.hardware.gpu.gpu_monitor",
"neptune.internal.hardware.hardware_metric_reporting_job",
"neptune.internal.id_formats",
"neptune.internal.init.model",
Expand Down
3 changes: 2 additions & 1 deletion src/neptune/common/hardware/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
__all__ = ["BYTES_IN_ONE_MB", "BYTES_IN_ONE_GB"]
__all__ = ["BYTES_IN_ONE_MB", "BYTES_IN_ONE_GB", "MILLIWATTS_IN_ONE_WATT"]

# Binary size units: 2**20 bytes per megabyte and 2**30 bytes per gigabyte.
BYTES_IN_ONE_MB = 2**20
BYTES_IN_ONE_GB = 2**30
# Divisor used to convert a quantity expressed in milliwatts into watts.
MILLIWATTS_IN_ONE_WATT = 10**3
7 changes: 6 additions & 1 deletion src/neptune/common/hardware/gauges/gauge_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from neptune.common.hardware.gauges.gauge_mode import GaugeMode
from neptune.common.hardware.gauges.gpu import (
GpuMemoryGauge,
GpuPowerGauge,
GpuUsageGauge,
)
from neptune.common.hardware.gauges.memory import (
Expand Down Expand Up @@ -56,5 +57,9 @@ def create_gpu_usage_gauge(card_index):
def create_gpu_memory_gauge(card_index):
return GpuMemoryGauge(card_index=card_index)

@staticmethod
def create_gpu_power_gauge(card_index):
    """Build a ``GpuPowerGauge`` bound to the GPU card at ``card_index``."""
    return GpuPowerGauge(card_index=card_index)

def __invalid_gauge_mode_exception(self):
return ValueError(str("Invalid gauge mode: {}".format(self.__gauge_mode)))
return ValueError(f"Invalid gauge mode: {self.__gauge_mode}")
27 changes: 24 additions & 3 deletions src/neptune/common/hardware/gauges/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from neptune.common.hardware.constants import BYTES_IN_ONE_GB
from neptune.common.hardware.constants import (
BYTES_IN_ONE_GB,
MILLIWATTS_IN_ONE_WATT,
)
from neptune.common.hardware.gauges.gauge import Gauge
from neptune.common.hardware.gpu.gpu_monitor import GPUMonitor

Expand All @@ -33,7 +36,7 @@ def __eq__(self, other):
return self.__class__ == other.__class__ and self.card_index == other.card_index

def __repr__(self):
return str("GpuUsageGauge")
return "GpuUsageGauge"


class GpuMemoryGauge(Gauge):
Expand All @@ -51,4 +54,22 @@ def __eq__(self, other):
return self.__class__ == other.__class__ and self.card_index == other.card_index

def __repr__(self):
return str("GpuMemoryGauge")
return "GpuMemoryGauge"


class GpuPowerGauge(Gauge):
    """Gauge that reports the power usage of a single GPU card in whole watts."""

    def __init__(self, card_index):
        """Bind the gauge to the card at ``card_index`` and create its ``GPUMonitor``."""
        self.card_index = card_index
        self.__gpu_monitor = GPUMonitor()

    def name(self):
        """Return the card index rendered as a string."""
        return str(self.card_index)

    def value(self):
        """Return the card's power usage floor-divided by ``MILLIWATTS_IN_ONE_WATT``.

        Returns ``None`` when the monitor yields no reading, instead of raising
        ``TypeError`` from ``None // int``.
        """
        power_usage = self.__gpu_monitor.get_card_power_usage(self.card_index)
        if power_usage is None:
            # The monitor can hand back None when the NVML query does not succeed.
            # NOTE(review): assumes consumers of gauge values skip None readings -
            # confirm against the metric reporter.
            return None
        # Floor division: any sub-watt remainder of the reading is discarded.
        return power_usage // MILLIWATTS_IN_ONE_WATT

    def __eq__(self, other):
        """Gauges are equal when they share a class and a card index."""
        return self.__class__ == other.__class__ and self.card_index == other.card_index

    def __repr__(self):
        """Return the fixed label ``GpuPowerGauge`` (the card index is not included)."""
        return "GpuPowerGauge"
24 changes: 20 additions & 4 deletions src/neptune/common/hardware/gpu/gpu_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,24 @@
# limitations under the License.
#

from neptune.common.hardware.constants import MILLIWATTS_IN_ONE_WATT
from neptune.common.warnings import (
NeptuneWarning,
warn_once,
)
from neptune.vendor.pynvml import (
NVMLError,
nvmlDeviceGetCount,
nvmlDeviceGetEnforcedPowerLimit,
nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo,
nvmlDeviceGetPowerUsage,
nvmlDeviceGetUtilizationRates,
nvmlInit,
)


class GPUMonitor(object):

nvml_error_printed = False

def get_card_count(self):
Expand All @@ -43,6 +45,22 @@ def get_card_usage_percent(self, card_index):
def get_card_used_memory_in_bytes(self, card_index):
return self.__nvml_get_or_else(lambda: nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(card_index)).used)

def get_card_power_usage(self, card_index):
    """Return the power usage NVML reports for the card at ``card_index``.

    The query runs through ``__nvml_get_or_else`` without an explicit
    default, so that helper's own default (``None``) applies.
    Presumably the reading is in milliwatts - confirm against the NVML
    documentation for ``nvmlDeviceGetPowerUsage``.
    """
    return self.__nvml_get_or_else(lambda: nvmlDeviceGetPowerUsage(nvmlDeviceGetHandleByIndex(card_index)))

def get_card_max_power_rating(self):
    """Return the largest enforced power limit among all cards.

    Each card's enforced limit is floor-divided by ``MILLIWATTS_IN_ONE_WATT``
    before the maximum is taken. Returns ``0`` when the NVML helper yields a
    falsy result (its ``0`` default or an empty list).
    """

    def limit_for_card(card_index):
        # Enforced limit of one card, converted with the milliwatt divisor.
        handle = nvmlDeviceGetHandleByIndex(card_index)
        return nvmlDeviceGetEnforcedPowerLimit(handle) // MILLIWATTS_IN_ONE_WATT

    def collect_limits():
        # One entry per card index in range(nvmlDeviceGetCount()).
        return [limit_for_card(card_index) for card_index in range(nvmlDeviceGetCount())]

    limits = self.__nvml_get_or_else(collect_limits, default=0)
    if not limits:
        return 0
    return max(limits)

def get_top_card_memory_in_bytes(self):
def read_top_card_memory_in_bytes():
return self.__nvml_get_or_else(
Expand All @@ -54,9 +72,7 @@ def read_top_card_memory_in_bytes():
)

memory_per_card = read_top_card_memory_in_bytes()
if not memory_per_card:
return 0
return max(memory_per_card)
return max(memory_per_card) if memory_per_card else 0

def __nvml_get_or_else(self, getter, default=None):
try:
Expand Down
1 change: 1 addition & 0 deletions src/neptune/common/hardware/metrics/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,5 @@ class MetricResourceType(object):
RAM = "MEMORY"
GPU = "GPU"
GPU_RAM = "GPU_MEMORY"
GPU_POWER = "GPU_POWER"
OTHER = "OTHER"
4 changes: 3 additions & 1 deletion src/neptune/common/hardware/metrics/metrics_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@


class MetricsContainer(object):
def __init__(self, cpu_usage_metric, memory_metric, gpu_usage_metric, gpu_memory_metric):
def __init__(self, cpu_usage_metric, memory_metric, gpu_usage_metric, gpu_memory_metric, gpu_power_usage_metric):
self.cpu_usage_metric = cpu_usage_metric
self.memory_metric = memory_metric
self.gpu_usage_metric = gpu_usage_metric
self.gpu_memory_metric = gpu_memory_metric
self.gpu_power_usage_metric = gpu_power_usage_metric

def metrics(self):
return [
Expand All @@ -30,6 +31,7 @@ def metrics(self):
self.memory_metric,
self.gpu_usage_metric,
self.gpu_memory_metric,
self.gpu_power_usage_metric,
]
if metric is not None
]
16 changes: 16 additions & 0 deletions src/neptune/common/hardware/metrics/metrics_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ def create_metrics_container(self):
has_gpu = self.__system_resource_info.has_gpu()
gpu_usage_metric = self.__create_gpu_usage_metric() if has_gpu else None
gpu_memory_metric = self.__create_gpu_memory_metric() if has_gpu else None
gpu_power_usage_metric = self.__create_gpu_power_usage_metric() if has_gpu else None

return MetricsContainer(
cpu_usage_metric=cpu_usage_metric,
memory_metric=memory_metric,
gpu_usage_metric=gpu_usage_metric,
gpu_memory_metric=gpu_memory_metric,
gpu_power_usage_metric=gpu_power_usage_metric,
)

def __create_cpu_usage_metric(self):
Expand Down Expand Up @@ -90,3 +92,17 @@ def __create_gpu_memory_metric(self):
for card_index in self.__system_resource_info.gpu_card_indices
],
)

def __create_gpu_power_usage_metric(self):
    """Assemble the "GPU - power usage" metric, with one power gauge per GPU card.

    The metric is expressed in watts ("W"), starts at 0.0 and is capped at the
    ``gpu_max_power_watts`` value of the system resource info.
    """
    resource_info = self.__system_resource_info
    card_count_label = "{} cards".format(resource_info.gpu_card_count)
    power_ceiling = resource_info.gpu_max_power_watts

    # One gauge per card index, created in the order the indices are listed.
    power_gauges = []
    for card_index in resource_info.gpu_card_indices:
        power_gauges.append(self.__gauge_factory.create_gpu_power_gauge(card_index=card_index))

    return Metric(
        name="GPU - power usage",
        description=card_count_label,
        resource_type=MetricResourceType.GPU_POWER,
        unit="W",
        min_value=0.0,
        max_value=power_ceiling,
        gauges=power_gauges,
    )
6 changes: 6 additions & 0 deletions src/neptune/common/hardware/resources/system_resource_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ def __init__(
memory_amount_bytes,
gpu_card_indices,
gpu_memory_amount_bytes,
gpu_max_power_watts,
):
self.__cpu_core_count = cpu_core_count
self.__memory_amount_bytes = memory_amount_bytes
self.__gpu_card_indices = gpu_card_indices
self.__gpu_memory_amount_bytes = gpu_memory_amount_bytes
self.__gpu_max_power_watts = gpu_max_power_watts

@property
def cpu_core_count(self):
Expand All @@ -48,6 +50,10 @@ def gpu_card_indices(self):
def gpu_memory_amount_bytes(self):
return self.__gpu_memory_amount_bytes

@property
def gpu_max_power_watts(self):
    """Return the ``gpu_max_power_watts`` value supplied at construction."""
    return self.__gpu_max_power_watts

def has_gpu(self):
    """Return ``True`` when ``gpu_card_count`` is greater than zero."""
    return self.gpu_card_count > 0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,15 @@ def create(self, gauge_mode):
elif gauge_mode == GaugeMode.CGROUP:
return self.__create_cgroup_resource_info()
else:
raise ValueError(str("Unknown gauge mode: {}".format(gauge_mode)))
raise ValueError(f"Unknown gauge mode: {gauge_mode}")

def __create_whole_system_resource_info(self):
    """Build a ``SystemResourceInfo`` from the system monitor and GPU monitor readings.

    Values are gathered in the same order as they are passed to the constructor.
    """
    core_count = float(self.__system_monitor.cpu_count())
    total_memory = self.__system_monitor.virtual_memory().total
    card_indices = self.__gpu_card_indices_provider.get()
    top_card_memory = self.__gpu_monitor.get_top_card_memory_in_bytes()
    max_power = self.__gpu_monitor.get_card_max_power_rating()

    return SystemResourceInfo(
        cpu_core_count=core_count,
        memory_amount_bytes=total_memory,
        gpu_card_indices=card_indices,
        gpu_memory_amount_bytes=top_card_memory,
        gpu_max_power_watts=max_power,
    )

def __create_cgroup_resource_info(self):
Expand All @@ -53,4 +54,5 @@ def __create_cgroup_resource_info(self):
memory_amount_bytes=cgroup_monitor.get_memory_limit_in_bytes(),
gpu_card_indices=self.__gpu_card_indices_provider.get(),
gpu_memory_amount_bytes=self.__gpu_monitor.get_top_card_memory_in_bytes(),
gpu_max_power_watts=self.__gpu_monitor.get_card_max_power_rating(),
)
15 changes: 0 additions & 15 deletions src/neptune/internal/hardware/gpu/__init__.py

This file was deleted.

73 changes: 0 additions & 73 deletions src/neptune/internal/hardware/gpu/gpu_monitor.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@

from neptune.common.hardware.gauges.gauge_factory import GaugeFactory
from neptune.common.hardware.gauges.gauge_mode import GaugeMode
from neptune.common.hardware.gpu.gpu_monitor import GPUMonitor
from neptune.common.hardware.metrics.metrics_factory import MetricsFactory
from neptune.common.hardware.metrics.reports.metric_reporter import MetricReporter
from neptune.common.hardware.metrics.reports.metric_reporter_factory import MetricReporterFactory
from neptune.common.hardware.resources.system_resource_info_factory import SystemResourceInfoFactory
from neptune.common.hardware.system.system_monitor import SystemMonitor
from neptune.common.utils import in_docker
from neptune.internal.background_job import BackgroundJob
from neptune.internal.hardware.gpu.gpu_monitor import GPUMonitor
from neptune.internal.threading.daemon import Daemon
from neptune.internal.utils.logger import get_logger
from neptune.types.series import FloatSeries
Expand Down
Loading

0 comments on commit f668ad0

Please sign in to comment.