diff --git a/pyproject.toml b/pyproject.toml index 8fd575445..9833f55fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -209,25 +209,6 @@ module = [ "neptune.attributes.utils", "neptune.internal.exceptions", "neptune.internal.utils.git_info", - "neptune.internal.hardware.cgroup.cgroup_filesystem_reader", - "neptune.internal.hardware.cgroup.cgroup_monitor", - "neptune.internal.hardware.gauges.cpu", - "neptune.internal.hardware.gauges.gauge", - "neptune.internal.hardware.gauges.gauge_factory", - "neptune.internal.hardware.gauges.gpu", - "neptune.internal.hardware.gauges.memory", - "neptune.internal.hardware.gpu.gpu_monitor", - "neptune.internal.hardware.metrics.metric", - "neptune.internal.hardware.metrics.metrics_container", - "neptune.internal.hardware.metrics.metrics_factory", - "neptune.internal.hardware.metrics.reports.metric_reporter", - "neptune.internal.hardware.metrics.reports.metric_reporter_factory", - "neptune.internal.hardware.metrics.service.metric_service", - "neptune.internal.hardware.metrics.service.metric_service_factory", - "neptune.internal.hardware.resources.gpu_card_indices_provider", - "neptune.internal.hardware.resources.system_resource_info", - "neptune.internal.hardware.resources.system_resource_info_factory", - "neptune.internal.hardware.system.system_monitor", "neptune.internal.oauth", "neptune.internal.patches.bravado", "neptune.internal.storage.datastream", diff --git a/src/neptune/internal/backends/hosted_file_operations.py b/src/neptune/internal/backends/hosted_file_operations.py index 0fee39f0f..b6ca38325 100644 --- a/src/neptune/internal/backends/hosted_file_operations.py +++ b/src/neptune/internal/backends/hosted_file_operations.py @@ -69,7 +69,6 @@ NeptuneException, UploadedFileChanged, ) -from neptune.internal.hardware.constants import BYTES_IN_ONE_MB from neptune.internal.storage import ( AttributeUploadConfiguration, FileChunk, @@ -88,6 +87,7 @@ from neptune.typing import ProgressBarType logger = get_logger() +BYTES_IN_ONE_MB = 2**20 DEFAULT_CHUNK_SIZE = 5 * BYTES_IN_ONE_MB DEFAULT_UPLOAD_CONFIG = AttributeUploadConfiguration(chunk_size=DEFAULT_CHUNK_SIZE) diff --git a/src/neptune/internal/hardware/__init__.py b/src/neptune/internal/hardware/__init__.py deleted file mode 100644 index b5e585d90..000000000 --- a/src/neptune/internal/hardware/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2022, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/cgroup/__init__.py b/src/neptune/internal/hardware/cgroup/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/cgroup/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/cgroup/cgroup_filesystem_reader.py b/src/neptune/internal/hardware/cgroup/cgroup_filesystem_reader.py deleted file mode 100644 index c8f4020d4..000000000 --- a/src/neptune/internal/hardware/cgroup/cgroup_filesystem_reader.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import re - - -class CGroupFilesystemReader(object): - def __init__(self): - cgroup_memory_dir = self.__cgroup_mount_dir(subsystem="memory") - self.__memory_usage_file = os.path.join(cgroup_memory_dir, "memory.usage_in_bytes") - self.__memory_limit_file = os.path.join(cgroup_memory_dir, "memory.limit_in_bytes") - - cgroup_cpu_dir = self.__cgroup_mount_dir(subsystem="cpu") - self.__cpu_period_file = os.path.join(cgroup_cpu_dir, "cpu.cfs_period_us") - self.__cpu_quota_file = os.path.join(cgroup_cpu_dir, "cpu.cfs_quota_us") - - cgroup_cpuacct_dir = self.__cgroup_mount_dir(subsystem="cpuacct") - self.__cpuacct_usage_file = os.path.join(cgroup_cpuacct_dir, "cpuacct.usage") - - def get_memory_usage_in_bytes(self): - return self.__read_int_file(self.__memory_usage_file) - - def get_memory_limit_in_bytes(self): - return self.__read_int_file(self.__memory_limit_file) - - def get_cpu_quota_micros(self): - return self.__read_int_file(self.__cpu_quota_file) - - def get_cpu_period_micros(self): - return self.__read_int_file(self.__cpu_period_file) - - def get_cpuacct_usage_nanos(self): - return self.__read_int_file(self.__cpuacct_usage_file) - - def __read_int_file(self, filename): - with open(filename) as f: - return int(f.read()) - - def __cgroup_mount_dir(self, subsystem): - """ - :param subsystem: cgroup subsystem like memory, cpu - :return: directory where given subsystem is mounted - """ - with open("/proc/mounts", "r") as f: - for line in f.readlines(): - split_line = re.split(r"\s+", line) - mount_dir = split_line[1] - - if "cgroup" in mount_dir: - dirname = mount_dir.split("/")[-1] - subsystems = dirname.split(",") - if subsystem in subsystems: - return mount_dir - - assert False, 'Mount directory for "{}" subsystem not found'.format(subsystem) diff --git a/src/neptune/internal/hardware/cgroup/cgroup_monitor.py b/src/neptune/internal/hardware/cgroup/cgroup_monitor.py deleted file mode 100644 index 07a282ffc..000000000 --- a/src/neptune/internal/hardware/cgroup/cgroup_monitor.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import time - -from neptune.internal.hardware.cgroup.cgroup_filesystem_reader import CGroupFilesystemReader -from neptune.internal.hardware.system.system_monitor import SystemMonitor - - -class CGroupMonitor(object): - def __init__(self, cgroup_filesystem_reader, system_monitor): - self.__cgroup_filesystem_reader = cgroup_filesystem_reader - self.__system_monitor = system_monitor - - self.__last_cpu_usage_measurement_timestamp_nanos = None - self.__last_cpu_cumulative_usage_nanos = None - - @staticmethod - def create(): - return CGroupMonitor(CGroupFilesystemReader(), SystemMonitor()) - - def get_memory_usage_in_bytes(self): - return self.__cgroup_filesystem_reader.get_memory_usage_in_bytes() - - def get_memory_limit_in_bytes(self): - cgroup_mem_limit = self.__cgroup_filesystem_reader.get_memory_limit_in_bytes() - total_virtual_memory = self.__system_monitor.virtual_memory().total - return min(cgroup_mem_limit, total_virtual_memory) - - def get_cpu_usage_limit_in_cores(self): - cpu_quota_micros = self.__cgroup_filesystem_reader.get_cpu_quota_micros() - - if cpu_quota_micros == -1: - return float(self.__system_monitor.cpu_count()) - else: - cpu_period_micros = self.__cgroup_filesystem_reader.get_cpu_period_micros() - return float(cpu_quota_micros) / float(cpu_period_micros) - - def get_cpu_usage_percentage(self): - current_timestamp_nanos = time.time() * 10**9 - cpu_cumulative_usage_nanos = self.__cgroup_filesystem_reader.get_cpuacct_usage_nanos() - - if self.__first_measurement(): - current_usage = 0.0 - else: - usage_diff = cpu_cumulative_usage_nanos - self.__last_cpu_cumulative_usage_nanos - time_diff = current_timestamp_nanos - self.__last_cpu_usage_measurement_timestamp_nanos - current_usage = float(usage_diff) / float(time_diff) / self.get_cpu_usage_limit_in_cores() * 100.0 - - self.__last_cpu_usage_measurement_timestamp_nanos = current_timestamp_nanos - self.__last_cpu_cumulative_usage_nanos = cpu_cumulative_usage_nanos - - # cgroup cpu usage may slightly exceed the given limit, but we don't want to show it - return self.__clamp(current_usage, lower_limit=0.0, upper_limit=100.0) - - def __first_measurement(self): - return ( - self.__last_cpu_usage_measurement_timestamp_nanos is None or self.__last_cpu_cumulative_usage_nanos is None - ) - - @staticmethod - def __clamp(value, lower_limit, upper_limit): - return max(lower_limit, min(value, upper_limit)) diff --git a/src/neptune/internal/hardware/constants.py b/src/neptune/internal/hardware/constants.py deleted file mode 100644 index 1530fe3a7..000000000 --- a/src/neptune/internal/hardware/constants.py +++ /dev/null @@ -1,19 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -__all__ = ["BYTES_IN_ONE_MB", "BYTES_IN_ONE_GB"] - -BYTES_IN_ONE_MB = 2**20 -BYTES_IN_ONE_GB = 2**30 diff --git a/src/neptune/internal/hardware/gauges/__init__.py b/src/neptune/internal/hardware/gauges/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/gauges/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/gauges/cpu.py b/src/neptune/internal/hardware/gauges/cpu.py deleted file mode 100644 index 50897409a..000000000 --- a/src/neptune/internal/hardware/gauges/cpu.py +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from neptune.internal.hardware.cgroup.cgroup_monitor import CGroupMonitor -from neptune.internal.hardware.gauges.gauge import Gauge -from neptune.internal.hardware.system.system_monitor import SystemMonitor - - -class SystemCpuUsageGauge(Gauge): - def __init__(self): - self.__system_monitor = SystemMonitor() - - def name(self): - return "cpu" - - def value(self): - return self.__system_monitor.cpu_percent() - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __repr__(self): - return str("SystemCpuUsageGauge") - - -class CGroupCpuUsageGauge(Gauge): - def __init__(self): - self.__cgroup_monitor = CGroupMonitor.create() - - def name(self): - return "cpu" - - def value(self): - return self.__cgroup_monitor.get_cpu_usage_percentage() - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __repr__(self): - return str("CGroupCpuUsageGauge") diff --git a/src/neptune/internal/hardware/gauges/gauge.py b/src/neptune/internal/hardware/gauges/gauge.py deleted file mode 100644 index 32b070b89..000000000 --- a/src/neptune/internal/hardware/gauges/gauge.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from abc import ( - ABCMeta, - abstractmethod, -) - - -class Gauge(object): - __metaclass__ = ABCMeta - - @abstractmethod - def name(self): - """ - :return: Gauge name (str). - """ - raise NotImplementedError() - - @abstractmethod - def value(self): - """ - :return: Current value (float). - """ - raise NotImplementedError() diff --git a/src/neptune/internal/hardware/gauges/gauge_factory.py b/src/neptune/internal/hardware/gauges/gauge_factory.py deleted file mode 100644 index fd9b4e53a..000000000 --- a/src/neptune/internal/hardware/gauges/gauge_factory.py +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from neptune.internal.hardware.gauges.cpu import ( - CGroupCpuUsageGauge, - SystemCpuUsageGauge, -) -from neptune.internal.hardware.gauges.gauge_mode import GaugeMode -from neptune.internal.hardware.gauges.gpu import ( - GpuMemoryGauge, - GpuUsageGauge, -) -from neptune.internal.hardware.gauges.memory import ( - CGroupMemoryUsageGauge, - SystemMemoryUsageGauge, -) - - -class GaugeFactory(object): - def __init__(self, gauge_mode): - self.__gauge_mode = gauge_mode - - def create_cpu_usage_gauge(self): - if self.__gauge_mode == GaugeMode.SYSTEM: - return SystemCpuUsageGauge() - elif self.__gauge_mode == GaugeMode.CGROUP: - return CGroupCpuUsageGauge() - else: - raise self.__invalid_gauge_mode_exception() - - def create_memory_usage_gauge(self): - if self.__gauge_mode == GaugeMode.SYSTEM: - return SystemMemoryUsageGauge() - elif self.__gauge_mode == GaugeMode.CGROUP: - return CGroupMemoryUsageGauge() - else: - raise self.__invalid_gauge_mode_exception() - - @staticmethod - def create_gpu_usage_gauge(card_index): - return GpuUsageGauge(card_index=card_index) - - @staticmethod - def create_gpu_memory_gauge(card_index): - return GpuMemoryGauge(card_index=card_index) - - def __invalid_gauge_mode_exception(self): - return ValueError(str("Invalid gauge mode: {}".format(self.__gauge_mode))) diff --git a/src/neptune/internal/hardware/gauges/gauge_mode.py b/src/neptune/internal/hardware/gauges/gauge_mode.py deleted file mode 100644 index ce0a02b16..000000000 --- a/src/neptune/internal/hardware/gauges/gauge_mode.py +++ /dev/null @@ -1,20 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class GaugeMode(object): - SYSTEM = "system" - CGROUP = "cgroup" diff --git a/src/neptune/internal/hardware/gauges/gpu.py b/src/neptune/internal/hardware/gauges/gpu.py deleted file mode 100644 index 057c959f4..000000000 --- a/src/neptune/internal/hardware/gauges/gpu.py +++ /dev/null @@ -1,54 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from neptune.internal.hardware.constants import BYTES_IN_ONE_GB -from neptune.internal.hardware.gauges.gauge import Gauge -from neptune.internal.hardware.gpu.gpu_monitor import GPUMonitor - - -class GpuUsageGauge(Gauge): - def __init__(self, card_index): - self.card_index = card_index - self.__gpu_monitor = GPUMonitor() - - def name(self): - return str(self.card_index) - - def value(self): - return self.__gpu_monitor.get_card_usage_percent(self.card_index) - - def __eq__(self, other): - return self.__class__ == other.__class__ and self.card_index == other.card_index - - def __repr__(self): - return str("GpuUsageGauge") - - -class GpuMemoryGauge(Gauge): - def __init__(self, card_index): - self.card_index = card_index - self.__gpu_monitor = GPUMonitor() - - def name(self): - return str(self.card_index) - - def value(self): - return self.__gpu_monitor.get_card_used_memory_in_bytes(self.card_index) / float(BYTES_IN_ONE_GB) - - def __eq__(self, other): - return self.__class__ == other.__class__ and self.card_index == other.card_index - - def __repr__(self): - return str("GpuMemoryGauge") diff --git a/src/neptune/internal/hardware/gauges/memory.py b/src/neptune/internal/hardware/gauges/memory.py deleted file mode 100644 index 195936e62..000000000 --- a/src/neptune/internal/hardware/gauges/memory.py +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from neptune.internal.hardware.cgroup.cgroup_monitor import CGroupMonitor -from neptune.internal.hardware.constants import BYTES_IN_ONE_GB -from neptune.internal.hardware.gauges.gauge import Gauge -from neptune.internal.hardware.system.system_monitor import SystemMonitor - - -class SystemMemoryUsageGauge(Gauge): - def __init__(self): - self.__system_monitor = SystemMonitor() - - def name(self): - return "ram" - - def value(self): - virtual_mem = self.__system_monitor.virtual_memory() - return (virtual_mem.total - virtual_mem.available) / float(BYTES_IN_ONE_GB) - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __repr__(self): - return str("SystemMemoryUsageGauge") - - -class CGroupMemoryUsageGauge(Gauge): - def __init__(self): - self.__cgroup_monitor = CGroupMonitor.create() - - def name(self): - return "ram" - - def value(self): - return self.__cgroup_monitor.get_memory_usage_in_bytes() / float(BYTES_IN_ONE_GB) - - def __eq__(self, other): - return self.__class__ == other.__class__ - - def __repr__(self): - return str("CGroupMemoryUsageGauge") diff --git a/src/neptune/internal/hardware/gpu/__init__.py b/src/neptune/internal/hardware/gpu/__init__.py deleted file mode 100644 index b5e585d90..000000000 --- a/src/neptune/internal/hardware/gpu/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2022, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/gpu/gpu_monitor.py b/src/neptune/internal/hardware/gpu/gpu_monitor.py deleted file mode 100644 index ab683b19e..000000000 --- a/src/neptune/internal/hardware/gpu/gpu_monitor.py +++ /dev/null @@ -1,73 +0,0 @@ -# -# Copyright (c) 2022, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -__all__ = ["GPUMonitor"] - -from neptune.internal.utils.logger import get_logger -from neptune.vendor.pynvml import ( - NVMLError, - nvmlDeviceGetCount, - nvmlDeviceGetHandleByIndex, - nvmlDeviceGetMemoryInfo, - nvmlDeviceGetUtilizationRates, - nvmlInit, -) - -_logger = get_logger() - - -class GPUMonitor(object): - - nvml_error_printed = False - - def get_card_count(self): - return self.__nvml_get_or_else(nvmlDeviceGetCount, default=0) - - def get_card_usage_percent(self, card_index): - return self.__nvml_get_or_else( - lambda: float(nvmlDeviceGetUtilizationRates(nvmlDeviceGetHandleByIndex(card_index)).gpu) - ) - - def get_card_used_memory_in_bytes(self, card_index): - return self.__nvml_get_or_else(lambda: nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(card_index)).used) - - def get_top_card_memory_in_bytes(self): - def read_top_card_memory_in_bytes(): - return self.__nvml_get_or_else( - lambda: [ - nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(card_index)).total - for card_index in range(nvmlDeviceGetCount()) - ], - default=0, - ) - - memory_per_card = read_top_card_memory_in_bytes() - if not memory_per_card: - return 0 - return max(memory_per_card) - - def __nvml_get_or_else(self, getter, default=None): - try: - nvmlInit() - return getter() - except NVMLError as e: - if not GPUMonitor.nvml_error_printed: - warning = ( - "Info (NVML): %s. GPU usage metrics may not be reported. For more information, " - "see https://docs.neptune.ai/help/nvml_error/" - ) - _logger.warning(warning, e) - GPUMonitor.nvml_error_printed = True - return default diff --git a/src/neptune/internal/hardware/hardware_metric_reporting_job.py b/src/neptune/internal/hardware/hardware_metric_reporting_job.py deleted file mode 100644 index bb6773e03..000000000 --- a/src/neptune/internal/hardware/hardware_metric_reporting_job.py +++ /dev/null @@ -1,122 +0,0 @@ -# -# Copyright (c) 2022, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -__all__ = ["HardwareMetricReportingJob"] - -import os -import time -from itertools import groupby -from typing import ( - TYPE_CHECKING, - Dict, - Optional, -) - -from neptune.internal.background_job import BackgroundJob -from neptune.internal.hardware.gauges.gauge_factory import GaugeFactory -from neptune.internal.hardware.gauges.gauge_mode import GaugeMode -from neptune.internal.hardware.gpu.gpu_monitor import GPUMonitor -from neptune.internal.hardware.metrics.metrics_factory import MetricsFactory -from neptune.internal.hardware.metrics.reports.metric_reporter import MetricReporter -from neptune.internal.hardware.metrics.reports.metric_reporter_factory import MetricReporterFactory -from neptune.internal.hardware.resources.system_resource_info_factory import SystemResourceInfoFactory -from neptune.internal.hardware.system.system_monitor import SystemMonitor -from neptune.internal.threading.daemon import Daemon -from neptune.internal.utils.logger import get_logger -from neptune.internal.utils.utils import in_docker -from neptune.types.series import FloatSeries - -if TYPE_CHECKING: - from neptune.objects import NeptuneObject - -_logger = get_logger() - - -class HardwareMetricReportingJob(BackgroundJob): - def __init__(self, period: float = 10, attribute_namespace: str = "monitoring"): - self._period = period - self._thread = None - self._started = False - self._gauges_in_resource: Dict[str, int] = dict() - self._attribute_namespace = attribute_namespace - - def start(self, container: "NeptuneObject"): - gauge_mode = GaugeMode.CGROUP if in_docker() else GaugeMode.SYSTEM - system_resource_info = SystemResourceInfoFactory( - system_monitor=SystemMonitor(), - gpu_monitor=GPUMonitor(), - os_environ=os.environ, - ).create(gauge_mode=gauge_mode) - gauge_factory = GaugeFactory(gauge_mode=gauge_mode) - metrics_factory = MetricsFactory(gauge_factory=gauge_factory, system_resource_info=system_resource_info) - metrics_container = metrics_factory.create_metrics_container() - metric_reporter = MetricReporterFactory(time.time()).create(metrics=metrics_container.metrics()) - - for metric in metrics_container.metrics(): - self._gauges_in_resource[metric.resource_type] = len(metric.gauges) - - for metric in metrics_container.metrics(): - for gauge in metric.gauges: - path = self.get_attribute_name(metric.resource_type, gauge.name()) - if not container.get_attribute(path): - container[path] = FloatSeries([], min=metric.min_value, max=metric.max_value, unit=metric.unit) - - self._thread = self.ReportingThread(self, self._period, container, metric_reporter) - self._thread.start() - self._started = True - - def stop(self): - if not self._started: - return - self._thread.interrupt() - - def pause(self): - self._thread.pause() - - def resume(self): - self._thread.resume() - - def join(self, seconds: Optional[float] = None): - if not self._started: - return - self._thread.join(seconds) - - def get_attribute_name(self, resource_type, gauge_name) -> str: - gauges_count = self._gauges_in_resource.get(resource_type, None) - if gauges_count is None or gauges_count != 1: - return "{}/{}_{}".format(self._attribute_namespace, resource_type, gauge_name).lower() - return "{}/{}".format(self._attribute_namespace, resource_type).lower() - - class ReportingThread(Daemon): - def __init__( - self, - outer: "HardwareMetricReportingJob", - period: float, - container: "NeptuneObject", - metric_reporter: MetricReporter, - ): - super().__init__(sleep_time=period, name="NeptuneReporting") - self._outer = outer - self._container = container - self._metric_reporter = metric_reporter - - def work(self) -> None: - metric_reports = self._metric_reporter.report(time.time()) - for report in metric_reports: - for gauge_name, metric_values in groupby(report.values, lambda value: value.gauge_name): - attr = self._container[self._outer.get_attribute_name(report.metric.resource_type, gauge_name)] - # TODO: Avoid loop - for metric_value in metric_values: - attr.log(value=metric_value.value, timestamp=metric_value.timestamp) diff --git a/src/neptune/internal/hardware/metrics/__init__.py b/src/neptune/internal/hardware/metrics/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/metrics/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/metrics/metric.py b/src/neptune/internal/hardware/metrics/metric.py deleted file mode 100644 index b54ed808a..000000000 --- a/src/neptune/internal/hardware/metrics/metric.py +++ /dev/null @@ -1,99 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class Metric(object): - def __init__( - self, - name, - description, - resource_type, - unit, - min_value, - max_value, - gauges, - internal_id=None, - ): - self.__internal_id = internal_id - self.__name = name - self.__description = description - self.__resource_type = resource_type - self.__unit = unit - self.__min_value = min_value - self.__max_value = max_value - self.__gauges = gauges - - @property - def internal_id(self): - return self.__internal_id - - @internal_id.setter - def internal_id(self, value): - self.__internal_id = value - - @property - def name(self): - return self.__name - - @property - def description(self): - return self.__description - - @property - def resource_type(self): - return self.__resource_type - - @property - def unit(self): - return self.__unit - - @property - def min_value(self): - return self.__min_value - - @property - def max_value(self): - return self.__max_value - - @property - def gauges(self): - return self.__gauges - - def __repr__(self): - return ( - "Metric(internal_id={}, name={}, description={}, resource_type={}, unit={}, min_value={}, " - "max_value={}, gauges={})" - ).format( - self.internal_id, - self.name, - self.description, - self.resource_type, - self.unit, - self.min_value, - self.max_value, - self.gauges, - ) - - def __eq__(self, other): - return self.__class__ == other.__class__ and repr(self) == repr(other) - - -class MetricResourceType(object): - CPU = "CPU" - RAM = "MEMORY" - GPU = "GPU" - GPU_RAM = "GPU_MEMORY" - OTHER = "OTHER" diff --git a/src/neptune/internal/hardware/metrics/metrics_container.py b/src/neptune/internal/hardware/metrics/metrics_container.py deleted file mode 100644 index f7d90a9b2..000000000 --- a/src/neptune/internal/hardware/metrics/metrics_container.py +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class MetricsContainer(object): - def __init__(self, cpu_usage_metric, memory_metric, gpu_usage_metric, gpu_memory_metric): - self.cpu_usage_metric = cpu_usage_metric - self.memory_metric = memory_metric - self.gpu_usage_metric = gpu_usage_metric - self.gpu_memory_metric = gpu_memory_metric - - def metrics(self): - return [ - metric - for metric in [ - self.cpu_usage_metric, - self.memory_metric, - self.gpu_usage_metric, - self.gpu_memory_metric, - ] - if metric is not None - ] diff --git a/src/neptune/internal/hardware/metrics/metrics_factory.py b/src/neptune/internal/hardware/metrics/metrics_factory.py deleted file mode 100644 index 666d6b23e..000000000 --- a/src/neptune/internal/hardware/metrics/metrics_factory.py +++ /dev/null @@ -1,92 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from neptune.internal.hardware.constants import BYTES_IN_ONE_GB -from neptune.internal.hardware.metrics.metric import ( - Metric, - MetricResourceType, -) -from neptune.internal.hardware.metrics.metrics_container import MetricsContainer - - -class MetricsFactory(object): - def __init__(self, gauge_factory, system_resource_info): - self.__gauge_factory = gauge_factory - self.__system_resource_info = system_resource_info - - def create_metrics_container(self): - cpu_usage_metric = self.__create_cpu_usage_metric() - memory_metric = self.__create_memory_metric() - - has_gpu = self.__system_resource_info.has_gpu() - gpu_usage_metric = self.__create_gpu_usage_metric() if has_gpu else None - gpu_memory_metric = self.__create_gpu_memory_metric() if has_gpu else None - - return MetricsContainer( - cpu_usage_metric=cpu_usage_metric, - memory_metric=memory_metric, - gpu_usage_metric=gpu_usage_metric, - gpu_memory_metric=gpu_memory_metric, - ) - - def __create_cpu_usage_metric(self): - return Metric( - name="CPU - usage", - description="average of all cores", - resource_type=MetricResourceType.CPU, - unit="%", - min_value=0.0, - max_value=100.0, - gauges=[self.__gauge_factory.create_cpu_usage_gauge()], - ) - - def __create_memory_metric(self): - return Metric( - name="RAM", - description="", - resource_type=MetricResourceType.RAM, - unit="GB", - min_value=0.0, - max_value=self.__system_resource_info.memory_amount_bytes / float(BYTES_IN_ONE_GB), - gauges=[self.__gauge_factory.create_memory_usage_gauge()], - ) - - def __create_gpu_usage_metric(self): - return Metric( - name="GPU - usage", - description="{} cards".format(self.__system_resource_info.gpu_card_count), - resource_type=MetricResourceType.GPU, - unit="%", - min_value=0.0, - max_value=100.0, - gauges=[ - self.__gauge_factory.create_gpu_usage_gauge(card_index=card_index) - for card_index in self.__system_resource_info.gpu_card_indices - ], - ) - - def __create_gpu_memory_metric(self): - return Metric( - name="GPU - memory", - description="{} cards".format(self.__system_resource_info.gpu_card_count), - resource_type=MetricResourceType.GPU_RAM, - unit="GB", - min_value=0.0, - max_value=self.__system_resource_info.gpu_memory_amount_bytes / float(BYTES_IN_ONE_GB), - gauges=[ - self.__gauge_factory.create_gpu_memory_gauge(card_index=card_index) - for card_index in self.__system_resource_info.gpu_card_indices - ], - ) diff --git a/src/neptune/internal/hardware/metrics/reports/__init__.py b/src/neptune/internal/hardware/metrics/reports/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/metrics/reports/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/metrics/reports/metric_report.py b/src/neptune/internal/hardware/metrics/reports/metric_report.py deleted file mode 100644 index c028e0e67..000000000 --- a/src/neptune/internal/hardware/metrics/reports/metric_report.py +++ /dev/null @@ -1,21 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections import namedtuple - -MetricReport = namedtuple("MetricReport", ["metric", "values"]) - -MetricValue = namedtuple("MetricValue", ["timestamp", "running_time", "gauge_name", "value"]) diff --git a/src/neptune/internal/hardware/metrics/reports/metric_reporter.py b/src/neptune/internal/hardware/metrics/reports/metric_reporter.py deleted file mode 100644 index 8dee14409..000000000 --- a/src/neptune/internal/hardware/metrics/reports/metric_reporter.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from neptune.internal.hardware.metrics.reports.metric_report import ( - MetricReport, - MetricValue, -) - - -class MetricReporter(object): - def __init__(self, metrics, reference_timestamp): - self.__metrics = metrics - self.__reference_timestamp = reference_timestamp - - def report(self, timestamp): - """ - :param timestamp: Time of measurement (float, seconds since Epoch). - :return: list[MetricReport] - """ - return [ - MetricReport( - metric=metric, - values=[x for x in [self.__metric_value_for_gauge(gauge, timestamp) for gauge in metric.gauges] if x], - ) - for metric in self.__metrics - ] - - def __metric_value_for_gauge(self, gauge, timestamp): - value = gauge.value() - return ( - MetricValue( - timestamp=timestamp, - running_time=timestamp - self.__reference_timestamp, - gauge_name=gauge.name(), - value=value, - ) - if value - else None - ) diff --git a/src/neptune/internal/hardware/metrics/reports/metric_reporter_factory.py b/src/neptune/internal/hardware/metrics/reports/metric_reporter_factory.py deleted file mode 100644 index ab1d88576..000000000 --- a/src/neptune/internal/hardware/metrics/reports/metric_reporter_factory.py +++ /dev/null @@ -1,24 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from neptune.internal.hardware.metrics.reports.metric_reporter import MetricReporter - - -class MetricReporterFactory(object): - def __init__(self, reference_timestamp): - self.__reference_timestamp = reference_timestamp - - def create(self, metrics): - return MetricReporter(metrics=metrics, reference_timestamp=self.__reference_timestamp) diff --git a/src/neptune/internal/hardware/metrics/service/__init__.py b/src/neptune/internal/hardware/metrics/service/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/metrics/service/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/metrics/service/metric_service.py b/src/neptune/internal/hardware/metrics/service/metric_service.py deleted file mode 100644 index dda6cdd08..000000000 --- a/src/neptune/internal/hardware/metrics/service/metric_service.py +++ /dev/null @@ -1,27 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class MetricService(object): - def __init__(self, backend, metric_reporter, experiment, metrics_container): - self.__backend = backend - self.__metric_reporter = metric_reporter - self.experiment = experiment - self.metrics_container = metrics_container - - def report_and_send(self, timestamp): - metric_reports = self.__metric_reporter.report(timestamp) - self.__backend.send_hardware_metric_reports(self.experiment, self.metrics_container.metrics(), metric_reports) diff --git a/src/neptune/internal/hardware/metrics/service/metric_service_factory.py b/src/neptune/internal/hardware/metrics/service/metric_service_factory.py deleted file mode 100644 index 11ade5a18..000000000 --- a/src/neptune/internal/hardware/metrics/service/metric_service_factory.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from neptune.internal.hardware.gauges.gauge_factory import GaugeFactory -from neptune.internal.hardware.gpu.gpu_monitor import GPUMonitor -from neptune.internal.hardware.metrics.metrics_factory import MetricsFactory -from neptune.internal.hardware.metrics.reports.metric_reporter_factory import MetricReporterFactory -from neptune.internal.hardware.metrics.service.metric_service import MetricService -from neptune.internal.hardware.resources.system_resource_info_factory import SystemResourceInfoFactory -from neptune.internal.hardware.system.system_monitor import SystemMonitor - - -class MetricServiceFactory(object): - def __init__(self, backend, os_environ): - self.__backend = backend - self.__os_environ = os_environ - - def create(self, gauge_mode, experiment, reference_timestamp): - system_resource_info = SystemResourceInfoFactory( - system_monitor=SystemMonitor(), - gpu_monitor=GPUMonitor(), - os_environ=self.__os_environ, - ).create(gauge_mode=gauge_mode) - - gauge_factory = GaugeFactory(gauge_mode=gauge_mode) - metrics_factory = MetricsFactory(gauge_factory=gauge_factory, system_resource_info=system_resource_info) - metrics_container = metrics_factory.create_metrics_container() - - for metric in metrics_container.metrics(): - metric.internal_id = self.__backend.create_hardware_metric(experiment, metric) - - metric_reporter = MetricReporterFactory(reference_timestamp).create(metrics=metrics_container.metrics()) - - return MetricService( - backend=self.__backend, - metric_reporter=metric_reporter, - experiment=experiment, - metrics_container=metrics_container, - ) diff --git a/src/neptune/internal/hardware/resources/__init__.py b/src/neptune/internal/hardware/resources/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/resources/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/resources/gpu_card_indices_provider.py b/src/neptune/internal/hardware/resources/gpu_card_indices_provider.py deleted file mode 100644 index 4c1d73ea8..000000000 --- a/src/neptune/internal/hardware/resources/gpu_card_indices_provider.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re - - -class GPUCardIndicesProvider(object): - def __init__(self, cuda_visible_devices, gpu_card_count): - self.__cuda_visible_devices = cuda_visible_devices - self.__gpu_card_count = gpu_card_count - self.__cuda_visible_devices_regex = r"^-?\d+(,-?\d+)*$" - - def get(self): - if self.__is_cuda_visible_devices_correct(): - return self.__gpu_card_indices_from_cuda_visible_devices() - else: - return list(range(self.__gpu_card_count)) - - def __is_cuda_visible_devices_correct(self): - return self.__cuda_visible_devices is not None and re.match( - self.__cuda_visible_devices_regex, self.__cuda_visible_devices - ) - - def __gpu_card_indices_from_cuda_visible_devices(self): - correct_indices = [] - - # According to CUDA Toolkit specification. - # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars - for gpu_index_str in self.__cuda_visible_devices.split(","): - gpu_index = int(gpu_index_str) - if 0 <= gpu_index < self.__gpu_card_count: - correct_indices.append(gpu_index) - else: - break - - return list(set(correct_indices)) diff --git a/src/neptune/internal/hardware/resources/system_resource_info.py b/src/neptune/internal/hardware/resources/system_resource_info.py deleted file mode 100644 index 9c5cd0ba6..000000000 --- a/src/neptune/internal/hardware/resources/system_resource_info.py +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class SystemResourceInfo(object): - def __init__( - self, - cpu_core_count, - memory_amount_bytes, - gpu_card_indices, - gpu_memory_amount_bytes, - ): - self.__cpu_core_count = cpu_core_count - self.__memory_amount_bytes = memory_amount_bytes - self.__gpu_card_indices = gpu_card_indices - self.__gpu_memory_amount_bytes = gpu_memory_amount_bytes - - @property - def cpu_core_count(self): - return self.__cpu_core_count - - @property - def memory_amount_bytes(self): - return self.__memory_amount_bytes - - @property - def gpu_card_count(self): - return len(self.__gpu_card_indices) - - @property - def gpu_card_indices(self): - return self.__gpu_card_indices - - @property - def gpu_memory_amount_bytes(self): - return self.__gpu_memory_amount_bytes - - def has_gpu(self): - return self.gpu_card_count > 0 - - def __repr__(self): - return str(self.__dict__) diff --git a/src/neptune/internal/hardware/resources/system_resource_info_factory.py b/src/neptune/internal/hardware/resources/system_resource_info_factory.py deleted file mode 100644 index b05b8bf56..000000000 --- a/src/neptune/internal/hardware/resources/system_resource_info_factory.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from neptune.internal.hardware.cgroup.cgroup_monitor import CGroupMonitor -from neptune.internal.hardware.gauges.gauge_mode import GaugeMode -from neptune.internal.hardware.resources.gpu_card_indices_provider import GPUCardIndicesProvider -from neptune.internal.hardware.resources.system_resource_info import SystemResourceInfo - - -class SystemResourceInfoFactory(object): - def __init__(self, system_monitor, gpu_monitor, os_environ): - self.__system_monitor = system_monitor - self.__gpu_monitor = gpu_monitor - self.__gpu_card_indices_provider = GPUCardIndicesProvider( - cuda_visible_devices=os_environ.get("CUDA_VISIBLE_DEVICES"), - gpu_card_count=self.__gpu_monitor.get_card_count(), - ) - - def create(self, gauge_mode): - if gauge_mode == GaugeMode.SYSTEM: - return self.__create_whole_system_resource_info() - elif gauge_mode == GaugeMode.CGROUP: - return self.__create_cgroup_resource_info() - else: - raise ValueError(str("Unknown gauge mode: {}".format(gauge_mode))) - - def __create_whole_system_resource_info(self): - return SystemResourceInfo( - cpu_core_count=float(self.__system_monitor.cpu_count()), - memory_amount_bytes=self.__system_monitor.virtual_memory().total, - gpu_card_indices=self.__gpu_card_indices_provider.get(), - gpu_memory_amount_bytes=self.__gpu_monitor.get_top_card_memory_in_bytes(), - ) - - def __create_cgroup_resource_info(self): - cgroup_monitor = CGroupMonitor.create() - - return SystemResourceInfo( - cpu_core_count=cgroup_monitor.get_cpu_usage_limit_in_cores(), - memory_amount_bytes=cgroup_monitor.get_memory_limit_in_bytes(), - gpu_card_indices=self.__gpu_card_indices_provider.get(), - gpu_memory_amount_bytes=self.__gpu_monitor.get_top_card_memory_in_bytes(), - ) diff --git a/src/neptune/internal/hardware/system/__init__.py b/src/neptune/internal/hardware/system/__init__.py deleted file mode 100644 index 62a86a5be..000000000 --- a/src/neptune/internal/hardware/system/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/src/neptune/internal/hardware/system/system_monitor.py b/src/neptune/internal/hardware/system/system_monitor.py deleted file mode 100644 index ec6a2dea3..000000000 --- a/src/neptune/internal/hardware/system/system_monitor.py +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) 2019, Neptune Labs Sp. z o.o. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -try: - import psutil - - PSUTIL_INSTALLED = True -except ImportError: - PSUTIL_INSTALLED = False - - -class SystemMonitor(object): - @staticmethod - def cpu_count(): - return psutil.cpu_count() - - @staticmethod - def cpu_percent(): - return psutil.cpu_percent() - - @staticmethod - def virtual_memory(): - return psutil.virtual_memory() diff --git a/src/neptune/objects/run.py b/src/neptune/objects/run.py index 9fa2c5fee..81f6f7547 100644 --- a/src/neptune/objects/run.py +++ b/src/neptune/objects/run.py @@ -50,7 +50,6 @@ ) from neptune.internal.backends.api_model import ApiExperiment from neptune.internal.container_type import ContainerType -from neptune.internal.hardware.hardware_metric_reporting_job import HardwareMetricReportingJob from neptune.internal.id_formats import QualifiedName from neptune.internal.init.parameters import ( ASYNC_LAG_THRESHOLD, @@ -456,9 +455,6 @@ def _get_background_jobs(self) -> List["BackgroundJob"]: if self._capture_stderr: background_jobs.append(StderrCaptureBackgroundJob(attribute_name=self._stderr_path)) - if self._capture_hardware_metrics: - background_jobs.append(HardwareMetricReportingJob(attribute_namespace=self._monitoring_namespace)) - if self._capture_traceback: background_jobs.append( TracebackJob(path=f"{self._monitoring_namespace}/traceback", fail_on_exception=self._fail_on_exception)