From 36f96209b8f1675d00f7f518426d266c84cf8c59 Mon Sep 17 00:00:00 2001
From: Jack Kelly
Date: Wed, 18 Oct 2023 13:02:33 +0100
Subject: [PATCH] Only collect DiskIO data for the disk used for benchmarking

---
 src/perfcapture/metrics.py              |  1 +
 src/perfcapture/performance_counters.py | 67 ++++++++++++++++++++-----
 src/perfcapture/workload.py             |  2 +-
 3 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/src/perfcapture/metrics.py b/src/perfcapture/metrics.py
index c3a538e..da78ef7 100644
--- a/src/perfcapture/metrics.py
+++ b/src/perfcapture/metrics.py
@@ -1,3 +1,4 @@
+import pathlib
 from dataclasses import dataclass

diff --git a/src/perfcapture/performance_counters.py b/src/perfcapture/performance_counters.py
index 3b5fb56..2dac2da 100644
--- a/src/perfcapture/performance_counters.py
+++ b/src/perfcapture/performance_counters.py
@@ -1,6 +1,7 @@
 import abc
+import pathlib
 from collections import namedtuple
-from dataclasses import dataclass, field
+from dataclasses import InitVar, dataclass, field
 from datetime import datetime

 import numpy as np
@@ -18,13 +19,15 @@ class _PerfCounterABC(abc.ABC):
     Usage:
     1. Init PerfCounter subclass when we start benchmarking a specific
        combination of `Workload` and `Dataset`.
-    2. Call `start_timing_iteration()` at the start of each iteration.
-    3. Call `stop_timing_iteration()` at the end of each iteration.
-    4. Call `get_results()` at the end of the `run`, to get a `pd.DataFrame` of results.
+    2. Set `PerfCounter.dataset_path` if this perf counter needs to know the dataset path.
+    3. Call `start_timing_run()` at the start of each run.
+    4. Call `stop_timing_run()` at the end of each run.
+    5. Call `get_results()` at the end of the `run`, to get a `pd.DataFrame` of results.
     """
     def __init__(self) -> None:
         self._data_per_run = pd.DataFrame(columns=[self.name])
         self._data_per_run.index.name = "run_ID"
+        self._dataset_path: pathlib.Path | None = None

     def start_timing_run(self) -> None:
         pass
@@ -41,16 +44,26 @@ def get_results(self) -> pd.DataFrame:
     def name(self) -> str:
         return self.__class__.__name__

+    @property
+    def dataset_path(self) -> pathlib.Path | None:
+        return self._dataset_path
+
+    @dataset_path.setter
+    def dataset_path(self, dataset_path: pathlib.Path) -> None:
+        self._dataset_path = dataset_path


 @dataclass
 class PerfCounterManager:
     """Simple manager for multiple performance counters."""
+    dataset_path: InitVar[pathlib.Path]
     counters: list[_PerfCounterABC] = field(
         default_factory=lambda: [Runtime(), BandwidthToNumpy(), DiskIO()]
     )

-    def __post_init__(self) -> None:
+    def __post_init__(self, dataset_path: pathlib.Path) -> None:
         self._run_id: int = 0
+        for counter in self.counters:
+            counter.dataset_path = dataset_path

     def start_timing_run(self) -> None:
         self._run_id += 1
@@ -100,9 +113,9 @@ def name(self) -> str:
 class DiskIO(_PerfCounterABC):
     """Record performance of disks.

-    Note that this records performance of all disks on this machine,
-    used by all processes. So, if other processes are using any disk
-    (even a different disk to the disk that you're benchmarking) then
+    Note that this records performance of the specific disk used
+    to store the benchmark datasets, but we record the activity of
+    all processes. So, if other processes are using this disk then
     you'll get misleading results!

    For more information on the fields recorded, please see:
@@ -125,7 +138,17 @@ def __init__(self) -> None:
             ("read_bytes", "write_bytes", "read_time", "write_time", "busy_time"))
         self._data_per_run = pd.DataFrame(dtype=np.int64, columns=columns)
         self._data_per_run.index.name = "run_ID"
-
+
+    @property
+    def dataset_path(self) -> pathlib.Path | None:
+        return super().dataset_path
+
+    @dataset_path.setter
+    def dataset_path(self, dataset_path: pathlib.Path) -> None:
+        self._dataset_path = dataset_path
+        self._dataset_partition_name = _get_partition_name_from_path(dataset_path)
+        print("dataset_partition_name =", self._dataset_partition_name)
+
     def start_timing_run(self) -> None:
         self._disk_counters_at_start_of_run = self._get_disk_io_counters_as_series()

@@ -162,9 +185,9 @@ def stop_timing_run(self, metrics_for_run: MetricsForRun) -> None:
         self._data_per_run.loc[metrics_for_run.run_id] = count_diff

-    @classmethod
-    def _get_disk_io_counters_as_series(cls) -> pd.Series:
-        counters: namedtuple = psutil.disk_io_counters()
+    def _get_disk_io_counters_as_series(self) -> pd.Series:
+        counters_per_disk: dict[str, namedtuple] = psutil.disk_io_counters(perdisk=True)
+        counters: namedtuple = counters_per_disk[self._dataset_partition_name]
         return pd.Series(counters._asdict())

     @property
@@ -181,4 +204,24 @@ def __init__(self) -> None:
     def total_secs_elapsed(self) -> float:
         duration = datetime.now() - self._time_at_start
-        return duration.total_seconds()
\ No newline at end of file
+        return duration.total_seconds()
+
+
+def _get_partition_name_from_path(dataset_path: pathlib.Path) -> str:
+    """Return the device name (e.g. 'sda1') of the partition holding `dataset_path`."""
+    dataset_mount_point = _get_mount_point_from_path(dataset_path)
+    partitions: list[psutil._common.sdiskpart] = psutil.disk_partitions()
+    for partition in partitions:
+        if pathlib.Path(partition.mountpoint) == dataset_mount_point:
+            return pathlib.Path(partition.device).parts[-1]
+    raise RuntimeError(f"Could not find partition for {dataset_path}")
+
+
+def _get_mount_point_from_path(p: pathlib.Path) -> pathlib.Path:
+    """Walk up the directory tree until we reach a mount point."""
+    if len(p.parts) == 0:
+        raise RuntimeError(f"Path '{p}' should not be empty!")
+    elif p.is_mount():
+        return p
+    else:
+        return _get_mount_point_from_path(p.parent)
\ No newline at end of file
diff --git a/src/perfcapture/workload.py b/src/perfcapture/workload.py
index ce0b0d2..c76f275 100644
--- a/src/perfcapture/workload.py
+++ b/src/perfcapture/workload.py
@@ -81,7 +81,7 @@ def run_workloads(
     for workload in workloads:
         for dataset in workload.datasets:
             print(f"Running {workload.name} {workload.n_runs} times on {dataset.name}!")
-            perf_counter = PerfCounterManager()
+            perf_counter = PerfCounterManager(dataset.path)
             for i in range(workload.n_runs):
                 print(f"Run {i+1} of {workload.n_runs}...")
                 if not keep_cache:
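
Note on the InitVar wiring: the manager now takes the dataset path via
`dataclasses.InitVar`, so `__post_init__` can fan it out to every counter
without `dataset_path` becoming a stored field. A minimal, self-contained
sketch of that pattern (the `Manager` name and the `/tmp` path are
illustrative, not taken from perfcapture):

    import pathlib
    from dataclasses import InitVar, dataclass, field

    @dataclass
    class Manager:
        # InitVar: accepted by __init__ and passed to __post_init__,
        # but never stored as a dataclass field.
        dataset_path: InitVar[pathlib.Path]
        counters: list = field(default_factory=list)

        def __post_init__(self, dataset_path: pathlib.Path) -> None:
            # Fan the path out to every counter that wants it.
            for counter in self.counters:
                counter.dataset_path = dataset_path

    m = Manager(pathlib.Path("/tmp"))      # path is consumed by __post_init__
    assert not hasattr(m, "dataset_path")  # ...and is not kept on the instance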
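
The partition lookup can also be exercised outside the benchmark harness.
A rough standalone equivalent, assuming Linux and psutil's public API
(`mount_point_of` and `partition_name_for` are hypothetical helper names,
and the home-directory path is just a convenient example):

    import pathlib
    import psutil

    def mount_point_of(p: pathlib.Path) -> pathlib.Path:
        # "/" is always a mount point, so this loop terminates.
        p = p.resolve()
        while not p.is_mount():
            p = p.parent
        return p

    def partition_name_for(p: pathlib.Path) -> str:
        mount = mount_point_of(p)
        for part in psutil.disk_partitions():
            if pathlib.Path(part.mountpoint) == mount:
                # "/dev/sda1" -> "sda1": the key format used by
                # psutil.disk_io_counters(perdisk=True).
                return pathlib.Path(part.device).parts[-1]
        raise RuntimeError(f"No partition found for {p}")

    name = partition_name_for(pathlib.Path.home())
    print(name, psutil.disk_io_counters(perdisk=True)[name])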