Making a start on separating the dataset from the benchmark workload

zarr-developers · Sep 29, 2023 · d126413
1 parent da234bb
commit d126413
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 54 deletions.
diff --git a/examples/read_numpy_file.py b/examples/read_numpy_file.py
@@ -1,10 +1,11 @@
 import numpy as np
+from perfcapture.dataset import Dataset
 
 from perfcapture.workload import Workload
 
 
-class ReadNumpyFile(Workload):
-    def prepare_dataset(self):
+class NumpyDataset(Dataset):
+    def prepare(self) -> None:
         """Create simple numpy file."""
         # Generate an array of random numbers
         rng = np.random.default_rng()
@@ -20,9 +21,11 @@ def prepare_dataset(self):
 
         # Save array to temporary file
         with open(self.path_to_dataset, mode="wb") as fh:
-            np.save(fh, array)
+            np.save(fh, array)    
 
-    def run_workload(self):
+
+class ReadNumpyFile(Workload):
+    def run(self):
         """Load numpy file into RAM."""
         for _ in range(100):
-            np.load(self.path_to_dataset)
+            np.load(self.dataset.path_to_dataset)
diff --git a/scripts/perfcapture.py b/scripts/perfcapture.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import pathlib
 import typer
 from typing_extensions import Annotated
 from typing import Optional
@@ -8,16 +9,35 @@
 
 @app.command()
 def bench(
-    workloads: Annotated[Optional[str], typer.Argument()] = None,
-    do_not_clear_cache: Annotated[
+    data_path: Annotated[
+        pathlib.Path,
+        typer.Argument(help="The directory for storing the data which the benchmarks read from.")
+    ],
+    recipe_dir: Annotated[
+        pathlib.Path,
+        typer.Argument(help=(
+            "The directory containing the code which defines the Workloads and Datasets."))
+    ] = pathlib.Path("."),
+    workloads: Annotated[
+        Optional[str], 
+        typer.Argument(help=(
+            "Space-separated list of workload classes to run. If not set, all workloads found in"
+            " recipe_dir will be run."))
+        ] = None,
+    keep_cache: Annotated[
         bool, 
         typer.Option(
-            "--do-not-clear-cache",
+            "--keep-cache",
             help="Set this flag to prevent `vmtouch -e` being called before each benchmark.",
             )
         ] = False,
-    ):
-    pass
+    ) -> None:
+
+    all_workloads = descover_workloads(recipe_dir)
+    all_datasets = set([workload.dataset for workload in all_workloads])
+    for dataset in all_datasets:
+        if not dataset.already_exists():
+            dataset.prepare(base_data_path=data_path)
 
 if __name__ == "__main__":
     app()
diff --git a/src/perfcapture/dataset.py b/src/perfcapture/dataset.py
@@ -0,0 +1,45 @@
+import abc
+import pathlib
+
+from perfcapture.utils import path_not_empty
+
+
+class Dataset(abc.ABC):
+    """Inherit from `Dataset` to implement a new benchmark dataset.
+    
+    Datasets are read by `Workload`s.
+    """
+    def __init__(self, base_data_path: pathlib.Path):
+        self.path_to_dataset = base_data_path / self.name
+
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """The name of this dataset. Must be unique amongst all the datasets used in the benchmark suite."""
+        pass
+
+    @abc.abstractmethod
+    def prepare(self) -> None:
+        """Every concrete `Dataset` must implement this method to create its data on disk.
+        
+        Store your dataset at `self.path_to_dataset`.
+        
+        Every time the workload runner executes, it runs this pseudocode:
+
+            if not dataset.already_exists():
+                dataset.prepare()
+        """
+        pass
+
+    def already_exists(self) -> bool:
+        """Returns True if the dataset is already on disk."""
+        path_is_dir_which_is_not_empty = (
+            self.path_to_dataset.exists() and
+            self.path_to_dataset.is_dir() and
+            path_not_empty(self.path_to_dataset)
+        )
+        path_is_single_file = (
+            self.path_to_dataset.exists() and
+            not self.path_to_dataset.is_dir()
+        )
+        return path_is_dir_which_is_not_empty or path_is_single_file
diff --git a/src/perfcapture/workload.py b/src/perfcapture/workload.py
@@ -1,54 +1,16 @@
 import abc
 import pathlib
+from perfcapture.dataset import Dataset
 
 from perfcapture.utils import path_not_empty
 
 
-"""
-TODO: Workload and Dataset should be separate classes.
-This is so single Dataset can be used by multiple Workloads.
-"""
-
 class Workload(abc.ABC):
-    """To implement a new benchmark workload, inherit from `Workload`.
-    
-    Most folks will want to override just two methods:
-    
-    - prepare_dataset
-    - run_workload
-    """
-
-    def __init__(self, path_to_dataset: pathlib.Path):
-        self.path_to_dataset = path_to_dataset
+    """Inherit from `Workload` to implement a new benchmark workload."""
 
-    def prepare_dataset(self) -> None:
-        """Override this method if your workload needs to prepare a local dataset.
-        
-        Every time the workload runner executes, it runs this pseudocode
-        before calling `run_workload`:
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
 
-            if not workload.dataset_already_exists():
-                workload.prepare_dataset()
-
-        Store your dataset at `self.path_to_dataset`.
-        """
-        pass
-
     @abc.abstractmethod
-    def run_workload(self) -> dict[str, object]:
-        """Must be overridden. This method implements the workload.
-        """
-
-    def dataset_already_exists(self) -> bool:
-        """Returns True if the dataset is already on disk.
-        """
-        path_is_dir_which_is_not_empty = (
-            self.path_to_dataset.exists() and
-            self.path_to_dataset.is_dir() and
-            path_not_empty(self.path_to_dataset)
-        )
-        path_is_single_file = (
-            self.path_to_dataset.exists() and
-            not self.path_to_dataset.is_dir()
-        )
-        return path_is_dir_which_is_not_empty or path_is_single_file
+    def run(self) -> dict[str, object]:
+        """Must be overridden to implement the workload."""