Skip to content

Commit

Permalink
Yay! cli.py now runs the example workload, and creates the example dataset!
Browse files Browse the repository at this point in the history
  • Loading branch information
JackKelly committed Sep 29, 2023
1 parent ef482a6 commit 28aedb0
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 31 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,12 @@ Capture the performance of a computer system whilst running a set of benchmark w
2. Optionally create a virtual Python environment (e.g. with `python -m venv </path/to/venv/>`) and
activate that venv (`source </path/to/venv/>bin/activate`).
3. `pip install -e .`

# Usage

To run the examples:

```
~/dev/perfcapture$ mkdir -p ~/temp/perfcapture_data_path
~/dev/perfcapture$ python scripts/cli.py --data-path ~/temp/perfcapture_data_path --recipe-path examples
```
12 changes: 9 additions & 3 deletions examples/read_numpy_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,18 @@ def prepare(self) -> None:
print("Created array", flush=True)

# Save array to temporary file
with open(self.path_to_dataset, mode="wb") as fh:
with open(self.path, mode="wb") as fh:
np.save(fh, array)


class ReadNumpyFile(Workload):
    """Benchmark workload which repeatedly loads a numpy file from disk."""

    def init_dataset(self) -> Dataset:
        # The dataset this workload reads; prepared once by the runner and
        # re-used across repeats.
        return NumpyDataset()

    def run(self):
        """Load numpy file into RAM."""
        # 100 loads per run() so each timed repeat does a measurable
        # amount of disk I/O.
        for _ in range(100):
            np.load(self.dataset.path)

    @property
    def n_repeats(self) -> int:
        # Repeat the whole run() ten times to get a spread of timings.
        return 10
69 changes: 57 additions & 12 deletions scripts/cli.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,88 @@
#!/usr/bin/env python
import pathlib
import shutil
import subprocess
import sys

import typer
from typing_extensions import Annotated
from typing import Optional

from perfcapture.workload import discover_workloads

app = typer.Typer()


@app.command()
def bench(
    data_path: Annotated[
        pathlib.Path,
        typer.Option(help="The path for storing the data which the benchmarks read from.")
    ],
    recipe_path: Annotated[
        pathlib.Path,
        typer.Option(help=(
            "The path containing the code which defines the Workloads and Datasets."))
    ] = pathlib.Path("."),
    selected_workloads: Annotated[
        Optional[str],
        typer.Option(help=(
            "Space-separated list of workloads to run. If not set, all workloads found in"
            " recipe_path will be run. Use the `name` of each workload."))
    ] = None,
    keep_cache: Annotated[
        bool,
        typer.Option(
            "--keep-cache",
            help="Set this flag to prevent `vmtouch -e` being called before each benchmark.",
        )
    ] = False,
) -> None:
    """Run workload(s) and measure performance.

    If any of the workloads require datasets to be pre-prepared then this script will first
    generate all datasets required by the workload(s). Those datasets will be stored at the
    `data_path`. The time spent creating the datasets will not be recorded. The contents of
    `data_path` will not be removed after running this script. So if you run this script
    multiple times then subsequent runs can make use of the already existing datasets.

    If you update the recipe which specifies the dataset creation then it is up to you to
    manually delete the old dataset on disk.

    vmtouch must be installed if you wish to clear the page cache after each iteration.
    """
    # Sanity checks
    if not data_path.exists():
        sys.exit(f"ERROR! {data_path} does not exist! Please create the directory!")
    if not recipe_path.exists():
        sys.exit(f"ERROR! {recipe_path} does not exist!")
    # Only require vmtouch when we will actually call it (i.e. not --keep-cache).
    if not keep_cache and shutil.which("vmtouch") is None:
        sys.exit(
            "If you want to flush the page cache before each iteration, then please install"
            " vmtouch. Or run with the --keep-cache option, which does not call vmtouch.")

    workloads = discover_workloads(recipe_path)
    print(f"Found {len(workloads)} Workload(s) in {recipe_path}")

    # Filter workloads (if necessary). Materialise a list (NOT a lazy `filter`
    # object) because `workloads` is iterated twice below: once to collect the
    # datasets and once to run the benchmarks. A lazy iterator would be
    # exhausted by the first pass, silently skipping every benchmark.
    if selected_workloads:
        requested_names = selected_workloads.split(" ")
        workloads = [w for w in workloads if w.name in requested_names]

    # Prepare datasets (if necessary).
    all_datasets = {workload.dataset for workload in workloads}
    for dataset in all_datasets:
        dataset.set_path(data_path)
        if not dataset.already_exists():
            dataset.prepare()

    # Run the workloads!
    for workload in workloads:
        print(f"Running {workload.name} {workload.n_repeats} times!", flush=True)
        for _ in range(workload.n_repeats):
            if not keep_cache:
                # Evict the dataset from the page cache so every repeat
                # measures cold-cache performance.
                subprocess.run(
                    ["vmtouch", "-e", workload.dataset.path], capture_output=True, check=True)
            workload.run()
        print(f"  Finished running {workload.name} {workload.n_repeats} times!", flush=True)


if __name__ == "__main__":
    app()
24 changes: 13 additions & 11 deletions src/perfcapture/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,22 @@ class Dataset(abc.ABC):
Datasets are read by `Workload`s.
"""
def __init__(self, base_data_path: pathlib.Path):
self.path_to_dataset = base_data_path / self.name
def set_path(self, base_data_path: pathlib.Path):
    """Record where this dataset lives on disk.

    `self.path` becomes `<base_data_path>/<dataset name>`. `prepare()` is
    expected to store the dataset at `self.path`, and workloads read from it.
    """
    self.path = base_data_path / self.name

@property
def name(self) -> str:
    """The name of this dataset.

    Must be unique amongst all the datasets used in the benchmark suite.
    Defaults to the concrete subclass's class name.
    """
    return self.__class__.__name__

@abc.abstractmethod
def prepare(self) -> None:
"""Override this method if your workload needs to prepare a local dataset.
Store your dataset at `self.path_to_dataset`.
Store your dataset at `self.path`.
Every time the workload runner executes, it runs this pseudocode:
Expand All @@ -34,12 +36,12 @@ def prepare(self) -> None:
def already_exists(self) -> bool:
    """Returns True if the dataset is already on disk.

    A dataset counts as existing when `self.path` is either a non-empty
    directory, or a single (non-directory) file.
    """
    path_is_dir_which_is_not_empty = (
        self.path.exists() and
        self.path.is_dir() and
        path_not_empty(self.path)
    )
    path_is_single_file = (
        self.path.exists() and
        not self.path.is_dir()
    )
    return path_is_dir_which_is_not_empty or path_is_single_file
14 changes: 12 additions & 2 deletions src/perfcapture/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
"""Simple utility functions."""

import importlib.util
import pathlib

import sys

def path_not_empty(path: pathlib.Path) -> bool:
    """Returns True if directory `path` contains at least one entry."""
    # Stop at the first entry: we only need to know whether one exists,
    # not enumerate the whole directory.
    for _ in path.iterdir():
        return True
    return False


def load_module_from_filename(py_filename: pathlib.Path):
    """Dynamically import the Python source file at `py_filename`.

    Returns the freshly loaded module, registered in `sys.modules` under
    the filename's stem.
    """
    name = py_filename.stem
    spec = importlib.util.spec_from_file_location(name, py_filename)
    loaded = importlib.util.module_from_spec(spec)
    # Register before executing, so the module body can refer to itself.
    sys.modules[name] = loaded
    spec.loader.exec_module(loaded)
    return loaded
50 changes: 47 additions & 3 deletions src/perfcapture/workload.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,60 @@
import abc
import inspect
import pathlib
from perfcapture.dataset import Dataset

from perfcapture.utils import path_not_empty
from perfcapture.utils import load_module_from_filename, path_not_empty


class Workload(abc.ABC):
    """Inherit from `Workload` to implement a new benchmark workload."""

    def __init__(self):
        # Each workload owns exactly one Dataset, created via the
        # subclass-supplied `init_dataset` hook.
        self.dataset = self.init_dataset()

    @abc.abstractmethod
    def init_dataset(self) -> Dataset:
        """Initialise and return a concrete Dataset object."""

    @abc.abstractmethod
    def run(self) -> dict[str, object]:
        """Must be overridden to implement the workload."""

    @property
    def name(self) -> str:
        """The name of this workload.

        Must be unique amongst all the workloads used in this benchmark suite.
        Defaults to the concrete subclass's class name.
        """
        return self.__class__.__name__

    @property
    def n_repeats(self) -> int:
        """The number of times to repeat this workload."""
        return 1


def load_workloads_from_filename(py_filename: pathlib.Path) -> list[Workload]:
    """Instantiate every `Workload` subclass defined in `py_filename`."""
    module = load_module_from_filename(py_filename)
    found: list[Workload] = []
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        is_workload_subclass = (
            candidate
            and inspect.isclass(candidate)
            and issubclass(candidate, Workload)
            and candidate is not Workload
        )
        if is_workload_subclass:
            print(f"Instantiating {attr_name}")
            found.append(candidate())
    return found


def discover_workloads(recipe_path: pathlib.Path) -> list[Workload]:
    """Collect Workload instances from every `*.py` file directly inside `recipe_path`."""
    discovered: list[Workload] = []
    for py_file in recipe_path.glob("*.py"):
        discovered.extend(load_workloads_from_filename(py_file))
    return discovered


0 comments on commit 28aedb0

Please sign in to comment.