diff --git a/examples/read_numpy_file.py b/examples/read_numpy_file.py
index f66e75c..ae21536 100644
--- a/examples/read_numpy_file.py
+++ b/examples/read_numpy_file.py
@@ -1,10 +1,11 @@
 import numpy as np
+from perfcapture.dataset import Dataset
 from perfcapture.workload import Workload
 
 
-class ReadNumpyFile(Workload):
-    def prepare_dataset(self):
+class NumpyDataset(Dataset):
+    def prepare(self) -> None:
         """Create simple numpy file."""
         # Generate an array of random numbers
         rng = np.random.default_rng()
@@ -20,9 +21,11 @@ def prepare_dataset(self):
 
         # Save array to temporary file
         with open(self.path_to_dataset, mode="wb") as fh:
-            np.save(fh, array)
+            np.save(fh, array)
 
-    def run_workload(self):
+
+class ReadNumpyFile(Workload):
+    def run(self):
         """Load numpy file into RAM."""
         for _ in range(100):
-            np.load(self.path_to_dataset)
+            np.load(self.dataset.path_to_dataset)
diff --git a/scripts/perfcapture.py b/scripts/perfcapture.py
index 11c7743..ba8d6a6 100755
--- a/scripts/perfcapture.py
+++ b/scripts/perfcapture.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+import pathlib
 import typer
 from typing_extensions import Annotated
 from typing import Optional
@@ -8,16 +9,35 @@
 @app.command()
 def bench(
-    workloads: Annotated[Optional[str], typer.Argument()] = None,
-    do_not_clear_cache: Annotated[
+    data_path: Annotated[
+        pathlib.Path,
+        typer.Argument(help="The directory for storing the data which the benchmarks read from.")
+    ],
+    recipe_dir: Annotated[
+        pathlib.Path,
+        typer.Argument(help=(
+            "The directory containing the code which defines the Workloads and Datasets."))
+    ] = pathlib.Path("."),
+    workloads: Annotated[
+        Optional[str],
+        typer.Argument(help=(
+            "Space-separated list of workload classes to run. If not set, all workloads found in"
+            " recipe_dir will be run."))
+    ] = None,
+    keep_cache: Annotated[
         bool,
         typer.Option(
-            "--do-not-clear-cache",
+            "--keep-cache",
             help="Set this flag to prevent `vmtouch -e` being called before each benchmark.",
         )
     ] = False,
-    ):
-    pass
+    ) -> None:
+
+    all_workloads = discover_workloads(recipe_dir)
+    all_datasets = set([workload.dataset for workload in all_workloads])
+    for dataset in all_datasets:
+        if not dataset.already_exists():
+            dataset.prepare(base_data_path=data_path)
 
 
 if __name__ == "__main__":
     app()
diff --git a/src/perfcapture/dataset.py b/src/perfcapture/dataset.py
new file mode 100644
index 0000000..71b4466
--- /dev/null
+++ b/src/perfcapture/dataset.py
@@ -0,0 +1,45 @@
+import abc
+import pathlib
+
+from perfcapture.utils import path_not_empty
+
+
+class Dataset(abc.ABC):
+    """Inherit from `Dataset` to implement a new benchmark dataset.
+
+    Datasets are read by `Workload`s.
+    """
+    def __init__(self, base_data_path: pathlib.Path):
+        self.path_to_dataset = base_data_path / self.name
+
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """The name of this dataset. Must be unique amongst all the datasets used in the benchmark suite."""
+        pass
+
+    @abc.abstractmethod
+    def prepare(self) -> None:
+        """Override this method to prepare the dataset on the local filesystem.
+
+        Store your dataset at `self.path_to_dataset`.
+
+        Every time the workload runner executes, it runs this pseudocode:
+
+            if not dataset.already_exists():
+                dataset.prepare()
+        """
+        pass
+
+    def already_exists(self) -> bool:
+        """Returns True if the dataset is already on disk."""
+        path_is_dir_which_is_not_empty = (
+            self.path_to_dataset.exists() and
+            self.path_to_dataset.is_dir() and
+            path_not_empty(self.path_to_dataset)
+        )
+        path_is_single_file = (
+            self.path_to_dataset.exists() and
+            not self.path_to_dataset.is_dir()
+        )
+        return path_is_dir_which_is_not_empty or path_is_single_file
\ No newline at end of file
diff --git a/src/perfcapture/workload.py b/src/perfcapture/workload.py
index 3c09e3b..dd2c19e 100644
--- a/src/perfcapture/workload.py
+++ b/src/perfcapture/workload.py
@@ -1,54 +1,16 @@
 import abc
 import pathlib
 
+from perfcapture.dataset import Dataset
 from perfcapture.utils import path_not_empty
 
-"""
-TODO: Workload and Dataset should be separate classes.
-This is so single Dataset can be used by multiple Workloads.
-"""
 
 class Workload(abc.ABC):
-    """To implement a new benchmark workload, inherit from `Workload`.
-
-    Most folks will want to override just two methods:
-
-    - prepare_dataset
-    - run_workload
-    """
-
-    def __init__(self, path_to_dataset: pathlib.Path):
-        self.path_to_dataset = path_to_dataset
+    """Inherit from `Workload` to implement a new benchmark workload."""
 
-    def prepare_dataset(self) -> None:
-        """Override this method if your workload needs to prepare a local dataset.
-
-        Every time the workload runner executes, it runs this pseudocode
-        before calling `run_workload`:
+    def __init__(self, dataset: Dataset):
+        self.dataset = dataset
 
-        if not workload.dataset_already_exists():
-            workload.prepare_dataset()
-
-        Store your dataset at `self.path_to_dataset`.
-        """
-        pass
-
     @abc.abstractmethod
-    def run_workload(self) -> dict[str, object]:
-        """Must be overridden. This method implements the workload.
-        """
-
-    def dataset_already_exists(self) -> bool:
-        """Returns True if the dataset is already on disk.
-        """
-        path_is_dir_which_is_not_empty = (
-            self.path_to_dataset.exists() and
-            self.path_to_dataset.is_dir() and
-            path_not_empty(self.path_to_dataset)
-        )
-        path_is_single_file = (
-            self.path_to_dataset.exists() and
-            not self.path_to_dataset.is_dir()
-        )
-        return path_is_dir_which_is_not_empty or path_is_single_file
+    def run(self) -> dict[str, object]:
+        """Must be overridden to implement the workload."""
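
A rough usage sketch, not part of the patch, of how the pieces above fit together after the Dataset/Workload split: a recipe defines a Dataset subclass plus a Workload subclass, and the runner prepares the dataset before calling run(). The "numpy_dataset" name, the array shape, the /tmp path, and the manual wiring under __main__ are illustrative assumptions (the real runner is expected to do the discovery and wiring); a concrete `name` property is shown because Dataset declares it abstract.

    import pathlib

    import numpy as np

    from perfcapture.dataset import Dataset
    from perfcapture.workload import Workload


    class NumpyDataset(Dataset):
        @property
        def name(self) -> str:
            # Also becomes the file name under base_data_path (see Dataset.__init__).
            return "numpy_dataset"

        def prepare(self) -> None:
            # Write a random array to self.path_to_dataset.
            rng = np.random.default_rng()
            array = rng.random((1_000, 1_000))  # illustrative size
            with open(self.path_to_dataset, mode="wb") as fh:
                np.save(fh, array)


    class ReadNumpyFile(Workload):
        def run(self) -> dict[str, object]:
            # Load the file prepared by NumpyDataset.
            np.load(self.dataset.path_to_dataset)
            return {}


    if __name__ == "__main__":
        # Mirrors the runner pseudocode from the Dataset docstring.
        base_data_path = pathlib.Path("/tmp/perfcapture_data")
        base_data_path.mkdir(parents=True, exist_ok=True)  # Dataset does not create it
        dataset = NumpyDataset(base_data_path=base_data_path)
        if not dataset.already_exists():
            dataset.prepare()
        ReadNumpyFile(dataset).run()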