diff --git a/README.md b/README.md index 50ed45f..027e967 100644 --- a/README.md +++ b/README.md @@ -7,3 +7,12 @@ Capture the performance of a computer system whilst running a set of benchmark w 2. Optionally create a virtual Python environment (e.g. with `python -m venv `) and activate that venv (`source bin/activate`). 3. `pip install -e .` + +# Usage + +To run the examples: + +``` +~/dev/perfcapture$ mkdir -p ~/temp/perfcapture_data_path +~/dev/perfcapture$ python scripts/cli.py --data-path ~/temp/perfcapture_data_path --recipe-path examples +``` \ No newline at end of file diff --git a/examples/read_numpy_file.py b/examples/read_numpy_file.py index ae21536..e71c95c 100644 --- a/examples/read_numpy_file.py +++ b/examples/read_numpy_file.py @@ -20,12 +20,18 @@ def prepare(self) -> None: print("Created array", flush=True) # Save array to temporary file - with open(self.path_to_dataset, mode="wb") as fh: + with open(self.path, mode="wb") as fh: np.save(fh, array) class ReadNumpyFile(Workload): + def init_dataset(self) -> Dataset: + return NumpyDataset() + def run(self): """Load numpy file into RAM.""" - for _ in range(100): - np.load(self.dataset.path_to_dataset) + np.load(self.dataset.path) + + @property + def n_repeats(self) -> int: + return 10 diff --git a/scripts/cli.py b/scripts/cli.py index ba8d6a6..17c20f7 100755 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -1,9 +1,14 @@ #!/usr/bin/env python import pathlib +import shutil +import subprocess +import sys import typer from typing_extensions import Annotated from typing import Optional +from perfcapture.workload import discover_workloads + app = typer.Typer() @@ -11,33 +16,73 @@ def bench( data_path: Annotated[ pathlib.Path, - typer.Argument(help="The directory for storing the data which the benchmarks read from.") + typer.Option(help="The path for storing the data which the benchmarks read from.") ], - recipe_dir: Annotated[ + recipe_path: Annotated[ pathlib.Path, - typer.Argument(help=( - "The directory 
containing the code which defines the Workloads and Datasets.")) + typer.Option(help=( + "The path containing the code which defines the Workloads and Datasets.")) ] = pathlib.Path("."), - workloads: Annotated[ + selected_workloads: Annotated[ Optional[str], - typer.Argument(help=( - "Space-separated list of workload classes to run. If not set, all workloads found in" - " recipe_dir will be run.")) + typer.Option(help=( + "Space-separated list of workloads to run. If not set, all workloads found in" + " recipe_path will be run. Use the `name` of each workload.")) ] = None, keep_cache: Annotated[ bool, typer.Option( - "--keep-cache", help="Set this flag to prevent `vmtouch -e` being called before each benchmark.", ) ] = False, ) -> None: + """Run workload(s) and measure performance. + + If any of the workloads require datasets to be pre-prepared then this script will first generate + all datasets required by the workload(s). Those datasets will be stored at the `data_path`. + The time spent creating the datasets will not be recorded. The contents of `data_path` will not + be removed after running this script. So if you run this script multiple times then subsequent + runs can make use of the already existing datasets. + + If you update the recipe which specifies the dataset creation then it is up to you to manually + delete the old dataset on disk. + + vmtouch must be installed if you wish to clear the page cache after each iteration. + """ + # Sanity checks + if not data_path.exists(): + sys.exit(f"ERROR! {data_path} does not exist! Please create the directory!") + if not recipe_path.exists(): + sys.exit(f"ERROR! {recipe_path} does not exist!") + if shutil.which("vmtouch") is None: # Check if vmtouch has been installed. + sys.exit( + "If you want to flush the page cache before each iteration, then please install" + " vmtouch. 
Or run with the --keep-cache option, which does not call vmtouch.") - all_workloads = descover_workloads(recipe_dir) - all_datasets = set([workload.dataset for workload in all_workloads]) + workloads = discover_workloads(recipe_path) + print(f"Found {len(workloads)} Workload(s) in {recipe_path}") + + # Filter workloads (if necessary). + if selected_workloads: + selected_workloads = selected_workloads.split(" ") + workloads = filter(lambda workload: workload.name in selected_workloads, workloads) + + # Prepare datasets (if necessary). + all_datasets = set([workload.dataset for workload in workloads]) for dataset in all_datasets: + dataset.set_path(data_path) if not dataset.already_exists(): - dataset.prepare(base_data_path=data_path) + dataset.prepare() + + # Run the workloads! + for workload in workloads: + print(f"Running {workload.name} {workload.n_repeats} times!", flush=True) + for _ in range(workload.n_repeats): + if not keep_cache: + p = subprocess.run( + ["vmtouch", "-e", workload.dataset.path], capture_output=True, check=True) + workload.run() + print(f" Finished running {workload.name} {workload.n_repeats} times!", flush=True) if __name__ == "__main__": app() diff --git a/src/perfcapture/dataset.py b/src/perfcapture/dataset.py index 71b4466..e7197d7 100644 --- a/src/perfcapture/dataset.py +++ b/src/perfcapture/dataset.py @@ -9,20 +9,22 @@ class Dataset(abc.ABC): Datasets are read by `Workload`s. """ - def __init__(self, base_data_path: pathlib.Path): - self.path_to_dataset = base_data_path / self.name + def set_path(self, base_data_path: pathlib.Path): + self.path = base_data_path / self.name @property - @abc.abstractmethod def name(self) -> str: - """The name of this dataset. Must be unique amongst all the datasets used in the benchmark suite.""" - pass + """The name of this dataset. + + Must be unique amongst all the datasets used in the benchmark suite. 
+ """ + return self.__class__.__name__ @abc.abstractmethod def prepare(self) -> None: """Override this method if your workload needs to prepare a local dataset. - Store your dataset at `self.path_to_dataset`. + Store your dataset at `self.path`. Every time the workload runner executes, it runs this pseudocode: @@ -34,12 +36,12 @@ def prepare(self) -> None: def already_exists(self) -> bool: """Returns True if the dataset is already on disk.""" path_is_dir_which_is_not_empty = ( - self.path_to_dataset.exists() and - self.path_to_dataset.is_dir() and - path_not_empty(self.path_to_dataset) + self.path.exists() and + self.path.is_dir() and + path_not_empty(self.path) ) path_is_single_file = ( - self.path_to_dataset.exists() and - not self.path_to_dataset.is_dir() + self.path.exists() and + not self.path.is_dir() ) return path_is_dir_which_is_not_empty or path_is_single_file \ No newline at end of file diff --git a/src/perfcapture/utils.py b/src/perfcapture/utils.py index c58a845..13fee01 100644 --- a/src/perfcapture/utils.py +++ b/src/perfcapture/utils.py @@ -1,7 +1,8 @@ """Simple utility functions.""" +import importlib.util import pathlib - +import sys def path_not_empty(path: pathlib.Path) -> bool: """Returns True if `path` is not empty.""" @@ -9,4 +10,13 @@ def path_not_empty(path: pathlib.Path) -> bool: # To save time, don't bother iterating past the first entry. 
 for _ in path.iterdir(): return True - return False \ No newline at end of file + return False + + +def load_module_from_filename(py_filename: pathlib.Path): + module_name = py_filename.stem + spec = importlib.util.spec_from_file_location(module_name, py_filename) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module diff --git a/src/perfcapture/workload.py b/src/perfcapture/workload.py index dd2c19e..e72c6c9 100644 --- a/src/perfcapture/workload.py +++ b/src/perfcapture/workload.py @@ -1,16 +1,60 @@ import abc +import inspect import pathlib from perfcapture.dataset import Dataset -from perfcapture.utils import path_not_empty +from perfcapture.utils import load_module_from_filename, path_not_empty class Workload(abc.ABC): """Inherit from `Workload` to implement a new benchmark workload.""" - def __init__(self, dataset: Dataset): - self.dataset = dataset + def __init__(self): + self.dataset = self.init_dataset() + + @abc.abstractmethod + def init_dataset(self) -> Dataset: + """Initialises and returns a concrete Dataset object.""" + + @abc.abstractmethod def run(self) -> dict[str, object]: """Must be overridden to implement the workload.""" + + @property + def name(self) -> str: + """The name of this workload. + + Must be unique amongst all the workloads used in this benchmark suite. 
+ """ + return self.__class__.__name__ + + @property + def n_repeats(self) -> int: + """The number of times to repeat this workload.""" + return 1 + + +def load_workloads_from_filename(py_filename: pathlib.Path) -> list[Workload]: + workloads = [] + module = load_module_from_filename(py_filename) + for member_name in dir(module): + module_attr = getattr(module, member_name) + if (module_attr + and inspect.isclass(module_attr) + and issubclass(module_attr, Workload) + and module_attr is not Workload + ): + print(f"Instantiating {member_name}") + workload_obj = module_attr() + workloads.append(workload_obj) + return workloads + + +def discover_workloads(recipe_path: pathlib.Path) -> list[Workload]: + workloads = [] + for py_filename in recipe_path.glob("*.py"): + workloads_from_py_file = load_workloads_from_filename(py_filename) + workloads.extend(workloads_from_py_file) + return workloads + + \ No newline at end of file