Skip to content

Commit

Permalink
Making a start on separating the dataset from the benchmark workload
Browse files Browse the repository at this point in the history
  • Loading branch information
JackKelly committed Sep 29, 2023
1 parent da234bb commit d126413
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 54 deletions.
13 changes: 8 additions & 5 deletions examples/read_numpy_file.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import numpy as np
from perfcapture.dataset import Dataset

from perfcapture.workload import Workload


class ReadNumpyFile(Workload):
def prepare_dataset(self):
class NumpyDataset(Dataset):
def prepare(self) -> None:
"""Create simple numpy file."""
# Generate an array of random numbers
rng = np.random.default_rng()
Expand All @@ -20,9 +21,11 @@ def prepare_dataset(self):

# Save array to temporary file
with open(self.path_to_dataset, mode="wb") as fh:
np.save(fh, array)
np.save(fh, array)

def run_workload(self):

class ReadNumpyFile(Workload):
def run(self):
"""Load numpy file into RAM."""
for _ in range(100):
np.load(self.path_to_dataset)
np.load(self.dataset.path_to_dataset)
30 changes: 25 additions & 5 deletions scripts/perfcapture.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
import pathlib
import typer
from typing_extensions import Annotated
from typing import Optional
Expand All @@ -8,16 +9,35 @@

@app.command()
def bench(
workloads: Annotated[Optional[str], typer.Argument()] = None,
do_not_clear_cache: Annotated[
data_path: Annotated[
pathlib.Path,
typer.Argument(help="The directory for storing the data which the benchmarks read from.")
],
recipe_dir: Annotated[
pathlib.Path,
typer.Argument(help=(
"The directory containing the code which defines the Workloads and Datasets."))
] = pathlib.Path("."),
workloads: Annotated[
Optional[str],
typer.Argument(help=(
"Space-separated list of workload classes to run. If not set, all workloads found in"
" recipe_dir will be run."))
] = None,
keep_cache: Annotated[
bool,
typer.Option(
"--do-not-clear-cache",
"--keep-cache",
help="Set this flag to prevent `vmtouch -e` being called before each benchmark.",
)
] = False,
):
pass
) -> None:

all_workloads = descover_workloads(recipe_dir)
all_datasets = set([workload.dataset for workload in all_workloads])
for dataset in all_datasets:
if not dataset.already_exists():
dataset.prepare(base_data_path=data_path)

if __name__ == "__main__":
app()
45 changes: 45 additions & 0 deletions src/perfcapture/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import abc
import pathlib

from perfcapture.utils import path_not_empty


class Dataset(abc.ABC):
    """Abstract base class for a benchmark dataset.

    Subclass `Dataset` to define a new dataset. Datasets are read by
    `Workload`s.
    """

    def __init__(self, base_data_path: pathlib.Path):
        # The dataset lives directly under the base path, keyed by its name.
        self.path_to_dataset = base_data_path / self.name

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """Return the name of this dataset.

        The name is joined onto the base data path to form
        `self.path_to_dataset`, so it must be unique amongst all the datasets
        used in the benchmark suite.
        """

    @abc.abstractmethod
    def prepare(self) -> None:
        """Create the dataset on disk at `self.path_to_dataset`.

        Subclasses must implement this method. The workload runner is
        expected to follow this pseudocode on every execution:

            if not dataset.already_exists():
                dataset.prepare()
        """

    def already_exists(self) -> bool:
        """Return True if the dataset is already on disk.

        A dataset counts as present when `self.path_to_dataset` is either a
        non-directory entry (e.g. a single file) or a directory for which
        `path_not_empty` reports contents.
        """
        target = self.path_to_dataset
        if not target.exists():
            return False
        if not target.is_dir():
            # Anything that exists and is not a directory counts as a dataset.
            return True
        # Normalise any falsy helper result to False.
        return path_not_empty(target) or False
50 changes: 6 additions & 44 deletions src/perfcapture/workload.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,16 @@
import abc
import pathlib
from perfcapture.dataset import Dataset

from perfcapture.utils import path_not_empty


"""
TODO: Workload and Dataset should be separate classes.
This is so single Dataset can be used by multiple Workloads.
"""

class Workload(abc.ABC):
"""To implement a new benchmark workload, inherit from `Workload`.
Most folks will want to override just two methods:
- prepare_dataset
- run_workload
"""

def __init__(self, path_to_dataset: pathlib.Path):
self.path_to_dataset = path_to_dataset
"""Inherit from `Workload` to implement a new benchmark workload."""

def prepare_dataset(self) -> None:
"""Override this method if your workload needs to prepare a local dataset.
Every time the workload runner executes, it runs this pseudocode
before calling `run_workload`:
def __init__(self, dataset: Dataset):
self.dataset = dataset

if not workload.dataset_already_exists():
workload.prepare_dataset()
Store your dataset at `self.path_to_dataset`.
"""
pass

@abc.abstractmethod
def run_workload(self) -> dict[str, object]:
"""Must be overridden. This method implements the workload.
"""

def dataset_already_exists(self) -> bool:
"""Returns True if the dataset is already on disk.
"""
path_is_dir_which_is_not_empty = (
self.path_to_dataset.exists() and
self.path_to_dataset.is_dir() and
path_not_empty(self.path_to_dataset)
)
path_is_single_file = (
self.path_to_dataset.exists() and
not self.path_to_dataset.is_dir()
)
return path_is_dir_which_is_not_empty or path_is_single_file
def run(self) -> dict[str, object]:
"""Must be overridden to implement the workload."""

0 comments on commit d126413

Please sign in to comment.