add cli access #25

Open · wants to merge 6 commits into main · showing changes from 5 commits
11 changes: 10 additions & 1 deletion .github/workflows/python-package-pip.yml
@@ -24,7 +24,16 @@ jobs:
        python -m pip install .
        python -m pip install pytest

    - name: Run tests
    - name: Run tests (non-distributed)
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      run: |
        python -m pytest tests/
    - name: Install distributed dependencies
      run: |
        python -m pip install .[dask-distributed]
    - name: Run tests (distributed)
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- add access to CLI via `mllam-data-prep` and add tests for CLI with/without `dask.distributed` [\#25](https://github.com/mllam/mllam-data-prep/pull/25).
- add optional output path argument to parser. [\#26](https://github.com/mllam/mllam-data-prep/pull/26)

### Changed
@@ -18,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- fix typo in install dependency `distributed` [\#20](https://github.com/mllam/mllam-data-prep/pull/20)
- add missing `psutil` requirement. [\#21](https://github.com/mllam/mllam-data-prep/pull/21).


## [v0.3.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.3.0)

[All changes](https://github.com/mllam/mllam-data-prep/compare/v0.3.0...v0.2.0)
4 changes: 2 additions & 2 deletions README.md
@@ -59,7 +59,7 @@ The package can also be used as a python module to create datasets in a more pro
### Command-line usage

```bash
python -m mllam_data_prep example.danra.yaml
mllam_data_prep example.danra.yaml
```

Example output:
@@ -78,7 +78,7 @@ fraction given. For example, to use 50% of the cores on the machine you would
run:

```bash
python -m mllam_data_prep example.danra.yaml --dask-distributed-local-core-fraction 0.5
mllam_data_prep example.danra.yaml --dask-distributed-local-core-fraction 0.5
```
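
For orientation, these two fractions map directly onto the `dask.distributed.LocalCluster` set up in `mllam_data_prep/cli.py` (shown later in this diff). A minimal sketch of that arithmetic, with illustrative values:

```python
import os

import psutil

core_fraction = 0.5  # --dask-distributed-local-core-fraction
memory_fraction = 0.9  # --dask-distributed-local-memory-fraction (default)

# worker count and per-worker memory limit, as computed in cli.py
n_workers = int(core_fraction * os.cpu_count())
memory_per_worker = psutil.virtual_memory().total / n_workers * memory_fraction
# e.g. 16 cores and 32 GB of RAM give 8 workers with ~3.6 GB each
```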

Unfortunately, the number of cores to use can only be worked out by trial and
80 changes: 2 additions & 78 deletions mllam_data_prep/__main__.py
@@ -1,80 +1,4 @@
import os
from pathlib import Path

from loguru import logger

from .create_dataset import create_dataset_zarr

# Attempt to import psutil and dask.distributed modules
DASK_DISTRIBUTED_AVAILABLE = True
try:
    import psutil
    from dask.diagnostics import ProgressBar
    from dask.distributed import LocalCluster
except ImportError or ModuleNotFoundError:
    DASK_DISTRIBUTED_AVAILABLE = False
from .cli import call

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("config", help="Path to the config file", type=Path)
    parser.add_argument(
        "-o", "--output", help="Path to the output zarr file", type=Path, default=None
    )
    parser.add_argument(
        "--show-progress", help="Show progress bar", action="store_true"
    )
    parser.add_argument(
        "--dask-distributed-local-core-fraction",
        help="Fraction of cores to use on the local machine to do multiprocessing with dask.distributed",
        type=float,
        default=0.0,
    )
    parser.add_argument(
        "--dask-distributed-local-memory-fraction",
        help="Fraction of memory to use on the local machine (when doing multiprocessing with dask.distributed)",
        type=float,
        default=0.9,
    )
    args = parser.parse_args()

    if args.show_progress:
        ProgressBar().register()

    if args.dask_distributed_local_core_fraction > 0.0:
        # Only run this block if dask.distributed is available
        if not DASK_DISTRIBUTED_AVAILABLE:
            raise ModuleNotFoundError(
                "Currently dask.distributed isn't installed and therefore can't "
                "be used in mllam-data-prep. Please install the optional dependency "
                'with `python -m pip install "mllam-data-prep[dask-distributed]"`'
            )
        # get the number of system cores
        n_system_cores = os.cpu_count()
        # compute the number of cores to use
        n_local_cores = int(args.dask_distributed_local_core_fraction * n_system_cores)
        # get the total system memory
        total_memory = psutil.virtual_memory().total
        # compute the memory per worker
        memory_per_worker = (
            total_memory / n_local_cores * args.dask_distributed_local_memory_fraction
        )

        logger.info(
            f"Setting up dask.distributed.LocalCluster with {n_local_cores} cores and {memory_per_worker/1024/1024:0.0f} MB of memory per worker"
        )

        cluster = LocalCluster(
            n_workers=n_local_cores,
            threads_per_worker=1,
            memory_limit=memory_per_worker,
        )

        client = cluster.get_client()
        # print the dashboard link
        logger.info(f"Dashboard link: {cluster.dashboard_link}")

    create_dataset_zarr(fp_config=args.config, fp_zarr=args.output)
    args = call(args=None)
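
Assembled from the additions above, the new `mllam_data_prep/__main__.py` reduces to a thin wrapper around the CLI module:

```python
from .cli import call

if __name__ == "__main__":
    args = call(args=None)
```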
75 changes: 75 additions & 0 deletions mllam_data_prep/cli.py
@@ -0,0 +1,75 @@
"""
Command line interface for mllam_data_prep
"""
import argparse
import os
from pathlib import Path

from loguru import logger

from .create_dataset import create_dataset_zarr

# Attempt to import psutil and dask.distributed modules
DASK_DISTRIBUTED_AVAILABLE = True
try:
import psutil
from dask.diagnostics import ProgressBar
from dask.distributed import LocalCluster
except ImportError or ModuleNotFoundError:
DASK_DISTRIBUTED_AVAILABLE = False


def call(args=None):
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("config", help="Path to the config file", type=Path)
parser.add_argument(
"-o", "--output", help="Path to the output zarr file", type=Path, default=None
)
parser.add_argument(
"--show-progress", help="Show progress bar", action="store_true"
)
parser.add_argument(
"--dask-distributed-local-core-fraction",
help="Fraction of cores to use on the local machine to do multiprocessing with dask.distributed",
type=float,
default=0.0,
)
parser.add_argument(
"--dask-distributed-local-memory-fraction",
help="Fraction of memory to use on the local machine (when doing multiprocessing with dask.distributed)",
type=float,
default=0.9,
)
args = parser.parse_args(args)

if args.show_progress:
ProgressBar().register()

if args.dask_distributed_local_core_fraction > 0.0:
# get the number of system cores
n_system_cores = os.cpu_count()
# compute the number of cores to use
n_local_cores = int(args.dask_distributed_local_core_fraction * n_system_cores)
# get the total system memory
total_memory = psutil.virtual_memory().total
# compute the memory per worker
memory_per_worker = (
total_memory / n_local_cores * args.dask_distributed_local_memory_fraction
)

logger.info(
f"Setting up dask.distributed.LocalCluster with {n_local_cores} cores and {memory_per_worker/1024/1024:0.0f} MB of memory per worker"
)

cluster = LocalCluster(
n_workers=n_local_cores,
threads_per_worker=1,
memory_limit=memory_per_worker,
)

# print the dashboard link
logger.info(f"Dashboard link: {cluster.dashboard_link}")

create_dataset_zarr(fp_config=args.config, fp_zarr=args.output)
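
As the tests below exercise, `call` accepts an explicit argument list, so the CLI can also be driven programmatically. A minimal sketch (the output path here is illustrative):

```python
from mllam_data_prep.cli import call

# equivalent to: mllam_data_prep example.danra.yaml -o example.danra.zarr
call(["example.danra.yaml", "--output", "example.danra.zarr"])
```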
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -44,3 +44,5 @@ dev = [
"ipdb>=0.13.13",
"pre-commit>=3.7.1",
]
[project.scripts]
mllam_data_prep = "mllam_data_prep:cli.call"
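
A note on the entry-point spelling: `"mllam_data_prep:cli.call"` tells the installer to import the `mllam_data_prep` package and then resolve the dotted attribute path `cli.call` on it. Whether that resolves out of the box depends on the installer's script template and on `cli` being reachable as an attribute of the package; the more conventional, unambiguous spelling would be `"mllam_data_prep.cli:call"`. A sketch of roughly what the generated `mllam_data_prep` console script does (an assumption about the installer's template, not code from this repository):

```python
import sys

# the installer imports the first attribute component from the package ...
from mllam_data_prep import cli

if __name__ == "__main__":
    # ... and calls the resolved function, using its return value as exit status
    sys.exit(cli.call())
```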
14 changes: 14 additions & 0 deletions tests/test_cli.py
@@ -0,0 +1,14 @@
import tempfile

import pytest
import xarray as xr

from mllam_data_prep.cli import call


@pytest.mark.parametrize("args", [["example.danra.yaml"]])
def test_call(args):
    with tempfile.TemporaryDirectory(suffix=".zarr") as tmpdir:
        args.extend(["--output", tmpdir])
        call(args)
        _ = xr.open_zarr(tmpdir)
53 changes: 53 additions & 0 deletions tests/test_distributed.py
Review comment (Member): this test is pure elegance, nice!
@@ -0,0 +1,53 @@
import importlib
import tempfile

import pytest
import xarray as xr

from mllam_data_prep.cli import call


def call_wrapper(args):
    with tempfile.TemporaryDirectory(suffix=".zarr") as tmpdir:
        args.extend(["--output", tmpdir])
        call(args)
        _ = xr.open_zarr(tmpdir)


def distributed():
    """Check if dask.distributed is installed"""
    try:
        importlib.import_module("dask.distributed")

        return True
    except (ModuleNotFoundError, ImportError):
        return False


@pytest.mark.parametrize(
    "args",
    [
        ["example.danra.yaml", "--dask-distributed-local-core-fraction", "1.0"],
        ["example.danra.yaml"],
    ],
)
def test_run_distributed(args):
    if distributed():
        call_wrapper(args)
    elif not distributed() and "--dask-distributed-local-core-fraction" in args:
        index = args.index("--dask-distributed-local-core-fraction")
        core_fraction = float(args[index + 1])
        if core_fraction > 0:
            pytest.raises(
                NameError,
                call_wrapper,
                args=args,
            )
        else:
            pytest.raises(
                ModuleNotFoundError,
                call_wrapper,
                args=args,
            )
    else:
        call_wrapper(args)
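
A note on the expected exceptions: in `mllam_data_prep/cli.py` the optional imports live in a `try` block, so when `dask.distributed` is missing the name `LocalCluster` is simply never bound, and requesting a cluster fails with `NameError` rather than `ModuleNotFoundError`. A minimal sketch of the pattern (illustrative, not the module's code):

```python
try:
    from dask.distributed import LocalCluster
except (ImportError, ModuleNotFoundError):
    pass  # LocalCluster is left unbound

def make_cluster():
    # raises NameError if the import above failed
    return LocalCluster(n_workers=1, threads_per_worker=1)
```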