Skip to content

Commit

Permalink
Merge 65e47f7 into 1526017
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesbluca authored Apr 12, 2021
2 parents 1526017 + 65e47f7 commit ba92e58
Show file tree
Hide file tree
Showing 17 changed files with 549 additions and 485 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dask_cuda.egg-info/
python/build
python/cudf/bindings/*.cpp
dask-worker-space/
docs/_build/

## Patching
*.diff
Expand Down
244 changes: 139 additions & 105 deletions dask_cuda/cli/dask_cuda_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,188 +19,222 @@

@click.command(context_settings=dict(ignore_unknown_options=True))
@click.argument("scheduler", type=str, required=False)
@click.option(
"--tls-ca-file",
type=pem_file_option_type,
default=None,
help="CA cert(s) file for TLS (in PEM format)",
)
@click.option(
"--tls-cert",
type=pem_file_option_type,
default=None,
help="certificate file for TLS (in PEM format)",
)
@click.option(
"--tls-key",
type=pem_file_option_type,
default=None,
help="private key file for TLS (in PEM format)",
)
@click.option("--dashboard-address", type=str, default=":0", help="dashboard address")
@click.option(
"--dashboard/--no-dashboard",
"dashboard",
default=True,
show_default=True,
required=False,
help="Launch dashboard",
@click.argument(
"preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv
)
@click.option(
"--host",
type=str,
default=None,
help="Serving host. Should be an ip address that is"
" visible to the scheduler and other workers. "
"See --listen-address and --contact-address if you "
"need different listen and contact addresses. "
"See --interface.",
help="""IP address of serving host; should be visible to the scheduler and other
workers. Can be a string (like ``"127.0.0.1"``) or ``None`` to fall back on the
address of the interface specified by ``--interface`` or the default interface.""",
)
@click.option(
"--interface",
type=str,
default=None,
help="The external interface used to connect to the scheduler, usually "
"an ethernet interface is used for connection, and not an InfiniBand "
"interface (if one is available).",
"--nthreads",
type=int,
default=1,
show_default=True,
help="Number of threads to be used for each Dask worker process.",
)
@click.option("--nthreads", type=int, default=1, help="Number of threads per process.")
@click.option(
"--name",
type=str,
default=None,
help="A unique name for this worker like 'worker-1'. "
"If used with --nprocs then the process number "
"will be appended like name-0, name-1, name-2, ...",
help="""A unique name for the worker. Can be a string (like ``"worker-1"``) or
``None`` for a nameless worker. If used with ``--nprocs``, then the process number
will be appended to the worker name, e.g. ``"worker-1-0"``, ``"worker-1-1"``,
``"worker-1-2"``.""",
)
@click.option(
"--memory-limit",
default="auto",
help="Bytes of memory per process that the worker can use. "
"This can be an integer (bytes), "
"float (fraction of total system memory), "
"string (like 5GB or 5000M), "
"'auto', or zero for no memory management",
show_default=True,
help="""Bytes of memory per process that the worker can use. Can be an integer
(bytes), float (fraction of total system memory), string (like ``"5GB"`` or
``"5000M"``), or ``"auto"`` or 0 for no memory management.""",
)
@click.option(
"--device-memory-limit",
default="0.8",
help="Specifies the size of the CUDA device LRU cache, which "
"is used to determine when the worker starts spilling to host "
"memory. This can be a float (fraction of total device "
"memory), an integer (bytes), a string (like 5GB or 5000M), "
"and 'auto' or 0 to disable spilling to host (i.e., allow "
"full device memory usage). Default is 0.8, 80% of the "
"worker's total device memory.",
show_default=True,
help="""Size of the CUDA device LRU cache, which is used to determine when the
worker starts spilling to host memory. Can be an integer (bytes), float (fraction of
total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to
disable spilling to host (i.e. allow full device memory usage).""",
)
@click.option(
"--rmm-pool-size",
default=None,
help="If specified, initialize each worker with an RMM pool of "
"the given size, otherwise no RMM pool is created. This can be "
"an integer (bytes) or string (like 5GB or 5000M)."
"NOTE: This size is a per worker (i.e., per GPU) configuration, "
"and not cluster-wide!",
help="""RMM pool size to initialize each worker with. Can be an integer (bytes),
string (like ``"5GB"`` or ``"5000M"``), or ``None`` to disable RMM pools.
.. note::
This size is a per-worker configuration, and not cluster-wide.""",
)
@click.option(
"--rmm-managed-memory/--no-rmm-managed-memory",
default=False,
help="If enabled, initialize each worker with RMM and set it to "
"use managed memory. If disabled, RMM may still be used if "
"--rmm-pool-size is specified, but in that case with default "
"(non-managed) memory type."
"WARNING: managed memory is currently incompatible with NVLink, "
"trying to enable both will result in an exception.",
show_default=True,
help="""Initialize each worker with RMM and set it to use managed memory. If
disabled, RMM may still be used by specifying ``--rmm-pool-size``.
.. warning::
Managed memory is currently incompatible with NVLink. Trying to enable both will
result in an exception.""",
)
@click.option(
"--rmm-log-directory",
default=None,
help="Directory to write per-worker RMM log files to; the client "
"and scheduler are not logged here."
"NOTE: Logging will only be enabled if --rmm-pool-size or "
"--rmm-managed-memory are specified.",
help="""Directory to write per-worker RMM log files to. The client and scheduler are
not logged here. Can be a string (like ``"/path/to/logs/"``) or ``None`` to disable
logging.
.. note::
Logging will only be enabled if ``--rmm-pool-size`` or ``--rmm-managed-memory``
are specified.""",
)
@click.option(
"--pid-file", type=str, default="", help="File to write the process PID.",
)
@click.option(
"--resources",
type=str,
default="",
help="""Resources for task constraints like ``"GPU=2 MEM=10e9"``. Resources are
applied separately to each worker process (only relevant when starting multiple
worker processes with ``--nprocs``).""",
)
@click.option(
"--reconnect/--no-reconnect",
"--dashboard/--no-dashboard",
"dashboard",
default=True,
help="Reconnect to scheduler if disconnected",
show_default=True,
required=False,
help="Launch the dashboard.",
)
@click.option("--pid-file", type=str, default="", help="File to write the process PID")
@click.option(
"--local-directory", default=None, type=str, help="Directory to place worker files"
"--dashboard-address",
type=str,
default=":0",
show_default=True,
help="Relative address to serve the dashboard (if enabled).",
)
@click.option(
"--resources",
"--local-directory",
default=None,
type=str,
default="",
help='Resources for task constraints like "GPU=2 MEM=10e9". '
"Resources are applied separately to each worker process "
"(only relevant when starting multiple worker processes with '--nprocs').",
help="""Path on local machine to store temporary files. Can be a string (like
``"path/to/files"``) or ``None`` to fall back on the value of
``dask.temporary-directory`` in the local Dask configuration, using the current
working directory if this is not set.""",
)
@click.option(
"--scheduler-file",
type=str,
default="",
help="Filename to JSON encoded scheduler information. "
"Use with dask-scheduler --scheduler-file",
help="""Filename to JSON encoded scheduler information. To be used in conjunction
with the equivalent ``dask-scheduler`` option.""",
)
@click.option(
"--dashboard-prefix", type=str, default=None, help="Prefix for the Dashboard"
"--interface",
type=str,
default=None,
help="""External interface used to connect to the scheduler. Usually an ethernet
interface is used for connection, and not an InfiniBand interface (if one is
available). Can be a string (like ``"eth0"`` for NVLink or ``"ib0"`` for
InfiniBand) or ``None`` to fall back on the default interface.""",
)
@click.option(
"--preload",
type=str,
multiple=True,
is_eager=True,
help="Module that should be loaded by each worker process "
'like "foo.bar" or "/path/to/foo.py"',
help="""Module that should be loaded by each worker process like ``"foo.bar"`` or
``"/path/to/foo.py"``.""",
)
@click.argument(
"preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv
@click.option(
"--dashboard-prefix",
type=str,
default=None,
help="""Prefix for the dashboard. Can be a string (like ...) or ``None`` for no
prefix.""",
)
@click.option(
"--tls-ca-file",
type=pem_file_option_type,
default=None,
help="""CA certificate(s) file for TLS (in PEM format). Can be a string (like
``"path/to/certs"``), or ``None`` for no certificate(s).""",
)
@click.option(
"--tls-cert",
type=pem_file_option_type,
default=None,
help="""Certificate file for TLS (in PEM format). Can be a string (like
``"path/to/certs"``), or ``None`` for no certificate(s).""",
)
@click.option(
"--tls-key",
type=pem_file_option_type,
default=None,
help="""Private key file for TLS (in PEM format). Can be a string (like
``"path/to/certs"``), or ``None`` for no private key.""",
)
@click.option(
"--enable-tcp-over-ucx/--disable-tcp-over-ucx",
default=False,
help="Enable TCP communication over UCX",
show_default=True,
help="""Set environment variables to enable TCP over UCX, even if InfiniBand and
NVLink are not supported or disabled.""",
)
@click.option(
"--enable-infiniband/--disable-infiniband",
default=False,
help="Enable InfiniBand communication",
show_default=True,
help="""Set environment variables to enable UCX over InfiniBand, implies
``--enable-tcp-over-ucx``.""",
)
@click.option(
"--enable-rdmacm/--disable-rdmacm",
"--enable-nvlink/--disable-nvlink",
default=False,
help="Enable RDMA connection manager, currently requires InfiniBand enabled.",
show_default=True,
help="""Set environment variables to enable UCX over NVLink, implies
``--enable-tcp-over-ucx``.""",
)
@click.option(
"--enable-nvlink/--disable-nvlink",
"--enable-rdmacm/--disable-rdmacm",
default=False,
help="Enable NVLink communication",
show_default=True,
help="""Set environment variables to enable UCX RDMA connection manager support,
requires ``--enable-infiniband``.""",
)
@click.option(
"--net-devices",
type=str,
default=None,
help="When None (default), 'UCX_NET_DEVICES' will be left to its default. "
"Otherwise, it must be a non-empty string with the interface name, such as "
"such as 'eth0' or 'auto' to allow for automatically choosing the closest "
"interface based on the system's topology. Normally used only with "
"--enable-infiniband to specify the interface to be used by the worker, "
"such as 'mlx5_0:1' or 'ib0'. "
"WARNING: 'auto' requires UCX-Py to be installed and compiled with hwloc "
"support. Additionally that will always use the closest interface, and "
"that may cause unexpected errors if that interface is not properly "
"configured or is disconnected, for that reason it's limited to "
"InfiniBand only and will still cause unpredictable errors if not _ALL_ "
"interfaces are connected and properly configured.",
help="""Interface(s) used by workers for UCX communication. Can be a string (like
``"eth0"`` for NVLink or ``"mlx5_0:1"``/``"ib0"`` for InfiniBand), ``"auto"``
(requires ``--enable-infiniband``) to pick the optimal interface per-worker based on
the system's topology, or ``None`` to stay with the default value of ``"all"`` (use
all available interfaces).
.. warning::
``"auto"`` requires UCX-Py to be installed and compiled with hwloc support.
Unexpected errors can occur when using ``"auto"`` if any interfaces are
disconnected or improperly configured.""",
)
@click.option(
"--enable-jit-unspill/--disable-jit-unspill",
default=None,
help="Enable just-in-time unspilling. This is experimental and doesn't "
"support memory spilling to disk Please see proxy_object.ProxyObject "
"and proxify_host_file.ProxifyHostFile.",
help="""Enable just-in-time unspilling. Can be a boolean or ``None`` to fall back on
the value of ``dask.jit-unspill`` in the local Dask configuration, disabling
unspilling if this is not set.
.. note::
This is experimental and doesn't support memory spilling to disk. See
``proxy_object.ProxyObject`` and ``proxify_host_file.ProxifyHostFile`` for more
info.""",
)
def main(
scheduler,
Expand Down
Loading

0 comments on commit ba92e58

Please sign in to comment.