diff --git a/.gitignore b/.gitignore index bd3ca853c..2bdc52a19 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ dask_cuda.egg-info/ python/build python/cudf/bindings/*.cpp dask-worker-space/ +docs/_build/ ## Patching *.diff diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index f67c384d4..5b62aae73 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -19,98 +19,69 @@ @click.command(context_settings=dict(ignore_unknown_options=True)) @click.argument("scheduler", type=str, required=False) -@click.option( - "--tls-ca-file", - type=pem_file_option_type, - default=None, - help="CA cert(s) file for TLS (in PEM format)", -) -@click.option( - "--tls-cert", - type=pem_file_option_type, - default=None, - help="certificate file for TLS (in PEM format)", -) -@click.option( - "--tls-key", - type=pem_file_option_type, - default=None, - help="private key file for TLS (in PEM format)", -) -@click.option("--dashboard-address", type=str, default=":0", help="dashboard address") -@click.option( - "--dashboard/--no-dashboard", - "dashboard", - default=True, - show_default=True, - required=False, - help="Launch dashboard", +@click.argument( + "preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv ) @click.option( "--host", type=str, default=None, - help="Serving host. Should be an ip address that is" - " visible to the scheduler and other workers. " - "See --listen-address and --contact-address if you " - "need different listen and contact addresses. " - "See --interface.", + help="""IP address of serving host; should be visible to the scheduler and other + workers. Can be a string (like ``"127.0.0.1"``) or ``None`` to fall back on the + address of the interface specified by ``--interface`` or the default interface.""", ) @click.option( - "--interface", - type=str, - default=None, - help="The external interface used to connect to the scheduler, usually " - "an ethernet interface is used for connection, and not an InfiniBand " - "interface (if one is available).", + "--nthreads", + type=int, + default=1, + show_default=True, + help="Number of threads to be used for each Dask worker process.", ) -@click.option("--nthreads", type=int, default=1, help="Number of threads per process.") @click.option( "--name", type=str, default=None, - help="A unique name for this worker like 'worker-1'. " - "If used with --nprocs then the process number " - "will be appended like name-0, name-1, name-2, ...", + help="""A unique name for the worker. Can be a string (like ``"worker-1"``) or + ``None`` for a nameless worker. If used with ``--nprocs``, then the process number + will be appended to the worker name, e.g. ``"worker-1-0"``, ``"worker-1-1"``, + ``"worker-1-2"``.""", ) @click.option( "--memory-limit", default="auto", - help="Bytes of memory per process that the worker can use. " - "This can be an integer (bytes), " - "float (fraction of total system memory), " - "string (like 5GB or 5000M), " - "'auto', or zero for no memory management", + show_default=True, + help="""Bytes of memory per process that the worker can use. Can be an integer + (bytes), float (fraction of total system memory), string (like ``"5GB"`` or + ``"5000M"``), or ``"auto"`` or 0 for no memory management.""", ) @click.option( "--device-memory-limit", default="0.8", - help="Specifies the size of the CUDA device LRU cache, which " - "is used to determine when the worker starts spilling to host " - "memory. 
This can be a float (fraction of total device " - "memory), an integer (bytes), a string (like 5GB or 5000M), " - "and 'auto' or 0 to disable spilling to host (i.e., allow " - "full device memory usage). Default is 0.8, 80% of the " - "worker's total device memory.", + show_default=True, + help="""Size of the CUDA device LRU cache, which is used to determine when the + worker starts spilling to host memory. Can be an integer (bytes), float (fraction of + total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to + disable spilling to host (i.e. allow full device memory usage).""", ) @click.option( "--rmm-pool-size", default=None, - help="If specified, initialize each worker with an RMM pool of " - "the given size, otherwise no RMM pool is created. This can be " - "an integer (bytes) or string (like 5GB or 5000M)." - "NOTE: This size is a per worker (i.e., per GPU) configuration, " - "and not cluster-wide!", + help="""RMM pool size to initialize each worker with. Can be an integer (bytes), + string (like ``"5GB"`` or ``"5000M"``), or ``None`` to disable RMM pools. + + .. note:: + This size is a per-worker configuration, and not cluster-wide.""", ) @click.option( "--rmm-managed-memory/--no-rmm-managed-memory", default=False, - help="If enabled, initialize each worker with RMM and set it to " - "use managed memory. If disabled, RMM may still be used if " - "--rmm-pool-size is specified, but in that case with default " - "(non-managed) memory type." - "WARNING: managed memory is currently incompatible with NVLink, " - "trying to enable both will result in an exception.", + show_default=True, + help="""Initialize each worker with RMM and set it to use managed memory. If + disabled, RMM may still be used by specifying ``--rmm-pool-size``. + + .. warning:: + Managed memory is currently incompatible with NVLink. Trying to enable both will + result in failure.""", ) @click.option( "--rmm-async/--no-rmm-async", @@ -119,7 +90,7 @@ help="""Initialize each worker withh RMM and set it to use RMM's asynchronous allocator. See ``rmm.mr.CudaAsyncMemoryResource`` for more info. - .. note:: + .. warning:: The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also incompatible with RMM pools and managed memory, trying to enable both will result in failure.""", @@ -127,92 +98,155 @@ @click.option( "--rmm-log-directory", default=None, - help="Directory to write per-worker RMM log files to; the client " - "and scheduler are not logged here." - "NOTE: Logging will only be enabled if --rmm-pool-size, " - "--rmm-managed-memory, or --rmm-async are specified.", + help="""Directory to write per-worker RMM log files to. The client and scheduler are + not logged here. Can be a string (like ``"/path/to/logs/"``) or ``None`` to disable + logging. + + .. note:: + Logging will only be enabled if ``--rmm-pool-size`` or ``--rmm-managed-memory`` + are specified.""", +) +@click.option( + "--pid-file", type=str, default="", help="File to write the process PID.", +) +@click.option( + "--resources", + type=str, + default="", + help="""Resources for task constraints like ``"GPU=2 MEM=10e9"``. 
Resources are + applied separately to each worker process (only relevant when starting multiple + worker processes with ``--nprocs``).""", ) @click.option( - "--reconnect/--no-reconnect", + "--dashboard/--no-dashboard", + "dashboard", default=True, - help="Reconnect to scheduler if disconnected", + show_default=True, + required=False, + help="Launch the dashboard.", ) -@click.option("--pid-file", type=str, default="", help="File to write the process PID") @click.option( - "--local-directory", default=None, type=str, help="Directory to place worker files" + "--dashboard-address", + type=str, + default=":0", + show_default=True, + help="Relative address to serve the dashboard (if enabled).", ) @click.option( - "--resources", + "--local-directory", + default=None, type=str, - default="", - help='Resources for task constraints like "GPU=2 MEM=10e9". ' - "Resources are applied separately to each worker process " - "(only relevant when starting multiple worker processes with '--nprocs').", + help="""Path on local machine to store temporary files. Can be a string (like + ``"path/to/files"``) or ``None`` to fall back on the value of + ``dask.temporary-directory`` in the local Dask configuration, using the current + working directory if this is not set.""", ) @click.option( "--scheduler-file", type=str, default="", - help="Filename to JSON encoded scheduler information. " - "Use with dask-scheduler --scheduler-file", + help="""Filename to JSON encoded scheduler information. To be used in conjunction + with the equivalent ``dask-scheduler`` option.""", ) @click.option( - "--dashboard-prefix", type=str, default=None, help="Prefix for the Dashboard" + "--interface", + type=str, + default=None, + help="""External interface used to connect to the scheduler. Usually an ethernet + interface is used for connection, and not an InfiniBand interface (if one is + available). Can be a string (like ``"eth0"`` for NVLink or ``"ib0"`` for + InfiniBand) or ``None`` to fall back on the default interface.""", ) @click.option( "--preload", type=str, multiple=True, is_eager=True, - help="Module that should be loaded by each worker process " - 'like "foo.bar" or "/path/to/foo.py"', + help="""Module that should be loaded by each worker process like ``"foo.bar"`` or + ``"/path/to/foo.py"``.""", ) -@click.argument( - "preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv +@click.option( + "--dashboard-prefix", + type=str, + default=None, + help="""Prefix for the dashboard. Can be a string (like ...) or ``None`` for no + prefix.""", +) +@click.option( + "--tls-ca-file", + type=pem_file_option_type, + default=None, + help="""CA certificate(s) file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no certificate(s).""", +) +@click.option( + "--tls-cert", + type=pem_file_option_type, + default=None, + help="""Certificate file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no certificate(s).""", +) +@click.option( + "--tls-key", + type=pem_file_option_type, + default=None, + help="""Private key file for TLS (in PEM format). 
Can be a string (like + ``"path/to/certs"``), or ``None`` for no private key.""", ) @click.option( "--enable-tcp-over-ucx/--disable-tcp-over-ucx", default=False, - help="Enable TCP communication over UCX", + show_default=True, + help="""Set environment variables to enable TCP over UCX, even if InfiniBand and + NVLink are not supported or disabled.""", ) @click.option( "--enable-infiniband/--disable-infiniband", default=False, - help="Enable InfiniBand communication", + show_default=True, + help="""Set environment variables to enable UCX over InfiniBand, implies + ``--enable-tcp-over-ucx``.""", ) @click.option( - "--enable-rdmacm/--disable-rdmacm", + "--enable-nvlink/--disable-nvlink", default=False, - help="Enable RDMA connection manager, currently requires InfiniBand enabled.", + show_default=True, + help="""Set environment variables to enable UCX over NVLink, implies + ``--enable-tcp-over-ucx``.""", ) @click.option( - "--enable-nvlink/--disable-nvlink", + "--enable-rdmacm/--disable-rdmacm", default=False, - help="Enable NVLink communication", + show_default=True, + help="""Set environment variables to enable UCX RDMA connection manager support, + requires ``--enable-infiniband``.""", ) @click.option( "--net-devices", type=str, default=None, - help="When None (default), 'UCX_NET_DEVICES' will be left to its default. " - "Otherwise, it must be a non-empty string with the interface name, such as " - "such as 'eth0' or 'auto' to allow for automatically choosing the closest " - "interface based on the system's topology. Normally used only with " - "--enable-infiniband to specify the interface to be used by the worker, " - "such as 'mlx5_0:1' or 'ib0'. " - "WARNING: 'auto' requires UCX-Py to be installed and compiled with hwloc " - "support. Additionally that will always use the closest interface, and " - "that may cause unexpected errors if that interface is not properly " - "configured or is disconnected, for that reason it's limited to " - "InfiniBand only and will still cause unpredictable errors if not _ALL_ " - "interfaces are connected and properly configured.", + help="""Interface(s) used by workers for UCX communication. Can be a string (like + ``"eth0"`` for NVLink or ``"mlx5_0:1"``/``"ib0"`` for InfiniBand), ``"auto"`` + (requires ``--enable-infiniband``) to pick the optimal interface per-worker based on + the system's topology, or ``None`` to stay with the default value of ``"all"`` (use + all available interfaces). + + .. warning:: + ``"auto"`` requires UCX-Py to be installed and compiled with hwloc support. + Unexpected errors can occur when using ``"auto"`` if any interfaces are + disconnected or improperly configured.""", ) @click.option( "--enable-jit-unspill/--disable-jit-unspill", default=None, - help="Enable just-in-time unspilling. This is experimental and doesn't " - "support memory spilling to disk Please see proxy_object.ProxyObject " - "and proxify_host_file.ProxifyHostFile.", + help="""Enable just-in-time unspilling. Can be a boolean or ``None`` to fall back on + the value of ``dask.jit-unspill`` in the local Dask configuration, disabling + unspilling if this is not set. + + .. note:: + This is experimental and doesn't support memory spilling to disk. 
See + ``proxy_object.ProxyObject`` and ``proxify_host_file.ProxifyHostFile`` for more + info.""", ) def main( scheduler, diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index d47dda268..416a7d6e1 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -22,16 +22,16 @@ def initialize( """Create CUDA context and initialize UCX-Py, depending on user parameters. Sometimes it is convenient to initialize the CUDA context, particularly before - starting up Dask workers which create a variety of threads. + starting up Dask worker processes which create a variety of threads. - To ensure UCX works correctly, it is important to ensure it is initialized with - the correct options. This is especially important for the client, which cannot be + To ensure UCX works correctly, it is important to ensure it is initialized with the + correct options. This is especially important for the client, which cannot be configured to use UCX with arguments like ``LocalCUDACluster`` and ``dask-cuda-worker``. This function will ensure that they are provided a UCX configuration based on the flags and options passed by the user. This function can also be used within a worker preload script for UCX configuration - of mainline Dask/Distributed. + of mainline Dask.distributed. https://docs.dask.org/en/latest/setup/custom-startup.html You can add it to your global config with the following YAML: @@ -43,42 +43,39 @@ def initialize( preload: - dask_cuda.initialize - See https://docs.dask.org/en/latest/configuration.html for more information - about Dask configuration. + See https://docs.dask.org/en/latest/configuration.html for more information about + Dask configuration. Parameters ---------- - create_cuda_context: bool + create_cuda_context : bool, default True Create CUDA context on initialization. - Default is ``True``. - enable_tcp_over_ucx: bool - Set environment variables to enable TCP over UCX, even if InfiniBand - and NVLink are not supported or disabled. - Default is ``False``. - enable_infiniband: bool - Set environment variables to enable UCX InfiniBand support, implies + enable_tcp_over_ucx : bool, default False + Set environment variables to enable TCP over UCX, even if InfiniBand and NVLink + are not supported or disabled. + enable_infiniband : bool, default False + Set environment variables to enable UCX over InfiniBand, implies ``enable_tcp_over_ucx=True``. - Default is ``False``. - enable_nvlink: bool - Set environment variables to enable UCX NVLink support, implies + enable_nvlink : bool, default False + Set environment variables to enable UCX over NVLink, implies ``enable_tcp_over_ucx=True``. - Default is ``False``. - enable_rdmacm: bool + enable_rdmacm : bool, default False Set environment variables to enable UCX RDMA connection manager support, - implies ``enable_infiniband=True``. - Default is ``False``. - net_devices: callable or str - If callable, the function must take exactly one argument (the index of - current GPU) that will be used to get the interface name, such as - ``lambda dev: "mlx5_%d:1" % (dev // 2)``, which would return - ``"mlx5_1:1"`` for GPU 3. - If a string, must be an explicit interface name, such as ``"ib0"`` - for InfiniBand or ``"eth0"`` if InfiniBand is disabled. - Default is ``""``, which will result in all available devices being used. - cuda_device_index: None or int - Index of the current GPU, which will be supplied to ``net_devices`` if - it is callable. - Default is ``None``. + requires ``enable_infiniband=True``. 
+ net_devices : str or callable, default "" + Interface(s) used by workers for UCX communication. Can be a string (like + ``"eth0"`` for NVLink, ``"mlx5_0:1"``/``"ib0"`` for InfiniBand, or ``""`` to use + all available devices), or a callable function that takes the index of the + current GPU to return an interface name (like + ``lambda dev: "mlx5_%d:1" % (dev // 2)``). + + .. note:: + If ``net_devices`` is callable, a GPU index must be supplied through + ``cuda_device_index``. + cuda_device_index : int or None, default None + Index of the current GPU, which must be specified for ``net_devices`` if + it is callable. Can be an integer or ``None`` if ``net_devices`` is not + callable. """ if create_cuda_context: diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 9e2e744ea..dc3c70d13 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -39,104 +39,111 @@ def __init__(self, *args, **kwargs): class LocalCUDACluster(LocalCluster): """A variant of ``dask.distributed.LocalCluster`` that uses one GPU per process. - This assigns a different ``CUDA_VISIBLE_DEVICES`` environment variable to each + This assigns a different ``CUDA_VISIBLE_DEVICES`` environment variable to each Dask worker process. - For machines with a complex architecture mapping CPUs, GPUs, and network - hardware, such as NVIDIA DGX-1 and DGX-2, this class creates a local - cluster that tries to respect this hardware as much as possible. + For machines with a complex architecture mapping CPUs, GPUs, and network hardware, + such as NVIDIA DGX-1 and DGX-2, this class creates a local cluster that tries to + respect this hardware as much as possible. - It creates one Dask worker process per GPU, and assigns each worker process - the correct CPU cores and Network interface cards to maximize performance. - If UCX and UCX-Py are also available, it's possible to use InfiniBand and - NVLink connections for optimal data transfer performance. + Each worker process is automatically assigned the correct CPU cores and network + interface cards to maximize performance. If UCX and UCX-Py are available, InfiniBand + and NVLink connections can be used to optimize data transfer performance. Parameters ---------- - CUDA_VISIBLE_DEVICES: str or list - String or list ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to - different GPUs. - device_memory_limit: int, float or str - Specifies the size of the CUDA device LRU cache, which is used to - determine when the worker starts spilling to host memory. This can be - a float (fraction of total device memory), an integer (bytes), a string - (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, ``0``, or ``None`` to disable - spilling to host (i.e., allow full device memory usage). Default is ``0.8``, - 80% of the worker's total device memory. - interface: str - The external interface used to connect to the scheduler, usually - an ethernet interface is used for connection, and not an InfiniBand - interface (if one is available). - threads_per_worker: int - Number of threads to be used for each CUDA worker process. - protocol: str - Protocol to use for communication, e.g., ``"tcp"`` or ``"ucx"``. - enable_tcp_over_ucx: bool - Set environment variables to enable TCP over UCX, even if InfiniBand - and NVLink are not supported or disabled. - enable_infiniband: bool - Set environment variables to enable UCX InfiniBand support, requires + CUDA_VISIBLE_DEVICES : str, list of int, or None, default None + GPUs to restrict activity to. 
Can be a string (like ``"0,1,2,3"``), list (like + ``[0, 1, 2, 3]``), or ``None`` to use all available GPUs. + n_workers : int or None, default None + Number of workers. Can be an integer or ``None`` to fall back on the GPUs + specified by ``CUDA_VISIBLE_DEVICES``. Will override the value of + ``CUDA_VISIBLE_DEVICES`` if specified. + threads_per_worker : int, default 1 + Number of threads to be used for each Dask worker process. + memory_limit : int, float, str, or None, default "auto" + Bytes of memory per process that the worker can use. Can be an integer (bytes), + float (fraction of total system memory), string (like ``"5GB"`` or ``"5000M"``), + or ``"auto"``, 0, or ``None`` for no memory management. + device_memory_limit : int, float, str, or None, default 0.8 + Size of the CUDA device LRU cache, which is used to determine when the worker + starts spilling to host memory. Can be an integer (bytes), float (fraction of + total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, 0, + or ``None`` to disable spilling to host (i.e. allow full device memory usage). + local_directory : str or None, default None + Path on local machine to store temporary files. Can be a string (like + ``"path/to/files"``) or ``None`` to fall back on the value of + ``dask.temporary-directory`` in the local Dask configuration, using the current + working directory if this is not set. + protocol : str or None, default None + Protocol to use for communication. Can be a string (like ``"tcp"`` or + ``"ucx"``), or ``None`` to automatically choose the correct protocol. + enable_tcp_over_ucx : bool, default False + Set environment variables to enable TCP over UCX, even if InfiniBand and NVLink + are not supported or disabled. + enable_infiniband : bool, default False + Set environment variables to enable UCX over InfiniBand, requires ``protocol="ucx"`` and implies ``enable_tcp_over_ucx=True``. - enable_rdmacm: bool + enable_nvlink : bool, default False + Set environment variables to enable UCX over NVLink, requires ``protocol="ucx"`` + and implies ``enable_tcp_over_ucx=True``. + enable_rdmacm : bool, default False Set environment variables to enable UCX RDMA connection manager support, requires ``protocol="ucx"`` and ``enable_infiniband=True``. - enable_nvlink: bool - Set environment variables to enable UCX NVLink support, requires - ``protocol="ucx"`` and implies ``enable_tcp_over_ucx=True``. - ucx_net_devices: None, callable or str - When ``None`` (default), ``"UCX_NET_DEVICES"`` will be left to its default. - If callable, the function must take exactly one argument (the index of - current GPU) that will be used to get the interface name, such as - ``lambda dev: "mlx5_%d:1" % (dev // 2)``, returning ``"mlx5_1:1"`` for - GPU 3, for example. If it's a string, it must be a non-empty string - with the interface name, such as ``"eth0"`` or ``"auto"`` to allow for - automatically choosing the closest interface based on the system's - topology. + ucx_net_devices : str, callable, or None, default None + Interface(s) used by workers for UCX communication. Can be a string (like + ``"eth0"`` for NVLink or ``"mlx5_0:1"``/``"ib0"`` for InfiniBand), a callable + function that takes the index of the current GPU to return an interface name + (like ``lambda dev: "mlx5_%d:1" % (dev // 2)``), ``"auto"`` (requires + ``enable_infiniband=True``) to pick the optimal interface per-worker + based on the system's topology, or ``None`` to stay with the default value of + ``"all"`` (use all available interfaces). .. 
warning:: - ``"auto"`` requires UCX-Py to be installed and compiled with hwloc - support. Additionally that will always use the closest interface, and - that may cause unexpected errors if that interface is not properly - configured or is disconnected, for that reason it's limited to - InfiniBand only and will still cause unpredictable errors if **all** - interfaces are not connected and properly configured. - rmm_pool_size: None, int or str - When ``None`` (default), no RMM pool is initialized. If a different value - is given, it can be an integer (bytes) or string (like ``"5GB"`` or - ``"5000M"``). + ``"auto"`` requires UCX-Py to be installed and compiled with hwloc support. + Unexpected errors can occur when using ``"auto"`` if any interfaces are + disconnected or improperly configured. + rmm_pool_size : int, str or None, default None + RMM pool size to initialize each worker with. Can be an integer (bytes), string + (like ``"5GB"`` or ``"5000M"``), or ``None`` to disable RMM pools. .. note:: - The size is a per worker (i.e., per GPU) configuration, and not - cluster-wide! - rmm_managed_memory: bool - If ``True``, initialize each worker with RMM and set it to use managed - memory. If ``False``, RMM may still be used if ``rmm_pool_size`` is specified, - but in that case with default (non-managed) memory type. + This size is a per-worker configuration, and not cluster-wide. + rmm_managed_memory : bool, default False + Initialize each worker with RMM and set it to use managed memory. If disabled, + RMM may still be used by specifying ``rmm_pool_size``. .. warning:: - Managed memory is currently incompatible with NVLink, trying to enable - both will result in an exception. + Managed memory is currently incompatible with NVLink. Trying to enable both + will result in an exception. rmm_async: bool, default False Initialize each worker withh RMM and set it to use RMM's asynchronous allocator. See ``rmm.mr.CudaAsyncMemoryResource`` for more info. - .. note:: + .. warning:: The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also - incompatible with RMM pools and managed memory, trying to enable both will + incompatible with RMM pools and managed memory. Trying to enable both will result in an exception. - rmm_log_directory: str - Directory to write per-worker RMM log files to; the client and scheduler - are not logged here. Logging will only be enabled if ``rmm_pool_size``, - ``rmm_managed_memory``, or ``rmm_async`` are specified. - jit_unspill: bool - If ``True``, enable just-in-time unspilling. This is experimental and doesn't - support memory spilling to disk. Please see ``proxy_object.ProxyObject`` and - ``proxify_host_file.ProxifyHostFile``. - log_spilling: bool - If True, all spilling operations will be logged directly to - distributed.worker with an INFO loglevel. This will eventually be - replaced by a Dask configuration flag. + rmm_log_directory : str or None, default None + Directory to write per-worker RMM log files to. The client and scheduler are not + logged here. Can be a string (like ``"/path/to/logs/"``) or ``None`` to + disable logging. + + .. note:: + Logging will only be enabled if ``rmm_pool_size`` is specified or + ``rmm_managed_memory=True``. + jit_unspill : bool or None, default None + Enable just-in-time unspilling. Can be a boolean or ``None`` to fall back on + the value of ``dask.jit-unspill`` in the local Dask configuration, disabling + unspilling if this is not set. + .. note:: + This is experimental and doesn't support memory spilling to disk. 
See + ``proxy_object.ProxyObject`` and ``proxify_host_file.ProxifyHostFile`` for + more info. + log_spilling : bool, default True + Enable logging of spilling operations directly to ``distributed.Worker`` with an + ``INFO`` log level. Examples -------- @@ -148,8 +155,7 @@ class LocalCUDACluster(LocalCluster): Raises ------ TypeError - If ``enable_infiniband`` or ``enable_nvlink`` is ``True`` and protocol is not - ``"ucx"``. + If InfiniBand or NVLink are enabled and ``protocol!="ucx"``. ValueError If ``ucx_net_devices=""``, if NVLink and RMM managed memory are both enabled, if RMM pools / managed memory and asynchronous allocator are both @@ -165,11 +171,12 @@ class LocalCUDACluster(LocalCluster): def __init__( self, + CUDA_VISIBLE_DEVICES=None, n_workers=None, threads_per_worker=1, memory_limit="auto", device_memory_limit=0.8, - CUDA_VISIBLE_DEVICES=None, + data=None, local_directory=None, protocol=None, enable_tcp_over_ucx=False, diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index d15704078..0b3ec54ea 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -193,7 +193,7 @@ async def test_cupy_cluster_device_spill(params): cupy = pytest.importorskip("cupy") with dask.config.set({"distributed.worker.memory.terminate": False}): async with LocalCUDACluster( - 1, + n_workers=1, scheduler_port=0, silence_logs=False, dashboard_address=None, diff --git a/docs/requirements.txt b/docs/requirements.txt index cef8580f6..3287f78c8 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ dask_cuda numpydoc==1.1.0 +sphinx_click==2.7.1 sphinx_rtd_theme==0.5.1 \ No newline at end of file diff --git a/docs/source/api.rst b/docs/source/api.rst index 1fd4e1b4a..881e9fc85 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -1,13 +1,19 @@ API === -Setup ------- -.. currentmodule:: dask_cuda.initialize -.. autofunction:: initialize - Cluster ------- .. currentmodule:: dask_cuda .. autoclass:: LocalCUDACluster :members: + +Worker +------ +.. click:: dask_cuda.cli.dask_cuda_worker:main + :prog: dask-cuda-worker + :nested: none + +Client initialization +--------------------- +.. currentmodule:: dask_cuda.initialize +.. autofunction:: initialize diff --git a/docs/source/conf.py b/docs/source/conf.py index 26a174cfc..be7a5ec2b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -48,6 +48,8 @@ "sphinx.ext.intersphinx", "sphinx.ext.extlinks", "numpydoc", + "sphinx_click", + "sphinx_rtd_theme", ] numpydoc_show_class_members = False diff --git a/docs/source/examples/spilling.rst b/docs/source/examples/spilling.rst new file mode 100644 index 000000000..29a9add44 --- /dev/null +++ b/docs/source/examples/spilling.rst @@ -0,0 +1,32 @@ +Spilling from device +==================== + +By default, Dask-CUDA enables spilling from GPU to host memory when a GPU reaches a memory utilization of 80%. +This can be changed to suit the needs of a workload, or disabled altogether, by explicitly setting ``device_memory_limit``. +This parameter accepts an integer or string memory size, or a float representing a percentage of the GPU's total memory: + +.. 
code-block:: python
+
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(device_memory_limit=50000)  # spilling after 50000 bytes
+    cluster = LocalCUDACluster(device_memory_limit="5GB")  # spilling after 5 GB
+    cluster = LocalCUDACluster(device_memory_limit=0.3)  # spilling after 30% memory utilization
+
+Memory spilling can be disabled by setting ``device_memory_limit`` to 0:
+
+.. code-block:: python
+
+    cluster = LocalCUDACluster(device_memory_limit=0)  # spilling disabled
+
+The same applies for ``dask-cuda-worker``, and spilling can be controlled by setting ``--device-memory-limit``:
+
+.. code-block::
+
+    $ dask-scheduler
+    distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786
+
+    $ dask-cuda-worker --device-memory-limit 50000
+    $ dask-cuda-worker --device-memory-limit 5GB
+    $ dask-cuda-worker --device-memory-limit 0.3
+    $ dask-cuda-worker --device-memory-limit 0
\ No newline at end of file
diff --git a/docs/source/examples/ucx.rst b/docs/source/examples/ucx.rst
new file mode 100644
index 000000000..77b12ce65
--- /dev/null
+++ b/docs/source/examples/ucx.rst
@@ -0,0 +1,92 @@
+Enabling UCX communication
+==========================
+
+A CUDA cluster using UCX communication can be started automatically with LocalCUDACluster or manually with the ``dask-cuda-worker`` CLI tool.
+In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same Dask UCX configuration; see `UCX Integration -- Configuration <../ucx.html#configuration>`_ for details on all available options.
+
+LocalCUDACluster
+----------------
+
+When using LocalCUDACluster with UCX communication, all required UCX configuration is handled through arguments supplied at construction; see `API -- Cluster <../api.html#cluster>`_ for a complete list of these arguments.
+To connect a client to a cluster with all supported transports and an RMM pool:
+
+.. code-block:: python
+
+    from dask.distributed import Client
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(
+        protocol="ucx",
+        interface="ib0",
+        enable_tcp_over_ucx=True,
+        enable_nvlink=True,
+        enable_infiniband=True,
+        enable_rdmacm=True,
+        ucx_net_devices="auto",
+        rmm_pool_size="1GB"
+    )
+    client = Client(cluster)
+
+dask-cuda-worker
+----------------
+
+When using ``dask-cuda-worker`` with UCX communication, the scheduler, workers, and client must all be started manually, each using the same UCX configuration.
+
+Scheduler
+^^^^^^^^^
+
+UCX configuration options will need to be specified for ``dask-scheduler`` as environment variables; see `Dask Configuration -- Environment Variables `_ for more details on the mapping between environment variables and options.
+
+To start a Dask scheduler using UCX with all supported transports and a 1 gigabyte RMM pool:
+
+.. code-block:: bash
+
+    $ DASK_UCX__CUDA_COPY=True \
+    > DASK_UCX__TCP=True \
+    > DASK_UCX__NVLINK=True \
+    > DASK_UCX__INFINIBAND=True \
+    > DASK_UCX__RDMACM=True \
+    > DASK_UCX__NET_DEVICES=mlx5_0:1 \
+    > DASK_RMM__POOL_SIZE=1GB \
+    > dask-scheduler --protocol ucx --interface ib0
+
+Note the specification of ``"mlx5_0:1"`` as our UCX net device; because the scheduler does not rely upon Dask-CUDA, it cannot automatically detect InfiniBand interfaces, so we must specify one explicitly.
+We communicate to the scheduler that we will be using UCX with the ``--protocol`` option, and that we will be using InfiniBand with the ``--interface`` option.
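+
+For reference, a scheduler using only NVLink and TCP over UCX can be started in the same way; the sketch below assumes an ethernet interface named ``eth0``, and omits the net device specification, which is generally only needed for InfiniBand:
+
+.. code-block:: bash
+
+    $ DASK_UCX__CUDA_COPY=True \
+    > DASK_UCX__TCP=True \
+    > DASK_UCX__NVLINK=True \
+    > DASK_RMM__POOL_SIZE=1GB \
+    > dask-scheduler --protocol ucx --interface eth0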
+ +Workers +^^^^^^^ + +All UCX configuration options have analogous options in ``dask-cuda-worker``; see `API -- Worker <../api.html#worker>`_ for a complete list of these options. +To start a cluster with all supported transports and an RMM pool: + +.. code-block:: bash + + $ dask-cuda-worker ucx://:8786 \ + > --enable-tcp-over-ucx \ + > --enable-nvlink \ + > --enable-infiniband \ + > --enable-rdmacm \ + > --net-devices="auto" \ + > --rmm-pool-size="1GB" + +Client +^^^^^^ + +A client can be configured to use UCX by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as LocalCUDACluster and adds them to the current Dask configuration used when creating it; see `API -- Client initialization <../api.html#client-initialization>`_ for a complete list of arguments. +To connect a client to the cluster we have made: + +.. code-block:: python + + from dask.distributed import Client + from dask_cuda.initialize import initialize + + initialize( + enable_tcp_over_ucx=True, + enable_nvlink=True, + enable_infiniband=True, + enable_rdmacm=True, + net_devices="mlx5_0:1", + ) + client = Client("ucx://:8786") + +Note again the specification of ``"mlx5_0:1"`` as our UCX net device, due to the fact that the client does not support automatic detection of InfiniBand interfaces. diff --git a/docs/source/examples/worker_count.rst b/docs/source/examples/worker_count.rst new file mode 100644 index 000000000..29c6502c0 --- /dev/null +++ b/docs/source/examples/worker_count.rst @@ -0,0 +1,47 @@ +Controlling number of workers +============================= + +Users can restrict activity to specific GPUs by explicitly setting ``CUDA_VISIBLE_DEVICES``; for a LocalCUDACluster, this can provided as a keyword argument. +For example, to restrict activity to the first two indexed GPUs: + +.. code-block:: python + + from dask_cuda import LocalCUDACluster + + cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0,1") + +LocalCUDACluster can also take an ``n_workers`` argument, which will restrict activity to the first N GPUs listed in ``CUDA_VISIBLE_DEVICES``. +This argument can be used on its own or in conjunction with ``CUDA_VISIBLE_DEVICES``: + +.. code-block:: python + + cluster = LocalCUDACluster(n_workers=2) # will use GPUs 0,1 + cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="3,4,5", n_workers=2) # will use GPUs 3,4 + +When using ``dask-cuda-worker``, ``CUDA_VISIBLE_DEVICES`` must be provided as an environment variable: + +.. code-block:: bash + + $ dask-scheduler + distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 + + $ CUDA_VISIBLE_DEVICES=0,1 dask-cuda-worker 127.0.0.1:8786 + +GPUs can also be selected by their UUIDs, which can be acquired using `NVIDIA System Management Interface `_: + +.. code-block:: bash + + $ nvidia-smi -L + GPU 0: Tesla V100-SXM2-32GB (UUID: GPU-dae76d0e-3414-958a-8f3e-fc6682b36f31) + GPU 1: Tesla V100-SXM2-32GB (UUID: GPU-60f2c95a-c564-a078-2a14-b4ff488806ca) + +These UUIDs can then be passed to ``CUDA_VISIBLE_DEVICES`` in place of a GPU index: + +.. code-block:: python + + cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="GPU-dae76d0e-3414-958a-8f3e-fc6682b36f31") + +.. code-block:: bash + + $ CUDA_VISIBLE_DEVICES="GPU-dae76d0e-3414-958a-8f3e-fc6682b36f31" \ + > dask-cuda-worker 127.0.0.1:8786 diff --git a/docs/source/index.rst b/docs/source/index.rst index 507e30665..6795267b2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,16 +1,41 @@ Dask-CUDA ========= -Dask-CUDA is tool for using `Dask `_ on GPUs. 
It extends Dask's `Single-Machine Cluster `_ and `Workers `_ for optimized distributed GPU workloads. +Dask-CUDA is a library extending `Dask.distributed `_'s single-machine `LocalCluster `_ and `Worker `_ for use in distributed GPU workloads. +It is a part of the `RAPIDS `_ suite of open-source software libraries for GPU-accelerated data science. +Motivation +---------- +While Distributed can be used to leverage GPU workloads through libraries such as `cuDF `_, `CuPy `_, and `Numba `_, Dask-CUDA offers several unique features unavailable to Distributed: + +- **Automatic instantiation of per-GPU workers** -- Using Dask-CUDA's LocalCUDACluster or ``dask-cuda-worker`` CLI will automatically launch one worker for each GPU available on the executing node, avoiding the need to explicitly select GPUs. +- **Automatic setting of CPU affinity** -- The setting of CPU affinity for each GPU is done automatically, preventing memory transfers from taking suboptimal paths. +- **Automatic selection of InfiniBand devices** -- When UCX communication is enabled over InfiniBand, Dask-CUDA automatically selects the optimal InfiniBand device for each GPU (see `UCX Integration `_ for instructions on configuring UCX communication). +- **Memory spilling from GPU** -- For memory-intensive workloads, Dask-CUDA supports spilling from GPU to host memory when a GPU reaches the default or user-specified memory utilization limit. +- **Allocation of GPU memory** -- when using UCX communication, per-GPU memory pools can be allocated using `RAPIDS Memory Manager `_ to circumvent the costly memory buffer mappings that would be required otherwise. + +Contents +-------- .. toctree:: :maxdepth: 1 - :hidden: + :caption: Getting Started + install quickstart - specializations - worker - ucx api + +.. toctree:: + :maxdepth: 1 + :caption: Additional Features + + ucx + +.. toctree:: + :maxdepth: 1 + :caption: Examples + + examples/worker_count + examples/spilling + examples/ucx diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 000000000..eb303346c --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,50 @@ +Installation +============ + +Dask-CUDA can be installed using ``conda``, ``pip``, or from source. + +Conda +----- + +To use Dask-CUDA on your system, you will need: + +- NVIDIA drivers for your GPU; see `NVIDIA Driver Installation Quickstart Guide `_ for installation instructions +- A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility `_ for an overview of CUDA Toolkit driver requirements + +Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``. +To install the latest version of Dask-CUDA along with CUDA Toolkit 11.0: + +.. code-block:: bash + + conda install -c rapidsai -c nvidia -c conda-forge dask-cuda cudatoolkit=11.0 + +Pip +--- + +When working outside of a Conda environment, CUDA Toolkit can be downloaded and installed from `NVIDIA's website `_; this package also contains the required NVIDIA drivers. +To install the latest version of Dask-CUDA: + +.. code-block:: bash + + python -m pip install dask-cuda + +Source +------ + +To install Dask-CUDA from source, the source code repository must be cloned from GitHub: + +.. code-block:: bash + + git clone https://github.com/rapidsai/dask-cuda.git + cd dask-cuda + python -m pip install . 
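+
+Regardless of the installation method, a quick sanity check (a minimal sketch; it assumes the ``python`` on your path is the one the package was installed into) is to import Dask-CUDA and print its version:
+
+.. code-block:: bash
+
+    python -c "import dask_cuda; print(dask_cuda.__version__)"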
+ +Other RAPIDS libraries +---------------------- + +Dask-CUDA is a part of the `RAPIDS `_ suite of open-source software libraries for GPU-accelerated data science, and works well in conjunction with them. +See `RAPIDS -- Getting Started `_ for instructions on how to install these libraries. +Keep in mind that these libraries will require: + +- At least one CUDA-compliant GPU +- A system installation of `CUDA `_ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 431065f7b..ce9ea2f21 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -1,27 +1,37 @@ Quickstart ========== +A Dask-CUDA cluster can be created using either LocalCUDACluster or ``dask-cuda-worker`` from the command line. -Setup ------ +LocalCUDACluster +---------------- +To create a Dask-CUDA cluster using all available GPUs and connect a Dask.distributed `Client `_ to it: -:: +.. code-block:: python + + from dask_cuda import LocalCUDACluster + from dask.distributed import Client + + cluster = LocalCUDACluster() + client = Client(cluster) - conda create -n dask-cuda -c rapidsai -c nvidia -c conda-forge \ - cudatoolkit= cudf dask-cuda distributed python=3.7 +dask-cuda-worker +---------------- -Creating a Dask-CUDA Cluster ----------------------------- +To create an equivalent cluster from the command line, Dask-CUDA workers must be connected to a scheduler started with ``dask-scheduler``: -Notebook -~~~~~~~~ +.. code-block:: bash + + $ dask-scheduler + distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 + + $ dask-cuda-worker 127.0.0.1:8786 + +To connect a client to this cluster: .. code-block:: python - from dask_cuda import LocalCUDACluster from dask.distributed import Client - # Create a Dask Cluster with one worker per GPU - cluster = LocalCUDACluster() - client = Client(cluster) \ No newline at end of file + client = Client("127.0.0.1:8786") diff --git a/docs/source/specializations.rst b/docs/source/specializations.rst deleted file mode 100644 index d97b6fc56..000000000 --- a/docs/source/specializations.rst +++ /dev/null @@ -1,40 +0,0 @@ -Specializations for GPU Usage -============================= - -It is known that main line Dask and Distributed packages can be used to leverage GPU computing, utilizing libraries such as cuDF, CuPy and Numba. So why use Dask-CUDA instead? This section aims to answer this question. - -Automatic Instantiation of One-Worker-Per-GPU ---------------------------------------------- - -Using the ``dask-cuda-worker`` or ``LocalCUDACluster`` will automatically launch one worker for each GPU available on the node from where it was executed, avoiding the need for users to select GPUs in their application and thus reducing code complexity. - -Controlling Number of Workers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Users can control the number of workers by explicitly defining the `environment variable `_ ``CUDA_VISIBLE_DEVICES`` when using the dask-cuda-worker CLI. When using ``LocalCUDACluster`` the environment variable ``CUDA_VISIBLE_DEVICES`` or keyword argument with same name can be used. If both keyword and environment variables are set, the environment variable will be ignored in favor of the argument. For example, the following will launch 3 workers on devices, 1, 2, and 3: - -.. code-block:: bash - - LocalCUDACluster(CUDA_VISIBLE_DEVICES='0,1,2') - or - CUDA_VISIBLE_DEVICES=0,1,2 dask-cuda-worker - -Users can also use UUID of the device as an inputs to ``CUDA_VISIBLE_DEVICES``. 
UUIDs should begin with the `prefix 'GPU-' or 'MIG-GPU' `_ : `GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005`, for example. - - -Spilling From Device --------------------- - -For applications that do not fit in GPU memory, Dask-CUDA supports spilling from device memory to host memory when the GPU can't fit more data. The spilling mechanism is automatically triggered once the user-defined limit is reached, such limit can be set via the ``--device-memory-limit`` and ``device_memory_limit`` arguments for ``dask-cuda-worker`` and ``LocalCUDACluster``, respectively. - -Previously, spilling was disabled by default, but since Dask-CUDA 0.17 the default has been changed to ``0.8`` -- spilling will begin when Dask-CUDA device memory utilization reaches 80% of the device's total memory. Behavior can configured with ``--device-memory-limit`` flag. Users can disable spilling by setting ``--device-memory-limit=0`` or ``device_memory_limit=0``. - -CPU Affinity ------------- - -To improve performance, setting CPU affinity for each GPU is automatically done, preventing memory transfers from taking sub-optimal paths. - -Automatic Selection of InfiniBand Device ----------------------------------------- - -When InfiniBand is activated, Dask-CUDA is also capable of selecting the topologically closest InfiniBand device to each GPU, thus ensuring optimal path and improving performance even further by using GPU Remote Direct Memory Access (RDMA) when available. See the :doc:`UCX ` page for more details. diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index f79e9f2dd..95358dd52 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -2,21 +2,17 @@ UCX Integration =============== Communication can be a major bottleneck in distributed systems. -Dask-CUDA addresses this by supporting integration with `UCX `_, an optimized communication framework that provides high-performance networking and supports a variety of transport methods, including `NVLink `_ and `Infiniband `_ for systems with specialized hardware, and TCP for systems without it. +Dask-CUDA addresses this by supporting integration with `UCX `_, an optimized communication framework that provides high-performance networking and supports a variety of transport methods, including `NVLink `_ and `InfiniBand `_ for systems with specialized hardware, and TCP for systems without it. This integration is enabled through `UCX-Py `_, an interface that provides Python bindings for UCX. - -Requirements ------------- - -Hardware -^^^^^^^^ +Hardware requirements +--------------------- To use UCX with NVLink or InfiniBand, relevant GPUs must be connected with NVLink bridges or NVIDIA Mellanox InfiniBand Adapters, respectively. NVIDIA provides comparison charts for both `NVLink bridges `_ and `InfiniBand adapters `_. -Software -^^^^^^^^ +Software requirements +--------------------- UCX integration requires an environment with both UCX and UCX-Py installed; see `UCX-Py Installation `_ for detailed instructions on this process. @@ -25,7 +21,7 @@ For this reason, it is strongly recommended to use `RAPIDS Memory Manager (RMM) A memory pool also prevents the Dask scheduler from deserializing CUDA data, which will cause a crash. Configuration -^^^^^^^^^^^^^ +------------- In addition to installations of UCX and UCX-Py on your system, several options must be specified within your Dask configuration to enable the integration. 
Typically, these will affect ``UCX_TLS`` and ``UCX_SOCKADDR_TLS_PRIORITY``, environment variables used by UCX to decide what transport methods to use and which to prioritize, respectively. @@ -69,118 +65,11 @@ However, some will affect related libraries, such as RMM: It is recommended to set the pool size to at least the minimum amount of memory used by the process; if possible, one can map all GPU memory to a single pool, to be utilized for the lifetime of the process. .. note:: - These options can be used with mainline Dask/Distributed. + These options can be used with mainline Dask.distributed. However, some features are exclusive to Dask-CUDA, such as the automatic detection of InfiniBand interfaces. - See :doc:`Specializations for GPU Usage ` for more details on the benefits of using Dask-CUDA. - + See `Dask-CUDA -- Motivation `_ for more details on the benefits of using Dask-CUDA. Usage ----- -Dask-CUDA workers using UCX communication can be started manually with the ``dask-cuda-worker`` CLI tool or automatically with ``LocalCUDACluster``. -In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same UCX configuration. - -dask-cuda-worker -^^^^^^^^^^^^^^^^ - -A Dask cluster with UCX support can be started using the ``dask-cuda-worker`` CLI tool with a Dask scheduler which has been configured for UCX. -This must be used for cases where a multi-node cluster is needed, as ``LocalCUDACluster`` will only start single-node clusters. - -Scheduler -""""""""" - -UCX configuration options will need to be specified for ``dask-scheduler`` as environment variables; see `Dask Configuration `_ for more details on the mapping between environment variables and options. - -To start a Dask scheduler using UCX with all supported transports and a 1 gigabyte RMM pool: - -.. code-block:: bash - - DASK_UCX__CUDA_COPY=True \ - DASK_UCX__TCP=True \ - DASK_UCX__NVLINK=True \ - DASK_UCX__INFINIBAND=True \ - DASK_UCX__RDMACM=True \ - DASK_UCX__NET_DEVICES=mlx5_0:1 \ - DASK_RMM__POOL_SIZE=1GB \ - dask-scheduler --protocol ucx --interface ib0 - -Note the specification of ``mlx5_0:1`` as our UCX net device; because the scheduler does not rely upon Dask-CUDA, it cannot automatically detect InfiniBand interfaces, so we must specify one explicitly. -We communicate to the scheduler that we will be using UCX with the ``--protocol`` option, and that we will be using InfiniBand with the ``--interface`` option. - -To start the same Dask scheduler as above but only using NVLink: - -.. code-block:: bash - - DASK_UCX__CUDA_COPY=True \ - DASK_UCX__TCP=True \ - DASK_UCX__NVLINK=True \ - DASK_RMM__POOL_SIZE=1GB \ - dask-scheduler --protocol ucx --interface eth0 - -Note that we no longer specify a net device, as this generally can be skipped when using a non-InfiniBand interface. - -Workers -""""""" - -All the relevant Dask configuration options for UCX have analogous parameters in ``dask-cuda-worker``; see :doc:`Worker ` for a complete list of these options. - -To start workers with all supported transports and a 1 gigabyte RMM pool: - -.. code-block:: bash - - dask-cuda-worker ucx://:8786 \ - --enable-tcp-over-ucx \ - --enable-nvlink \ - --enable-infiniband \ - --enable-rdmacm \ - --net-devices="auto" \ - --rmm-pool-size="1GB" - -Client -"""""" - -The UCX configurations used by the scheduler and client must be the same. 
-This can be ensured by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as ``LocalCUDACluster`` and adds them to the current Dask configuration used when creating the client; see the :doc:`API reference ` for a complete list of arguments. - -To connect a client to a cluster with all supported transports: - -.. code-block:: python - - from dask.distributed import Client - from dask_cuda.initialize import initialize - - initialize( - enable_tcp_over_ucx=True, - enable_nvlink=True, - enable_infiniband=True, - enable_rdmacm=True, - net_devices="mlx5_0:1", - ) - client = Client("ucx://:8786") - -Note the specification of ``"mlx5_0:1"`` as our net device; because the scheduler and client do not rely upon Dask-CUDA, they cannot automatically detect InfiniBand interfaces, so we must specify one explicitly. - -LocalCUDACluster -^^^^^^^^^^^^^^^^ - -All options available to ``dask-cuda-worker`` are also available as arguments for ``LocalCUDACluster``; see the :doc:`API reference ` for a complete list of arguments. -When creating a ``LocalCUDACluster``, ``dask_cuda.initialize`` is run automatically to ensure the Dask configuration is consistent with the cluster, so that a client can be connected to the cluster with no additional setup. - -To start a cluster and client with all supported transports and a 1 gigabyte RMM pool: - -.. code-block:: python - - from dask.distributed import Client - from dask_cuda import LocalCUDACluster - - cluster = LocalCUDACluster( - protocol="ucx", - interface="ib0", # passed to the scheduler - enable_tcp_over_ucx=True, - enable_nvlink=True, - enable_infiniband=True, - enable_rdmacm=True, - ucx_net_devices="auto", - rmm_pool_size="1GB" - ) - client = Client(cluster) \ No newline at end of file +See `Enabling UCX communication `_ for examples of UCX usage with different supported transports. \ No newline at end of file diff --git a/docs/source/worker.rst b/docs/source/worker.rst deleted file mode 100644 index 875ec07c0..000000000 --- a/docs/source/worker.rst +++ /dev/null @@ -1,82 +0,0 @@ -Worker -====== - -Dask-CUDA workers extend the standard Dask worker in two ways: - -1) Advanced networking configuration -2) GPU Memory Pool configuration - -These configurations can be defined in the single cluster use case with ``LocalCUDACluster`` or passed to workers on the cli with ``dask-cuda-worker`` - -Single Cluster configuration ----------------------------- -Dask-CUDA can be configured for single machine clusters with multiple GPUs such as as DGX1 or DGX2. Below is an example of configuring a single machine Dask cluster on a DGX2 with an RMM pool and NVLink enabled - -.. 
code-block:: python - - from dask.distributed import Client - from dask_cuda import LocalCUDACluster - from dask_cuda.initialize import initialize - - # Configurations - protocol = "ucx" - interface = "enp6s0" # DGX-2 - enable_tcp_over_ucx = True - enable_nvlink = True - enable_infiniband = False - - initialize( - create_cuda_context=True, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_infiniband=enable_infiniband, - enable_nvlink=enable_nvlink, - ) - - cluster = LocalCUDACluster(local_directory="/tmp/USERNAME", - protocol=protocol, - interface=interface, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_infiniband=enable_infiniband, - enable_nvlink=enable_nvlink, - rmm_pool_size="25GB", - ) - client = Client(cluster) - - -Command Line Tool ------------------ - -New configuration options:: - - --interface TEXT The external interface used to connect to - the scheduler, usually an ethernet interface - is used for connection, and not an - InfiniBand interface (if one is available). - - --device-memory-limit TEXT Bytes of memory per CUDA device that the - worker can use. This can be an integer - (bytes), float (fraction of total device - memory), string (like 5GB or 5000M), 'auto', - or zero for no memory management (i.e., - allow full device memory usage). - - --rmm-pool-size TEXT If specified, initialize each worker with an - RMM pool of the given size, otherwise no RMM - pool is created. This can be an integer - (bytes) or string (like 5GB or 5000M). - NOTE: This size is a per worker (i.e., per - GPU) configuration, and not cluster-wide! - - --enable-tcp-over-ucx / --disable-tcp-over-ucx - Enable TCP communication over UCX - --enable-infiniband / --disable-infiniband - Enable InfiniBand communication - --enable-nvlink / --disable-nvlink - Enable NVLink communication - --net-devices TEXT When None (default), 'UCX_NET_DEVICES' will - be left to its default. Otherwise, it must - be a non-empty string with the interface - name. Normally used only with --enable- - infiniband to specify the interface to be - used by the worker, such as 'mlx5_0:1' or - 'ib0'.