From 72029bcc23c10b35a4f34da4651b5dcec15d2eb8 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 10 Mar 2021 03:26:20 -0500 Subject: [PATCH] Improve UCX documentation and examples (#545) Addressing #544, this PR aims to clarify the requirements, configuration, and usage of UCX with Dask-CUDA. Still a lot to be done: - [x] Flesh out hardware/software requirements - [x] Rework CLI/Python usage examples - [x] Clarify some uncertainties in the Configuration section - [ ] Add standalone examples of UCX usage Authors: - Charles Blackmon-Luca (@charlesbluca) Approvers: - Peter Andreas Entschev (@pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/545 --- dask_cuda/initialize.py | 87 ++++++++++---- dask_cuda/local_cuda_cluster.py | 72 ++++++----- docs/requirements.txt | 3 + docs/source/api.rst | 15 +-- docs/source/ucx.rst | 206 ++++++++++++++++++++------------ 5 files changed, 239 insertions(+), 144 deletions(-) create mode 100644 docs/requirements.txt diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index e53c10de0..d47dda268 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -1,28 +1,3 @@ -""" -This initialization scripts will create CUDA context and initialize UCX-Py, -depending on user parameters. - -It is sometimes convenient to initialize the CUDA context, particularly before -starting up Dask workers which create a variety of threads. - -To ensure UCX works correctly, it is important to ensure it is initialized with -the correct options. This is important for scheduler, workers and client. This -initialization script will ensure that based on the flags and options passed by -the user. - -This module is intended to be used within a Worker preload script. -https://docs.dask.org/en/latest/setup/custom-startup.html - -You can add it to your global config with the following yaml - - distributed: - worker: - preload: - - dask_cuda.initialize_ucx - -See https://docs.dask.org/en/latest/configuration.html for more information -about Dask configuration. -""" import logging import click @@ -44,6 +19,68 @@ def initialize( net_devices="", cuda_device_index=None, ): + """Create CUDA context and initialize UCX-Py, depending on user parameters. + + Sometimes it is convenient to initialize the CUDA context, particularly before + starting up Dask workers which create a variety of threads. + + To ensure UCX works correctly, it is important to ensure it is initialized with + the correct options. This is especially important for the client, which cannot be + configured to use UCX with arguments like ``LocalCUDACluster`` and + ``dask-cuda-worker``. This function will ensure that they are provided a UCX + configuration based on the flags and options passed by the user. + + This function can also be used within a worker preload script for UCX configuration + of mainline Dask/Distributed. + https://docs.dask.org/en/latest/setup/custom-startup.html + + You can add it to your global config with the following YAML: + + .. code-block:: yaml + + distributed: + worker: + preload: + - dask_cuda.initialize + + See https://docs.dask.org/en/latest/configuration.html for more information + about Dask configuration. + + Parameters + ---------- + create_cuda_context: bool + Create CUDA context on initialization. + Default is ``True``. + enable_tcp_over_ucx: bool + Set environment variables to enable TCP over UCX, even if InfiniBand + and NVLink are not supported or disabled. + Default is ``False``. 
+ enable_infiniband: bool + Set environment variables to enable UCX InfiniBand support, implies + ``enable_tcp_over_ucx=True``. + Default is ``False``. + enable_nvlink: bool + Set environment variables to enable UCX NVLink support, implies + ``enable_tcp_over_ucx=True``. + Default is ``False``. + enable_rdmacm: bool + Set environment variables to enable UCX RDMA connection manager support, + implies ``enable_infiniband=True``. + Default is ``False``. + net_devices: callable or str + If callable, the function must take exactly one argument (the index of + current GPU) that will be used to get the interface name, such as + ``lambda dev: "mlx5_%d:1" % (dev // 2)``, which would return + ``"mlx5_1:1"`` for GPU 3. + If a string, must be an explicit interface name, such as ``"ib0"`` + for InfiniBand or ``"eth0"`` if InfiniBand is disabled. + Default is ``""``, which will result in all available devices being used. + cuda_device_index: None or int + Index of the current GPU, which will be supplied to ``net_devices`` if + it is callable. + Default is ``None``. + """ + if create_cuda_context: try: numba.cuda.current_context() diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index aa289ced5..34415fc4e 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -23,9 +23,9 @@ class LocalCUDACluster(LocalCluster): - """A variant of LocalCluster that uses one GPU per process + """A variant of ``dask.distributed.LocalCluster`` that uses one GPU per process. - This assigns a different CUDA_VISIBLE_DEVICES environment variable to each + This assigns a different ``CUDA_VISIBLE_DEVICES`` environment variable to each worker process. For machines with a complex architecture mapping CPUs, GPUs, and network @@ -46,9 +46,9 @@ class LocalCUDACluster(LocalCluster): Specifies the size of the CUDA device LRU cache, which is used to determine when the worker starts spilling to host memory. This can be a float (fraction of total device memory), an integer (bytes), a string - (like 5GB or 5000M), and "auto", 0 or None to disable spilling to - host (i.e., allow full device memory usage). Default is 0.8, 80% of the - worker's total device memory. + (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, ``0``, or ``None`` to disable + spilling to host (i.e., allow full device memory usage). Default is ``0.8``, + 80% of the worker's total device memory. interface: str The external interface used to connect to the scheduler, usually an ethernet interface is used for connection, and not an InfiniBand @@ -56,53 +56,60 @@ class LocalCUDACluster(LocalCluster): threads_per_worker: int Number of threads to be used for each CUDA worker process. protocol: str - Protocol to use for communication, e.g., "tcp" or "ucx". + Protocol to use for communication, e.g., ``"tcp"`` or ``"ucx"``. enable_tcp_over_ucx: bool Set environment variables to enable TCP over UCX, even if InfiniBand and NVLink are not supported or disabled. enable_infiniband: bool Set environment variables to enable UCX InfiniBand support, requires - protocol='ucx' and implies enable_tcp_over_ucx=True. + ``protocol="ucx"`` and implies ``enable_tcp_over_ucx=True``. enable_rdmacm: bool Set environment variables to enable UCX RDMA connection manager support, - requires protocol='ucx' and enable_infiniband=True. + requires ``protocol="ucx"`` and ``enable_infiniband=True``. enable_nvlink: bool Set environment variables to enable UCX NVLink support, requires - protocol='ucx' and implies enable_tcp_over_ucx=True. 
+ ``protocol="ucx"`` and implies ``enable_tcp_over_ucx=True``. ucx_net_devices: None, callable or str - When None (default), 'UCX_NET_DEVICES' will be left to its default. + When ``None`` (default), ``"UCX_NET_DEVICES"`` will be left to its default. If callable, the function must take exactly one argument (the index of current GPU) that will be used to get the interface name, such as ``lambda dev: "mlx5_%d:1" % (dev // 2)``, returning ``"mlx5_1:1"`` for GPU 3, for example. If it's a string, it must be a non-empty string - with the interface name, such as "eth0" or "auto" to allow for + with the interface name, such as ``"eth0"`` or ``"auto"`` to allow for automatically choosing the closest interface based on the system's topology. - WARNING: 'auto' requires UCX-Py to be installed and compiled with hwloc - support. Additionally that will always use the closest interface, and - that may cause unexpected errors if that interface is not properly - configured or is disconnected, for that reason it's limited to - InfiniBand only and will still cause unpredictable errors if not _ALL_ - interfaces are connected and properly configured. + + .. warning:: + ``"auto"`` requires UCX-Py to be installed and compiled with hwloc + support. Additionally that will always use the closest interface, and + that may cause unexpected errors if that interface is not properly + configured or is disconnected, for that reason it's limited to + InfiniBand only and will still cause unpredictable errors if **all** + interfaces are not connected and properly configured. rmm_pool_size: None, int or str - When None (default), no RMM pool is initialized. If a different value - is given, it can be an integer (bytes) or string (like 5GB or 5000M). - NOTE: The size is a per worker (i.e., per GPU) configuration, and - not cluster-wide! + When ``None`` (default), no RMM pool is initialized. If a different value + is given, it can be an integer (bytes) or string (like ``"5GB"`` or + ``"5000M"``). + + .. note:: + The size is a per worker (i.e., per GPU) configuration, and not + cluster-wide! rmm_managed_memory: bool If ``True``, initialize each worker with RMM and set it to use managed - memory. If ``False``, RMM may still be used if `rmm_pool_size` is specified, + memory. If ``False``, RMM may still be used if ``rmm_pool_size`` is specified, but in that case with default (non-managed) memory type. - WARNING: managed memory is currently incompatible with NVLink, trying - to enable both will result in an exception. + + .. warning:: + Managed memory is currently incompatible with NVLink, trying to enable + both will result in an exception. rmm_log_directory: str Directory to write per-worker RMM log files to; the client and scheduler - are not logged here. Logging will only be enabled if `rmm_pool_size` or - `rmm_managed_memory` are specified. + are not logged here. Logging will only be enabled if ``rmm_pool_size`` or + ``rmm_managed_memory`` are specified. jit_unspill: bool - If True, enable just-in-time unspilling. This is experimental and doesn't - support memory spilling to disk. Please see proxy_object.ProxyObject and - proxify_host_file.ProxifyHostFile. + If ``True``, enable just-in-time unspilling. This is experimental and doesn't + support memory spilling to disk. Please see ``proxy_object.ProxyObject`` and + ``proxify_host_file.ProxifyHostFile``. 
Examples @@ -115,10 +122,11 @@ class LocalCUDACluster(LocalCluster): Raises ------ TypeError - If enable_infiniband or enable_nvlink is True and protocol is not 'ucx' + If ``enable_infiniband`` or ``enable_nvlink`` is ``True`` and protocol is not + ``"ucx"``. ValueError - If ucx_net_devices is an empty string, or if it is "auto" and UCX-Py is - not installed, or if it is "auto" and enable_infiniband=False, or UCX-Py + If ``ucx_net_devices`` is an empty string, or if it is ``"auto"`` and UCX-Py is + not installed, or if it is ``"auto"`` and ``enable_infiniband=False``, or UCX-Py wasn't compiled with hwloc support, or both RMM managed memory and NVLink are enabled. diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..cef8580f6 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +dask_cuda +numpydoc==1.1.0 +sphinx_rtd_theme==0.5.1 \ No newline at end of file diff --git a/docs/source/api.rst b/docs/source/api.rst index ea8bfc903..1fd4e1b4a 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -1,24 +1,13 @@ API === -.. currentmodule:: dask_cuda - -**dask_cuda** - Setup ------ - -.. autosummary:: - initialize - +.. currentmodule:: dask_cuda.initialize .. autofunction:: initialize - Cluster ------- - -.. autosummary:: - LocalCUDACluster - +.. currentmodule:: dask_cuda .. autoclass:: LocalCUDACluster :members: diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index b149cf947..2487e7e7a 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -1,135 +1,193 @@ UCX Integration =============== -Communication can be a major bottleneck in distributed systems, and that's no different in Dask and Dask-CUDA. To address that, Dask-CUDA provides integration with `UCX-Py `_, a Python interface for the `UCX `_ communication framework. UCX is a low-level library that provides high-performance networking and supports several transports, including NVLink and InfiniBand for systems that have such specialized hardware, as well as TCP for those that do not. +Communication can be a major bottleneck in distributed systems. +Dask-CUDA addresses this by supporting integration with `UCX `_, an optimized communication framework that provides high-performance networking and supports a variety of transport methods, including `NVLink `_ and `Infiniband `_ for systems with specialized hardware, and TCP for systems without it. +This integration is enabled through `UCX-Py `_, an interface that provides Python bindings for UCX. +Requirements +------------ -Relevant Settings ------------------ +Hardware +^^^^^^^^ -In this list below we shall see all the Dask configurations relevant for using with Dask-CUDA currently. Note that these options can also be used with mainline Dask/Distributed if you can't use Dask-CUDA for some reason. Note that many improvments for running GPU code is readily available in Dask-CUDA -- but not in mainline Dask/Distributed -- thus, we recommend using Dask-CUDA when possible. -In the list below we describe the relevant Dask configurations for using UCX with Dask-CUDA. +To use UCX with NVLink or InfiniBand, relevant GPUs must be connected with NVLink bridges or NVIDIA Mellanox InfiniBand Adapters, respectively. +NVIDIA provides comparison charts for both `NVLink bridges `_ and `InfiniBand adapters `_. -.. note:: - These options can also be used with mainline Dask/Distributed outside of Dask-CUDA, however, we recommend using Dask-CUDA when possible. 
See the :doc:`Specializations for GPU Usage ` page for details on the benefits of using Dask-CUDA. +Software +^^^^^^^^ +UCX integration requires an environment with both UCX and UCX-Py installed; see `UCX-Py Installation `_ for detailed instructions on this process. -- ``DASK_UCX__CUDA_COPY=True`` (default: ``False``): *Always required for UCX*, adds ``cuda_copy`` to ``UCX_TLS`` -- required for all CUDA transfers in UCX, both intra- and inter-node; -- ``DASK_UCX__TCP=True`` (default: ``False``): *Always required for UCX*, adds ``tcp`` to ``UCX_TLS`` -- required for all TCP transfers (e.g., where NVLink or IB is not available/disabled) in UCX, both intra- and inter-node; -- ``DASK_UCX__NVLINK=True`` (default: ``False``): Adds ``cuda_ipc`` to ``UCX_TLS`` -- required for all NVLink transfers in UCX, only affects intra-node. -- ``DASK_UCX__INFINIBAND=True`` (defalt: ``False``): Adds ``rc`` to ``UCX_TLS`` -- required for all InfiniBand transfers in UCX, only affects inter-node. -- ``DASK_UCX__RDMACM=True`` (default: ``False``): Replaces ``sockcm`` by ``rdmacm`` in ``UCX_TLS`` and ``UCX_SOCKADDR_TLS_PRIORITY``. ``rdmacm`` is the recommended method by UCX to use with IB and currently won't work if ``DASK_UCX__INFINIBAND=False``. -- ``DASK_UCX__NET_DEVICES=mlx5_0:1`` (default: ``None``, causes UCX to decide what device to use, possibly being suboptimal, implies ``UCX_NET_DEVICES=all``): this is very important when ``DASK_UCX__INFINIBAND=True`` to ensure the scheduler is connected over the InfiniBand interface. When ``DASK_UCX__INFINIBAND=False`` it's recommended to use the ethernet device instead, e.g., ``DASK_UCX__NET_DEVICES=enp1s0f0`` on a DGX-1. -- ``DASK_RMM__POOL_SIZE=1GB``: allocates an RMM pool for the process. In some circumstances, the Dask scheduler will deserialize CUDA data and cause a crash if there's no pool. +When using UCX, each NVLink and InfiniBand memory buffer must create a mapping between each unique pair of processes they are transferred across; this can be quite costly, potentially in the range of hundreds of milliseconds per mapping. +For this reason, it is strongly recommended to use `RAPIDS Memory Manager (RMM) `_ to allocate a memory pool that is only prone to a single mapping operation, which all subsequent transfers may rely upon. +A memory pool also prevents the Dask scheduler from deserializing CUDA data, which will cause a crash. +Configuration +^^^^^^^^^^^^^ -Important notes ---------------- +In addition to installations of UCX and UCX-Py on your system, several options must be specified within your Dask configuration to enable the integration. +Typically, these will affect ``UCX_TLS`` and ``UCX_SOCKADDR_TLS_PRIORITY``, environment variables used by UCX to decide what transport methods to use and which to prioritize, respectively. +However, some will affect related libraries, such as RMM: -* CUDA Memory Pool: With UCX, all NVLink and InfiniBand memory buffers have to be mapped from one process to another upon the first request for transfer of that buffer between a single pair of processes. This can be quite costly, consuming up to 100ms only for mapping, plus the transfer time itself. For this reason it is strongly recommended to use a `RAPIDS Memory Manager (RMM) `_ memory pool in such cases, incurring in a single mapping of the pool and all subsequent transfers will not be required to repeat that process. 
It is recommened to also keep the memory pool size to at least the minimum amount of memory used by the application, if possible one can map all GPU memory to a single pool and utilize that pool for the entire lifetime of the application. +- ``ucx.cuda_copy: true`` -- **required.** -* Automatic detection of InfiniBand interfaces: it's especially important to note the usage of ``--net-devices="auto"`` in ``dask-cuda-worker``, which will automatically determine the InfiniBand interface that's closest to each GPU. For safety, this option can only be used if ``--enable-infiniband`` is specified. Be warned that this mode assumes all InfiniBand interfaces on the system are connected and properly configured, undefined behavior may occur otherwise. + Adds ``cuda_copy`` to ``UCX_TLS``, enabling CUDA transfers over UCX. +- ``ucx.tcp: true`` -- **required.** -Launching Scheduler, Workers and Clients Separately ---------------------------------------------------- + Adds ``tcp`` to ``UCX_TLS``, enabling TCP transfers over UCX; this is required for very small transfers which are inefficient for NVLink and InfiniBand. -The first way for starting a Dask cluster with UCX support is to start each process separately. The processes are ``dask-scheduler``, ``dask-cuda-worker`` and the client process utilizing ``distributed.Client`` that will connect to the cluster. Details follow for each of the processes. +- ``ucx.nvlink: true`` -- **required for NVLink.** -dask-scheduler -^^^^^^^^^^^^^^ + Adds ``cuda_ipc`` to ``UCX_TLS``, enabling NVLink transfers over UCX; affects intra-node communication only. -The ``dask-scheduler`` has no parameters for UCX configuration -- different from what we will see for ``dask-cuda-worker`` on the next section -- for that reason we rely on Dask environment variables. Here's how to start the scheduler with all transports that are currently supported by Dask-CUDA: +- ``ucx.infiniband: true`` -- **required for InfiniBand.** -.. code-block:: bash + Adds ``rc`` to ``UCX_TLS``, enabling InfiniBand transfers over UCX. - DASK_RMM__POOL_SIZE=1GB DASK_UCX__CUDA_COPY=True DASK_UCX__TCP=True DASK_UCX__NVLINK=True DASK_UCX__INFINIBAND=True DASK_UCX__RDMACM=True DASK_UCX__NET_DEVICES=mlx5_0:1 dask-scheduler --protocol ucx --interface ib0 +- ``ucx.rdmacm: true`` -- **recommended for InfiniBand.** -Note above how we use ``DASK_UCX__NET_DEVICES=mlx5_0:1`` (the Mellanox name for ``ib0``) and the same interface with ``--interface ib0``. If the system doesn't have an InfiniBand interface available, you would normally use the main network interface, such as ``eth0``, as seen below: + Replaces ``sockcm`` with ``rdmacm`` in ``UCX_TLS`` and ``UCX_SOCKADDR_TLS_PRIORITY``, enabling remote direct memory access (RDMA) for InfiniBand transfers. + This is recommended by UCX for use with InfiniBand, and will not work if InfiniBand is disabled. -.. code-block:: bash +- ``ucx.net-devices: `` -- **recommended.** + + Explicitly sets ``UCX_NET_DEVICES`` instead of defaulting to ``"all"``, which can result in suboptimal performance. + If using InfiniBand, set to ``"auto"`` to automatically detect the InfiniBand interface closest to each GPU. + If InfiniBand is disabled, set to a UCX-compatible ethernet interface, e.g. ``"enp1s0f0"`` on a DGX-1. + All available UCX-compatible interfaces can be listed by running ``ucx_info -d``. - DASK_RMM__POOL_SIZE=1GB DASK_UCX__CUDA_COPY=True DASK_UCX__TCP=True DASK_UCX__NVLINK=True dask-scheduler --protocol ucx --interface eth0 + .. 
warning:: + Setting ``ucx.net-devices: "auto"`` assumes that all InfiniBand interfaces on the system are connected and properly configured; undefined behavior may occur otherwise. + -Setting ``DASK_UCX__NET_DEVICES`` when using an interface that isn't an InfiniBand can generally be skipped. +- ``rmm.pool-size: `` -- **recommended.** + + Allocates an RMM pool of the specified size for the process; size can be provided with an integer number of bytes or in human readable format, e.g. ``"4GB"``. + It is recommended to set the pool size to at least the minimum amount of memory used by the process; if possible, one can map all GPU memory to a single pool, to be utilized for the lifetime of the process. + +.. note:: + These options can be used with mainline Dask/Distributed. + However, some features are exclusive to Dask-CUDA, such as the automatic detection of InfiniBand interfaces. + See :doc:`Specializations for GPU Usage ` for more details on the benefits of using Dask-CUDA. +Usage +----- + +Dask-CUDA workers using UCX communication can be started manually with the ``dask-cuda-worker`` CLI tool or automatically with ``LocalCUDACluster``. +In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same UCX configuration. + dask-cuda-worker ^^^^^^^^^^^^^^^^ -All ``DASK_*`` configurations described above have analogous parameters in ``dask-cuda-worker`` which are preferred over the regular configurations used for ``dask-scheduler`` due to some specializations, such as ``--net-devices="auto"`` which will correctly assign the topologically closest IB interface to the GPU of each worker, something that's not possible with ``DASK_UCX__NET_DEVICES``. +A Dask cluster with UCX support can be started using the ``dask-cuda-worker`` CLI tool with a Dask scheduler which has been configured for UCX. +This must be used for cases where a multi-node cluster is needed, as ``LocalCUDACluster`` will only start single-node clusters. + +Scheduler +""""""""" -- ``--disable-tcp-over-ucx`` (default) is analogous to ``DASK_UCX__TCP=False``, ``--enable-tcp-over-ucx`` is equivalent to ``DASK_UCX__TCP=True``; -- ``--disable-nvlink`` (default) is analogous to ``DASK_UCX__NVLINK=False``, ``--enable-nvlink`` is equivalent to ``DASK_UCX__NVLINK=True``; -- ``--disable-infiniband`` (default) is analogous to ``DASK_UCX__INFINIBAND=False``, ``--enable-infiniband`` is equivalent to ``DASK_UCX__INFINIBAND=True``; -- ``--net-devices`` (default ``None``, implies ``UCX_NET_DEVICES=all``) equivalent to ``DASK_UCX__NET_DEVICES``; -- ``--rmm-pool-size`` equivalent to ``DASK_RMM__POOL_SIZE``. +UCX configuration options will need to be specified for ``dask-scheduler`` as environment variables; see `Dask Configuration `_ for more details on the mapping between environment variables and options. -Here's how to start workers with all transports that are currently relevant for us: +To start a Dask scheduler using UCX with all supported transports and a 1 gigabyte RMM pool: .. 
code-block:: bash - dask-cuda-worker ucx://SCHEDULER_IB0_IP:8786 --enable-tcp-over-ucx --enable-nvlink --enable-infiniband -- enable-rdmacm --net-devices="auto" --rmm-pool-size="30GB" + DASK_UCX__CUDA_COPY=True \ + DASK_UCX__TCP=True \ + DASK_UCX__NVLINK=True \ + DASK_UCX__INFINIBAND=True \ + DASK_UCX__RDMACM=True \ + DASK_UCX__NET_DEVICES=mlx5_0:1 \ + DASK_RMM__POOL_SIZE=1GB \ + dask-scheduler --protocol ucx --interface ib0 +Note the specification of ``mlx5_0:1`` as our UCX net device; because the scheduler does not rely upon Dask-CUDA, it cannot automatically detect InfiniBand interfaces, so we must specify one explicitly. +We communicate to the scheduler that we will be using UCX with the ``--protocol`` option, and that we will be using InfiniBand with the ``--interface`` option. -client -^^^^^^ +To start the same Dask scheduler as above but only using NVLink: -The same configurations used for the scheduler should be used by the client. One possible exception is ``DASK_RMM__POOL_SIZE``, at this time it's unclear whether this is necessary or not, but using that should not cause any issues nevertheless. +.. code-block:: bash -One can use ``os.environ`` inside the client script, it's important to set them at the very top before importing anything other than ``os``. See example below: + DASK_UCX__CUDA_COPY=True \ + DASK_UCX__TCP=True \ + DASK_UCX__NVLINK=True \ + DASK_RMM__POOL_SIZE=1GB \ + dask-scheduler --protocol ucx --interface eth0 -.. code-block:: python +Note that we no longer specify a net device, as this generally can be skipped when using a non-InfiniBand interface. - import os +Workers +""""""" - os.environ["DASK_RMM__POOL_SIZE"] = "1GB" - os.environ["DASK_UCX__CUDA_COPY"] = "True" # os.environ needs using strings, not Python True/False - os.environ["DASK_UCX__TCP"] = "True" - os.environ["DASK_UCX__NVLINK"] = "True" - os.environ["DASK_UCX__INFINIBAND"] = "True" - os.environ["DASK_UCX__NET_DEVICES"] = "mlx5_0:1" +All the relevant Dask configuration options for UCX have analogous parameters in ``dask-cuda-worker``; see :doc:`Worker ` for a complete list of these options. - from distributed import Client +To start workers with all supported transports and a 1 gigabyte RMM pool: - client = Client("ucx://SCHEDULER_IB0_IP:8786") # SCHEDULER_IB0_IP must be the IP of ib0 on the node where scheduler runs +.. code-block:: bash - # Client code goes here + dask-cuda-worker ucx://:8786 \ + --enable-tcp-over-ucx \ + --enable-nvlink \ + --enable-infiniband \ + --enable-rdmacm \ + --net-devices="auto" \ + --rmm-pool-size="1GB" +LocalCUDACluster +^^^^^^^^^^^^^^^^ -Starting a local cluster (single-node only) -------------------------------------------- +All options available to ``dask-cuda-worker`` are also available as arguments for ``LocalCUDACluster``; see the :doc:`API reference ` for a complete list of arguments. -All options discussed previously are also available in ``LocalCUDACluster``. It is shown below how to start a local cluster with all UCX capabilities enabled: +To start a cluster with all supported transports and a 1 gigabyte RMM pool: .. code-block:: python - import os + from dask_cuda import LocalCUDACluster + + cluster = LocalCUDACluster( + protocol="ucx", + interface="ib0", # passed to the scheduler + enable_tcp_over_ucx=True, + enable_nvlink=True, + enable_infiniband=True, + enable_rdmacm=True, + ucx_net_devices="auto", + rmm_pool_size="1GB" + ) + +Client +^^^^^^ + +The UCX configurations used by the scheduler and client should be the same. 
+This can be ensured by using ``dask_cuda.initialize``, a utility which takes the same UCX configuring arguments as ``LocalCUDACluster`` and adds them to the current Dask configuration used when creating the scheduler and client; see the :doc:`API reference ` for a complete list of arguments. - # The options here are to be used by the client only, - # inherent options for the Dask scheduler and workers - # have to be passed to LocalCUDACluster - os.environ["DASK_RMM__POOL_SIZE"] = "1GB" - os.environ["DASK_UCX__CUDA_COPY"] = "True" # os.environ needs using strings, not Python True/False - os.environ["DASK_UCX__TCP"] = "True" - os.environ["DASK_UCX__NVLINK"] = "True" - os.environ["DASK_UCX__INFINIBAND"] = "True" - os.environ["DASK_UCX__NET_DEVICES"] = "mlx5_0:1" +To start a cluster and client with all supported transports and a 1 gigabyte RMM pool: + +.. code-block:: python from dask.distributed import Client from dask_cuda import LocalCUDACluster from dask_cuda.initialize import initialize + initialize( + enable_tcp_over_ucx=True, + enable_nvlink=True, + enable_infiniband=True, + enable_rdmacm=True, + net_devices="mlx5_0:1", + ) cluster = LocalCUDACluster( - protocol = "ucx" - interface = "ib0" # Interface -- used for the scheduler - enable_tcp_over_ucx = True - enable_nvlink = True - enable_infiniband = True - ucx_net_devices="auto" - rmm_pool_size="24GB" + protocol="ucx", + interface="ib0", # passed to the scheduler + enable_tcp_over_ucx=True, + enable_nvlink=True, + enable_infiniband=True, + ucx_net_devices="auto", + rmm_pool_size="1GB", ) client = Client(cluster) - # Client code goes here +Note the specification of ``"mlx5_0:1"`` as our net device in ``initialize()``; because the scheduler and client do not rely upon Dask-CUDA, they cannot automatically detect InfiniBand interfaces, so we must specify one explicitly. \ No newline at end of file
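
The same ``initialize()`` call can also be used when connecting a client to a cluster started with ``dask-cuda-worker``.
Below is a minimal sketch of such a client script, assuming the scheduler was started on its InfiniBand interface as in the examples above; ``SCHEDULER_IB0_IP`` is a placeholder for that interface's IP address:

.. code-block:: python

    from dask.distributed import Client
    from dask_cuda.initialize import initialize

    # Apply the same UCX configuration used by the scheduler and workers
    # to the local Dask configuration before creating the client.
    initialize(
        enable_tcp_over_ucx=True,
        enable_nvlink=True,
        enable_infiniband=True,
        enable_rdmacm=True,
        net_devices="mlx5_0:1",
    )

    # Connect to the scheduler over UCX; SCHEDULER_IB0_IP is a placeholder
    # for the IP address of ib0 on the node running dask-scheduler.
    client = Client("ucx://SCHEDULER_IB0_IP:8786")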