From babead6c86e82d90cf24123b1ac37db6f163e1f0 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 26 Sep 2023 16:49:52 +0100
Subject: [PATCH 1/3] Move nvtx annotation utilities to a separate file

This will enable us to use them in the spilling code without circular
import issues.
---
 python/cudf/cudf/utils/nvtx_annotation.py | 30 ++++++++++++++++++++++
 python/cudf/cudf/utils/utils.py           | 31 +++--------------------
 2 files changed, 34 insertions(+), 27 deletions(-)
 create mode 100644 python/cudf/cudf/utils/nvtx_annotation.py

diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py
new file mode 100644
index 00000000000..a4404e51232
--- /dev/null
+++ b/python/cudf/cudf/utils/nvtx_annotation.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+import hashlib
+from functools import partial
+
+from nvtx import annotate
+
+_NVTX_COLORS = ["green", "blue", "purple", "rapids"]
+
+
+def _get_color_for_nvtx(name):
+    """Pick an entry of ``_NVTX_COLORS`` from the SHA-256 digest of name."""
+    # SHA-256 is deterministic, so a given name always gets the same color.
+    digest = hashlib.sha256(name.encode()).hexdigest()
+    slot = int(digest, 16) % len(_NVTX_COLORS)
+    return _NVTX_COLORS[slot]
+
+
+def _cudf_nvtx_annotate(func, domain="cudf_python"):
+    """Decorate ``func`` with an nvtx annotation named by its qualname."""
+    label = func.__qualname__
+    decorator = annotate(
+        message=label, color=_get_color_for_nvtx(label), domain=domain
+    )
+    return decorator(func)
+
+
+_dask_cudf_nvtx_annotate = partial(
+    _cudf_nvtx_annotate, domain="dask_cudf_python"
+)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index e2cb3f145a1..d219b075178 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -2,15 +2,13 @@
 
 import decimal
 import functools
-import hashlib
 import os
 import traceback
 import warnings
-from functools import partial
 from typing import FrozenSet, Set, Union
 
 import numpy as np
-from nvtx import annotate
+from nvtx import annotate  # noqa: F401
 
 import rmm
 
@@ -18,6 +16,9 @@
 import cudf.api.types
 from cudf.core import column
 from cudf.core.buffer import as_buffer
+from cudf.utils.nvtx_annotation import _NVTX_COLORS  # noqa: F401
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate  # noqa: F401
+from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate  # noqa: F401
 
 # The size of the mask in bytes
 mask_dtype = cudf.api.types.dtype(np.int32)
@@ -119,8 +120,6 @@ def _array_ufunc(obj, ufunc, method, inputs, kwargs):
     "__ge__",
 }
 
-_NVTX_COLORS = ["green", "blue", "purple", "rapids"]
-
 # The test root is set by pytest to support situations where tests are run from
 # a source tree on a built version of cudf.
 NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS")
@@ -353,28 +352,6 @@ def is_na_like(obj):
     return obj is None or obj is cudf.NA or obj is cudf.NaT
 
 
-def _get_color_for_nvtx(name):
-    m = hashlib.sha256()
-    m.update(name.encode())
-    hash_value = int(m.hexdigest(), 16)
-    idx = hash_value % len(_NVTX_COLORS)
-    return _NVTX_COLORS[idx]
-
-
-def _cudf_nvtx_annotate(func, domain="cudf_python"):
-    """Decorator for applying nvtx annotations to methods in cudf."""
-    return annotate(
-        message=func.__qualname__,
-        color=_get_color_for_nvtx(func.__qualname__),
-        domain=domain,
-    )(func)
-
-
-_dask_cudf_nvtx_annotate = partial(
-    _cudf_nvtx_annotate, domain="dask_cudf_python"
-)
-
-
 def _warn_no_dask_cudf(fn):
     @functools.wraps(fn)
     def wrapper(self):

From 4f83404e943957739290d359658ef3eae456d02e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 26 Sep 2023 17:02:04 +0100
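Subject: [PATCH 2/3] Adapt nvtx imports

Import the nvtx decorators directly from cudf.utils.nvtx_annotation and
remove the temporary re-exports from cudf.utils.utils.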
---
 python/cudf/cudf/core/dataframe.py           | 7 ++-----
 python/cudf/cudf/core/frame.py               | 7 ++-----
 python/cudf/cudf/core/groupby/groupby.py     | 3 ++-
 python/cudf/cudf/core/index.py               | 8 ++------
 python/cudf/cudf/core/indexed_frame.py       | 3 ++-
 python/cudf/cudf/core/multiindex.py          | 3 ++-
 python/cudf/cudf/core/series.py              | 2 +-
 python/cudf/cudf/core/single_column_frame.py | 3 ++-
 python/cudf/cudf/core/udf/groupby_utils.py   | 2 +-
 python/cudf/cudf/core/udf/utils.py           | 3 ++-
 python/cudf/cudf/io/csv.py                   | 2 +-
 python/cudf/cudf/io/parquet.py               | 2 +-
 python/cudf/cudf/io/text.py                  | 4 ++--
 python/cudf/cudf/utils/utils.py              | 4 ----
 python/dask_cudf/dask_cudf/backends.py       | 2 +-
 python/dask_cudf/dask_cudf/core.py           | 2 +-
 python/dask_cudf/dask_cudf/groupby.py        | 2 +-
 python/dask_cudf/dask_cudf/sorting.py        | 2 +-
 18 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8a3dbe77787..e8acae9686a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -95,11 +95,8 @@
     min_scalar_type,
     numeric_normalize_types,
 )
-from cudf.utils.utils import (
-    GetAttrGetItemMixin,
-    _cudf_nvtx_annotate,
-    _external_only_api,
-)
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
 
 _cupy_nan_methods_map = {
     "min": "nanmin",
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 1e6d177f8ca..7cb78bc8d1f 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -47,11 +47,8 @@
 from cudf.utils import ioutils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import find_common_type
-from cudf.utils.utils import (
-    _array_ufunc,
-    _cudf_nvtx_annotate,
-    _warn_no_dask_cudf,
-)
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf
 
 
 # TODO: It looks like Frame is missing a declaration of `copy`, need to add
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index e1740140b44..3b8f0f3824a 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -29,7 +29,8 @@
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
 from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
-from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import GetAttrGetItemMixin
 
 
 # The three functions below return the quantiles [25%, 50%, 75%]
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index de8a5948033..5c323bda9ea 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -64,12 +64,8 @@
     is_mixed_with_object_dtype,
     numeric_normalize_types,
 )
-from cudf.utils.utils import (
-    _cudf_nvtx_annotate,
-    _is_same_name,
-    _warn_no_dask_cudf,
-    search_range,
-)
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import _is_same_name, _warn_no_dask_cudf, search_range
 
 
 def _lexsorted_equal_range(
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 62e091b29b5..b3d70bc351e 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -69,7 +69,8 @@
 )
 from cudf.utils import docutils
 from cudf.utils._numba import _CUDFNumbaConfig
-from cudf.utils.utils import _cudf_nvtx_annotate, _warn_no_dask_cudf
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import _warn_no_dask_cudf
 
 doc_reset_index_template = """
         Reset the index of the {klass}, or a level of it.
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 21380bb841c..87a11478870 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -26,7 +26,8 @@
 from cudf.core._compat import PANDAS_GE_150
 from cudf.core.frame import Frame
 from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
-from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate, _is_same_name
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import NotIterable, _is_same_name
 
 
 def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index a195738af54..00ba722136e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -90,7 +90,7 @@
     is_mixed_with_object_dtype,
     to_cudf_compatible_scalar,
 )
-from cudf.utils.utils import _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
 
 def _format_percentile_names(percentiles):
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 6a56ab8f3a5..e30e1c747f5 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -19,7 +19,8 @@
 )
 from cudf.core.column import ColumnBase, as_column
 from cudf.core.frame import Frame
-from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import NotIterable
 
 
 class SingleColumnFrame(Frame, NotIterable):
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index b18720f5db5..5dbcf455e33 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -28,7 +28,7 @@
     _supported_dtypes_from_frame,
 )
 from cudf.utils._numba import _CUDFNumbaConfig
-from cudf.utils.utils import _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
 
 def _get_frame_groupby_type(dtype, index_dtype):
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 35a3f6c1ffd..7b7ac2b3070 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -39,7 +39,8 @@
     STRING_TYPES,
     TIMEDELTA_TYPES,
 )
-from cudf.utils.utils import _cudf_nvtx_annotate, initfunc
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.utils import initfunc
 
 # Maximum size of a string column is 2 GiB
 _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get(
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index bacc0641639..764885dd7b6 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -11,7 +11,7 @@
 from cudf.api.types import is_scalar
 from cudf.utils import ioutils
 from cudf.utils.dtypes import _maybe_convert_to_default_type
-from cudf.utils.utils import _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
 
 @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index d8510cf8e95..d84aff66d7b 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -22,7 +22,7 @@
 from cudf.api.types import is_list_like
 from cudf.core.column import build_categorical_column, column_empty, full
 from cudf.utils import ioutils
-from cudf.utils.utils import _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
 BYTE_SIZES = {
     "kb": 1000,
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index eb2c7fa7ef6..0e19972f6e0 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
 from io import BytesIO, StringIO
 
 import cudf
 from cudf._lib import text as libtext
 from cudf.utils import ioutils
-from cudf.utils.utils import _cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
 
 @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index d219b075178..0ff23bd37c6 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -8,7 +8,6 @@
 from typing import FrozenSet, Set, Union
 
 import numpy as np
-from nvtx import annotate  # noqa: F401
 
 import rmm
 
@@ -16,9 +15,6 @@
 import cudf.api.types
 from cudf.core import column
 from cudf.core.buffer import as_buffer
-from cudf.utils.nvtx_annotation import _NVTX_COLORS  # noqa: F401
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate  # noqa: F401
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate  # noqa: F401
 
 # The size of the mask in bytes
 mask_dtype = cudf.api.types.dtype(np.int32)
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index e3f4f04eb85..b1a8ca01924 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -42,7 +42,7 @@
 
 import cudf
 from cudf.api.types import is_string_dtype
-from cudf.utils.utils import _dask_cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
 
 from .core import DataFrame, Index, Series
 
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 5b37e6e825c..17650c9b70d 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -22,7 +22,7 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf.utils.utils import _dask_cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
 
 from dask_cudf import sorting
 from dask_cudf.accessors import ListMethods, StructMethods
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index f4bbcaf4dd1..b1fdf443a17 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -15,7 +15,7 @@
 from dask.utils import funcname
 
 import cudf
-from cudf.utils.utils import _dask_cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
 
 # aggregations that are dask-cudf optimized
 OPTIMIZED_AGGS = (
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index e841f2d8830..e2c8a548100 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -16,7 +16,7 @@
 
 import cudf as gd
 from cudf.api.types import is_categorical_dtype
-from cudf.utils.utils import _dask_cudf_nvtx_annotate
+from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
 
 
 @_dask_cudf_nvtx_annotate

From 76ac0f2312e73d18b4f970bf365ed412f8aee01d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 26 Sep 2023 17:04:07 +0100
Subject: [PATCH 3/3] Add nvtx annotations for spill-induced memcpys

This allows us to see when RMM allocations are taking an excessive
amount of time due to spilling of cudf-managed buffers.
---
 python/cudf/cudf/core/buffer/spill_manager.py |  7 ++++++
 .../cudf/cudf/core/buffer/spillable_buffer.py | 24 +++++++++++++++----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index f056a0fd592..91f3b2cd544 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -11,14 +11,20 @@
 import weakref
 from collections import defaultdict
 from dataclasses import dataclass
+from functools import partial
 from typing import Dict, List, Optional, Tuple
 
 import rmm.mr
 
 from cudf.core.buffer.spillable_buffer import SpillableBuffer
 from cudf.options import get_option
+from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.string import format_bytes
 
+_spill_cudf_nvtx_annotate = partial(
+    _cudf_nvtx_annotate, domain="cudf_python-spill"
+)
+
 
 def get_traceback() -> str:
     """Pretty print current traceback to a string"""
@@ -329,6 +335,7 @@ def buffers(
             ret = tuple(sorted(ret, key=lambda b: b.last_accessed))
         return ret
 
+    @_spill_cudf_nvtx_annotate
     def spill_device_memory(self, nbytes: int) -> int:
         """Try to spill device memory
 
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index 84fb2044c62..1856bec1876 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -20,6 +20,7 @@
     get_ptr_and_size,
     host_memory_allocation,
 )
+from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate
 from cudf.utils.string import format_bytes
 
 if TYPE_CHECKING:
@@ -291,8 +292,15 @@ def spill(self, target: str = "cpu") -> None:
                 )
 
             if (ptr_type, target) == ("gpu", "cpu"):
-                host_mem = host_memory_allocation(self.size)
-                rmm._lib.device_buffer.copy_ptr_to_host(self._ptr, host_mem)
+                with annotate(
+                    message="SpillDtoH",
+                    color=_get_color_for_nvtx("SpillDtoH"),
+                    domain="cudf_python-spill",
+                ):
+                    host_mem = host_memory_allocation(self.size)
+                    rmm._lib.device_buffer.copy_ptr_to_host(
+                        self._ptr, host_mem
+                    )
                 self._ptr_desc["memoryview"] = host_mem
                 self._ptr = 0
                 self._owner = None
@@ -302,9 +310,15 @@ def spill(self, target: str = "cpu") -> None:
                 # trigger a new call to this buffer's `spill()`.
                 # Therefore, it is important that spilling-on-demand doesn't
                 # try to unspill an already locked buffer!
-                dev_mem = rmm.DeviceBuffer.to_device(
-                    self._ptr_desc.pop("memoryview")
-                )
+                with annotate(
+                    message="SpillHtoD",
+                    color=_get_color_for_nvtx("SpillHtoD"),
+                    domain="cudf_python-spill",
+                ):
+
+                    dev_mem = rmm.DeviceBuffer.to_device(
+                        self._ptr_desc.pop("memoryview")
+                    )
                 self._ptr = dev_mem.ptr
                 self._owner = dev_mem
                 assert self._size == dev_mem.size