From 5d8ea19389e4e79e1c9b70fbe99728e02a0d9a53 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 1 Mar 2022 11:07:52 -0600 Subject: [PATCH 1/9] Fix warnings in test_csv.py. (#10362) This PR silences warnings in `test_csv.py`. (I am working through one test file at a time so we can enable `-Werr` in the future.) The only warning in this file is related to integer overflow in pandas. Currently, the test data is as follows: https://github.com/rapidsai/cudf/blob/21325e8348f33b28e434d08d687a28f251c38f67/python/cudf/cudf/tests/test_csv.py#L1313-L1319 First, I note that this "hex" dtype is not part of the pandas API. It is a cuDF addition (#1925, #2149). Note that there are dtypes for `int32` / `hex32`, and the test data contains both a negative value `-0x1000` and a value `9512c20b`. The negative value `-0x1000` has a sensible interpretation if the results are meant to be signed, but then the value `9512c20b` is out of range (the maximum signed 32-bit value would be `0x7FFFFFFF` and the minimum signed 32-bit value would be `0x80000000`, using the big-endian convention of the parser). Recognizing this, pandas throws a `FutureWarning` when parsing the data `9512c20b` as `int32`, and unsafely wraps it to a negative value. This behavior will eventually be replaced by an `OverflowError`. In the future, we may need to decide if cuDF should raise an `OverflowError` when exceeding `0x7FFFFFFF` for consistency with pandas, or decide to use unsigned integers when parsing "hex" dtypes and compare to pandas' unsigned types in this test. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/10362 --- python/cudf/cudf/tests/test_csv.py | 31 +++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index f3d69e1745e..6176184b670 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1315,7 +1315,7 @@ def test_csv_reader_aligned_byte_range(tmpdir): [(None, None), ("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], ) def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): - lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF", "9512c20b"] + lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF"] values = [int(hex_int, 16) for hex_int in lines] buffer = "\n".join(lines) @@ -1334,6 +1334,35 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): assert_eq(pdf, gdf) +@pytest.mark.parametrize( + "np_dtype, gdf_dtype", + [("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], +) +def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype): + # This tests values which cause an overflow warning that will become an + # error in pandas. NumPy wraps the overflow silently up to the bounds of a + # signed int64. 
+ lines = [ + "0x0", + "-0x1000", + "0xfedcba", + "0xABCDEF", + "0xaBcDeF", + "0x9512c20b", + "0x7fffffff", + "0x7fffffffffffffff", + "-0x8000000000000000", + ] + values = [int(hex_int, 16) for hex_int in lines] + buffer = "\n".join(lines) + + gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) + + expected = np.array(values, dtype=np_dtype) + actual = gdf["hex_int"].to_numpy() + np.testing.assert_array_equal(expected, actual) + + @pytest.mark.parametrize("quoting", [0, 1, 2, 3]) def test_csv_reader_pd_consistent_quotes(quoting): names = ["text"] From 78b316c4ff8bcc7ecbf058d2a0dfe98dbe029b6f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 1 Mar 2022 14:06:18 -0600 Subject: [PATCH 2/9] byte_range support for multibyte_split/read_text (#10150) Adding byte_range support to multibyte_split/read_text. Closes #9655 providing a byte range in terms of `(offset, size)` allows multibyte_split to read a whole file, but only return the offsets within those ranges as well as one additional offset (unless it's the end of the file). If thinking in terms of "records", where each delimiter dictates the end of a record, we effectively return all records which _begin_ within the byte range provided, and ignore all other records, including any record which may end (but not begin) within the range, and including any record which may begin in the range but _end_ outside of the range. examples: ``` input: "abc..def..ghi..jkl.." delimiter: .. 
``` ``` range offset: 0 range size: 2 output: ["abc.."] ``` ``` range offset: 2 range size: 9 output: ["def..", "ghi.."] ``` ``` range offset: 11 range size: 2 output: [] ``` ``` range offset: 13 range size: 7 output: ["jkl..", ""] ``` Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10150 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/text/byte_range_info.hpp | 74 ++++++++++ .../cudf/io/text/data_chunk_source.hpp | 7 +- .../io/text/data_chunk_source_factories.hpp | 10 +- cpp/include/cudf/io/text/detail/trie.hpp | 16 +- cpp/include/cudf/io/text/multibyte_split.hpp | 48 +++++- cpp/src/io/text/byte_range_info.cpp | 47 ++++++ cpp/src/io/text/multibyte_split.cu | 139 ++++++++++++------ cpp/tests/io/text/multibyte_split_test.cpp | 27 ++++ python/cudf/cudf/_lib/cpp/io/text.pxd | 13 +- python/cudf/cudf/_lib/text.pyx | 30 +++- python/cudf/cudf/io/text.py | 8 +- python/cudf/cudf/tests/test_text.py | 53 ++++++- 14 files changed, 401 insertions(+), 73 deletions(-) create mode 100644 cpp/include/cudf/io/text/byte_range_info.hpp create mode 100644 cpp/src/io/text/byte_range_info.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4e20c979f6c..eae915c47fe 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -137,6 +137,7 @@ test: - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - test -f $PREFIX/include/cudf/io/orc.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp + - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp - test -f 
$PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2ffd62f1b53..825ea37c6ac 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -311,6 +311,7 @@ add_library( src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu + src/io/text/byte_range_info.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp new file mode 100644 index 00000000000..cb2d00f0d1f --- /dev/null +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { +namespace text { + +/** + * @brief stores offset and size used to indicate a byte range + */ +class byte_range_info { + private: + int64_t _offset; + int64_t _size; + + public: + constexpr byte_range_info() noexcept : _offset(0), _size(0) {} + constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) + { + CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); + CUDF_EXPECTS(size >= 0, "size must be non-negative"); + } + + constexpr byte_range_info(byte_range_info const& other) noexcept = default; + constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + + [[nodiscard]] constexpr int64_t offset() { return _offset; } + [[nodiscard]] constexpr int64_t size() { return _size; } +}; + +/** + * @brief Create a collection of consecutive ranges between [0, total_bytes). + * + * Each range wil be the same size except if `total_bytes` is not evenly divisible by + * `range_count`, in which case the last range size will be the remainder. + * + * @param total_bytes total number of bytes in all ranges + * @param range_count total number of ranges in which to divide bytes + * @return Vector of range objects + */ +std::vector create_byte_range_infos_consecutive(int64_t total_bytes, + int64_t range_count); + +/** + * @brief Create a byte_range_info which represents as much of a file as possible. Specifically, + * `[0, numeric_limit::max())`. + * + * @return `[0, numeric_limit::max())` + */ +byte_range_info create_byte_range_info_max(); + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 5e6dda5a514..3499b86ab42 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ namespace text { */ class device_data_chunk { public: + virtual ~device_data_chunk() = default; [[nodiscard]] virtual char const* data() const = 0; [[nodiscard]] virtual std::size_t size() const = 0; virtual operator device_span() const = 0; @@ -52,6 +53,9 @@ class device_data_chunk { */ class data_chunk_reader { public: + virtual ~data_chunk_reader() = default; + virtual void skip_bytes(std::size_t size) = 0; + /** * @brief Get the next chunk of bytes from the data source * @@ -76,6 +80,7 @@ class data_chunk_reader { */ class data_chunk_source { public: + virtual ~data_chunk_source() = default; [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index aeb4b7fff53..ffe159b59dc 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -89,6 +89,8 @@ class istream_data_chunk_reader : public data_chunk_reader { } } + void skip_bytes(std::size_t size) override { _datastream->ignore(size); }; + std::unique_ptr get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override { @@ -143,6 +145,12 @@ class device_span_data_chunk_reader : public data_chunk_reader { public: device_span_data_chunk_reader(device_span data) : _data(data) {} + void skip_bytes(std::size_t read_size) override + { + if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } + _position += read_size; + }; + std::unique_ptr get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override { diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 06d15276a68..a908a9fa227 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,20 +89,6 @@ struct trie_device_view { */ constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } - /** - * @brief returns the longest matching state of any state in the multistate. 
- */ - template - constexpr uint8_t get_match_length(multistate const& states) - { - int8_t val = 0; - for (uint8_t i = 0; i < states.size(); i++) { - auto match_length = get_match_length(states.get_tail(i)); - if (match_length > val) { val = match_length; } - } - return val; - } - private: constexpr void transition_enqueue_all( // char c, diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index d42ee9f510e..25f7ef98a81 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -27,10 +28,53 @@ namespace cudf { namespace io { namespace text { +/** + * @brief Splits the source text into a strings column using a multiple byte delimiter. + * + * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets + * of delimiters which begin within the range. If thinking in terms of "records", where each + * delimiter dictates the end of a record, all records which begin within the byte range provided + * will be returned, including any record which may begin in the range but end outside of the + * range. Records which begin outside of the range will ignored, even if those records end inside + * the range. + * + * @code{.pseudo} + * Examples: + * source: "abc..def..ghi..jkl.." + * delimiter: ".." 
+ * + * byte_range: nullopt + * return: ["abc..", "def..", "ghi..", jkl..", ""] + * + * byte_range: [0, 2) + * return: ["abc.."] + * + * byte_range: [2, 9) + * return: ["def..", "ghi.."] + * + * byte_range: [11, 2) + * return: [] + * + * byte_range: [13, 7) + * return: ["jkl..", ""] + * @endcode + * + * @param source The source string + * @param delimiter UTF-8 encoded string for which to find offsets in the source + * @param byte_range range in which to consider offsets relevant + * @param mr Memory resource to use for the device memory allocation + * @return The strings found by splitting the source by the delimiter within the relevant byte + * range. + */ std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::optional byte_range = std::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr multibyte_split(data_chunk_source const& source, + std::string const& delimiter, + rmm::mr::device_memory_resource* mr); } // namespace text } // namespace io diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp new file mode 100644 index 00000000000..290e0451839 --- /dev/null +++ b/cpp/src/io/text/byte_range_info.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace text { + +byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits::max()}; } + +std::vector create_byte_range_infos_consecutive(int64_t total_bytes, + int64_t range_count) +{ + auto range_size = util::div_rounding_up_safe(total_bytes, range_count); + auto ranges = std::vector(); + + ranges.reserve(range_size); + + for (int64_t i = 0; i < range_count; i++) { + auto offset = i * range_size; + auto size = std::min(range_size, total_bytes - offset); + ranges.emplace_back(offset, size); + } + + return ranges; +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index d287b9f2419..99f3bde3bf6 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,16 +18,24 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include +#include +#include +#include +#include + #include #include @@ -96,7 +104,7 @@ __global__ void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::scan_tile_status status = cudf::io::text::detail::scan_tile_status::invalid) { @@ -110,7 +118,7 @@ __global__ void multibyte_split_init_kernel( __global__ void multibyte_split_seed_kernel( cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, multistate tile_multistate_seed, uint32_t tile_output_offset) { @@ -124,17 +132,15 @@ __global__ void multibyte_split_seed_kernel( __global__ void multibyte_split_kernel( cudf::size_type base_tile_idx, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::trie_device_view trie, - int32_t chunk_input_offset, cudf::device_span chunk_input_chars, - cudf::device_span abs_output_delimiter_offsets, - cudf::device_span abs_output_chars) + cudf::device_span abs_output_delimiter_offsets) { using InputLoad = cub::BlockLoad; - using OffsetScan = cub::BlockScan; - using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { typename InputLoad::TempStorage input_load; @@ -166,7 +172,7 @@ __global__ void multibyte_split_kernel( // STEP 3: 
Flag matches - uint32_t thread_offsets[ITEMS_PER_THREAD]; + int64_t thread_offsets[ITEMS_PER_THREAD]; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { thread_offsets[i] = i < thread_input_size and trie.is_match(thread_states[i]); @@ -182,16 +188,11 @@ __global__ void multibyte_split_kernel( // Step 5: Assign outputs from each thread using match offsets. - if (abs_output_chars.size() > 0) { - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { - abs_output_chars[chunk_input_offset + thread_input_offset + i] = thread_chars[i]; - } - } - if (abs_output_delimiter_offsets.size() > 0) { for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { if (trie.is_match(thread_states[i])) { - auto const match_end = base_tile_idx * ITEMS_PER_TILE + thread_input_offset + i + 1; + auto const match_end = + static_cast(base_tile_idx) * ITEMS_PER_TILE + thread_input_offset + i + 1; abs_output_delimiter_offsets[thread_offsets[i]] = match_end; } } @@ -236,17 +237,16 @@ std::vector get_streams(int32_t count, rmm::cuda_stream_p return streams; } -cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, - cudf::io::text::detail::trie const& trie, - scan_tile_state& tile_multistates, - scan_tile_state& tile_offsets, - device_span output_buffer, - device_span output_char_buffer, - rmm::cuda_stream_view stream, - std::vector const& streams) +int64_t multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, + cudf::io::text::detail::trie const& trie, + scan_tile_state& tile_multistates, + scan_tile_state& tile_offsets, + device_span output_buffer, + rmm::cuda_stream_view stream, + std::vector const& streams) { CUDF_FUNC_RANGE(); - cudf::size_type chunk_offset = 0; + int64_t chunk_offset = 0; multibyte_split_init_kernel<<>>( // -TILES_PER_CHUNK, @@ -298,14 +298,14 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour tile_multistates, tile_offsets, trie.view(), - 
chunk_offset, *chunk, - output_buffer, - output_char_buffer); + output_buffer); cudaEventRecord(last_launch_event, chunk_stream); chunk_offset += chunk->size(); + + chunk.reset(); } cudaEventDestroy(last_launch_event); @@ -317,6 +317,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, + byte_range_info byte_range, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) @@ -336,7 +337,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_multistates = scan_tile_state(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); auto streams = get_streams(concurrency, stream_pool); @@ -345,52 +346,104 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source trie, tile_multistates, tile_offsets, - cudf::device_span(static_cast(nullptr), 0), - cudf::device_span(static_cast(nullptr), 0), + cudf::device_span(static_cast(nullptr), 0), stream, streams); // allocate results - auto num_tiles = cudf::util::div_rounding_up_safe(bytes_total, ITEMS_PER_TILE); - auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); - auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); - auto string_chars = rmm::device_uvector(bytes_total, stream, mr); + auto num_tiles = + cudf::util::div_rounding_up_safe(bytes_total, static_cast(ITEMS_PER_TILE)); + auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); + + auto string_offsets = rmm::device_uvector(num_results + 2, stream); // first and last element are set manually to zero and size of input, 
respectively. // kernel is only responsible for determining delimiter offsets - auto string_count = static_cast(string_offsets.size() - 1); string_offsets.set_element_to_zero_async(0, stream); - string_offsets.set_element_async(string_count, bytes_total, stream); + string_offsets.set_element_async(string_offsets.size() - 1, bytes_total, stream); + + // kernel needs to find first and last relevant offset., as well as count of relevant offsets. multibyte_split_scan_full_source( source, trie, tile_multistates, tile_offsets, - cudf::device_span(string_offsets).subspan(1, num_results), - string_chars, + cudf::device_span(string_offsets).subspan(1, num_results), stream, streams); + auto relevant_offsets_begin = thrust::lower_bound(rmm::exec_policy(stream), + string_offsets.begin(), + string_offsets.end() - 1, + byte_range.offset()); + + auto relevant_offsets_end = thrust::upper_bound(rmm::exec_policy(stream), + string_offsets.begin(), + string_offsets.end() - 1, + byte_range.offset() + byte_range.size()) + + 1; + + auto string_offsets_out_size = relevant_offsets_end - relevant_offsets_begin; + + auto string_offsets_out = rmm::device_uvector(string_offsets_out_size, stream, mr); + + auto relevant_offset_first = + string_offsets.element(relevant_offsets_begin - string_offsets.begin(), stream); + auto relevant_offset_last = + string_offsets.element(relevant_offsets_end - string_offsets.begin() - 1, stream); + + auto string_chars_size = relevant_offset_last - relevant_offset_first; + auto string_chars = rmm::device_uvector(string_chars_size, stream, mr); + + // copy relevant offsets and adjust them to be zero-based. 
+ thrust::transform(rmm::exec_policy(stream), + relevant_offsets_begin, + relevant_offsets_end, + string_offsets_out.begin(), + [relevant_offset_first] __device__(int64_t offset) { + return static_cast(offset - relevant_offset_first); + }); + + auto reader = source.create_reader(); + reader->skip_bytes(relevant_offset_first); + + auto relevant_bytes = reader->get_next_chunk(string_chars_size, stream); + + thrust::copy(rmm::exec_policy(stream), + relevant_bytes->data(), // + relevant_bytes->data() + relevant_bytes->size(), + string_chars.begin()); + + auto string_count = string_offsets_out.size() - 1; + return cudf::make_strings_column( - string_count, std::move(string_offsets), std::move(string_chars)); + string_count, std::move(string_offsets_out), std::move(string_chars)); } } // namespace detail std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, + std::optional byte_range, rmm::mr::device_memory_resource* mr) { auto stream = rmm::cuda_stream_default; auto stream_pool = rmm::cuda_stream_pool(2); - auto result = detail::multibyte_split(source, delimiter, stream, mr, stream_pool); - stream.synchronize(); + auto result = detail::multibyte_split( + source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool); return result; } +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + rmm::mr::device_memory_resource* mr) +{ + return multibyte_split(source, delimiter, std::nullopt, mr); +} + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 27a8be95e9b..cfd1a16f19a 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -21,6 +21,8 @@ #include #include +#include +#include #include #include #include @@ -142,4 +144,29 @@ TEST_F(MultibyteSplitTest, 
HandpickedInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); } +TEST_F(MultibyteSplitTest, LargeInputMultipleRange) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < 1000; i++) { + host_input += "...:|"; + } + + auto delimiter = std::string("...:|"); + auto source = cudf::io::text::make_source(host_input); + + auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); + auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); + auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); + auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + + auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd index 9ce0c68cb08..5b110d6234c 100644 --- a/python/cudf/cudf/_lib/cpp/io/text.pxd +++ b/python/cudf/cudf/_lib/cpp/io/text.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -6,6 +6,13 @@ from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column +cdef extern from "cudf/io/text/byte_range_info.hpp" \ + namespace "cudf::io::text" nogil: + + cdef cppclass byte_range_info: + byte_range_info() except + + byte_range_info(size_t offset, size_t size) except + + cdef extern from "cudf/io/text/data_chunk_source.hpp" \ namespace "cudf::io::text" nogil: @@ -25,3 +32,7 @@ cdef extern from "cudf/io/text/multibyte_split.hpp" \ unique_ptr[column] multibyte_split(data_chunk_source source, string delimiter) except + + + unique_ptr[column] multibyte_split(data_chunk_source source, + string delimiter, + byte_range_info byte_range) except + diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 9f33f32bdaf..daea227cc39 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import cudf @@ -10,6 +10,7 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.io.text cimport ( + byte_range_info, data_chunk_source, make_source, make_source_from_file, @@ -18,7 +19,8 @@ from cudf._lib.cpp.io.text cimport ( def read_text(object filepaths_or_buffers, - object delimiter=None): + object delimiter=None, + object byte_range=None): """ Cython function to call into libcudf API, see `multibyte_split`. 
@@ -31,9 +33,25 @@ def read_text(object filepaths_or_buffers, cdef unique_ptr[data_chunk_source] datasource cdef unique_ptr[column] c_col - - with nogil: - datasource = move(make_source_from_file(filename)) - c_col = move(multibyte_split(dereference(datasource), delim)) + cdef size_t c_byte_range_offset + cdef size_t c_byte_range_size + cdef byte_range_info c_byte_range + + if (byte_range is not None): + c_byte_range_offset = byte_range[0] + c_byte_range_size = byte_range[1] + with nogil: + datasource = move(make_source_from_file(filename)) + c_byte_range = byte_range_info( + c_byte_range_offset, + c_byte_range_size) + c_col = move(multibyte_split( + dereference(datasource), + delim, + c_byte_range)) + else: + with nogil: + datasource = move(make_source_from_file(filename)) + c_col = move(multibyte_split(dereference(datasource), delim)) return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 705645b8349..04809f8fd59 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from io import BytesIO, StringIO @@ -12,7 +12,7 @@ @annotate("READ_TEXT", color="purple", domain="cudf_python") @ioutils.doc_read_text() def read_text( - filepath_or_buffer, delimiter=None, **kwargs, + filepath_or_buffer, delimiter=None, byte_range=None, **kwargs, ): """{docstring}""" @@ -24,5 +24,7 @@ def read_text( ) return cudf.Series._from_data( - libtext.read_text(filepath_or_buffer, delimiter=delimiter,) + libtext.read_text( + filepath_or_buffer, delimiter=delimiter, byte_range=byte_range + ) ) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 5ff66fc750f..fb6505f5f92 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. import numpy as np import pytest @@ -778,3 +778,54 @@ def test_read_text(datadir): actual = cudf.read_text(chess_file, delimiter=delimiter) assert_eq(expected, actual) + + +def test_read_text_byte_range(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file, "r") as f: + data = f.read() + content = data.split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) + + actual_0 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 0, byte_range_size], + ) + actual_1 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 1, byte_range_size], + ) + actual_2 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 2, byte_range_size], + ) + + actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) + + assert_eq(expected, actual) + + +def test_read_text_byte_range_large(datadir): + content = str(("\n" if x % 5 == 0 else "x") for x in range(0, 300000000)) + delimiter = "1." + temp_file = str(datadir) + "/temp.txt" + + with open(temp_file, "w") as f: + f.write(content) + + cudf.read_text(temp_file, delimiter=delimiter) From 1217f24d97c3559e15293040ebda7914f00cb25e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:52:16 -0500 Subject: [PATCH 3/9] Create a dispatcher for invoking regex kernel functions (#10349) Closes #10138 Refactor the various regex function calls to use a dispatcher instead of if-else clauses. Each regex call currently requires different stack sizes (and later launch parameters). 
Changes to these parameters are sometimes difficult to coordinate since they usually need to be duplicated across about 10 APIs that are currently using regex calls. The new `regex_dispatcher` makes calling these much cleaner and easier to maintain. This will be helpful when experimenting with possibly using different launch parameters. No functions have changed. Mostly this is a refactoring and cleanup effort. The `findall.cu` was also recoded to use the new `count_matches` utility. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Christopher Harris (https://github.com/cwharris) - Bradley Dice (https://github.com/bdice) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/10349 --- cpp/src/strings/contains.cu | 200 +++++++++-------------- cpp/src/strings/count_matches.cu | 48 +++--- cpp/src/strings/extract/extract.cu | 62 +++---- cpp/src/strings/extract/extract_all.cu | 54 +++--- cpp/src/strings/regex/dispatcher.hpp | 59 +++++++ cpp/src/strings/replace/backref_re.cu | 87 +++++----- cpp/src/strings/replace/multi_re.cu | 114 ++++++------- cpp/src/strings/replace/replace_re.cu | 83 ++++------ cpp/src/strings/search/findall.cu | 155 ++++++------------ cpp/src/strings/search/findall_record.cu | 54 +++--- cpp/src/strings/split/split_re.cu | 43 +++-- 11 files changed, 452 insertions(+), 507 deletions(-) create mode 100644 cpp/src/strings/regex/dispatcher.hpp diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index efdee65c1f6..23bc5cf2dfe 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,6 +14,10 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include @@ -23,123 +27,90 @@ #include #include #include -#include -#include #include #include +#include + namespace cudf { namespace strings { namespace detail { + namespace { /** * @brief This functor handles both contains_re and match_re to minimize the number * of regex calls to find() to be inlined greatly reducing compile time. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. */ template struct contains_fn { reprog_device prog; - column_device_view d_strings; - bool bmatch{false}; // do not make this a template parameter to keep compile times down + column_device_view const d_strings; + bool const beginning_only; // do not make this a template parameter to keep compile times down __device__ bool operator()(size_type idx) { if (d_strings.is_null(idx)) return false; - string_view d_str = d_strings.element(idx); - int32_t begin = 0; - int32_t end = bmatch ? 1 // match only the beginning of the string; - : -1; // this handles empty strings too + auto const d_str = d_strings.element(idx); + int32_t begin = 0; + int32_t end = beginning_only ? 
1 // match only the beginning of the string; + : -1; // match anywhere in the string return static_cast(prog.find(idx, d_str, begin, end)); } }; -// -std::unique_ptr contains_util( - strings_column_view const& strings, - std::string const& pattern, - regex_flags const flags, - bool beginning_only = false, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - - // create the output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); +struct contains_dispatch_fn { + reprog_device d_prog; + bool const beginning_only; - // fill the output column - int regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - else + template + std::unique_ptr operator()(strings_column_view const& input, + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + + auto const d_strings = column_device_view::create(input.parent(), stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - - results->set_null_count(strings.null_count()); - return results; -} + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + contains_fn{d_prog, *d_strings, beginning_only}); + return results; + } +}; } // namespace std::unique_ptr contains_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, flags, false, stream, mr); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, false}, input, stream, mr); } std::unique_ptr matches_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, flags, true, stream, mr); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr); } } // namespace detail @@ -172,12 +143,12 @@ namespace { template struct count_fn { reprog_device prog; - column_device_view d_strings; + column_device_view 
const d_strings; __device__ int32_t operator()(unsigned int idx) { if (d_strings.is_null(idx)) return 0; - string_view d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); auto const nchars = d_str.length(); int32_t find_count = 0; int32_t begin = 0; @@ -191,62 +162,45 @@ struct count_fn { } }; +struct count_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto results = make_numeric_column(data_type{type_id::INT32}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + + auto const d_strings = column_device_view::create(input.parent(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + count_fn{d_prog, *d_strings}); + return results; + } +}; + } // namespace std::unique_ptr count_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - - // create the output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - - // fill the output column - int regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) - 
thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); - results->set_null_count(strings.null_count()); - return results; + return regex_dispatcher(*d_prog, count_dispatch_fn{*d_prog}, input, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index d0a6825666b..ae996cafd2c 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -54,6 +55,27 @@ struct count_matches_fn { return count; } }; + +struct count_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(column_device_view const& d_strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto results = make_numeric_column( + data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_strings.size()), + results->mutable_view().data(), + count_matches_fn{d_strings, d_prog}); + return results; + } +}; + } // namespace /** @@ -71,31 
+93,7 @@ std::unique_ptr count_matches(column_device_view const& d_strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Create output column - auto counts = make_numeric_column( - data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto d_counts = counts->mutable_view().data(); - - auto begin = thrust::make_counting_iterator(0); - auto end = thrust::make_counting_iterator(d_strings.size()); - - // Count matches - auto const regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } else if (regex_insts <= RX_LARGE_INSTS) { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } else { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } - - return counts; + return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index a67af9442f0..7394cdac6bb 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -77,53 +78,44 @@ struct extract_fn { thrust::fill(thrust::seq, d_output.begin(), d_output.end(), string_index_pair{nullptr, 0}); } }; + +struct extract_dispatch_fn { + reprog_device d_prog; + + template + void operator()(column_device_view const& d_strings, + cudf::detail::device_2dspan& d_indices, + rmm::cuda_stream_view stream) + { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + extract_fn{d_prog, d_strings, d_indices}); + } +}; } // namespace // std::unique_ptr extract( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const strings_count = strings.size(); - auto const strings_column = column_device_view::create(strings.parent(), stream); - auto const d_strings = *strings_column; - // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - // extract should include groups - auto const groups = d_prog.group_counts(); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); - rmm::device_uvector indices(strings_count * groups, stream); - cudf::detail::device_2dspan d_indices(indices.data(), strings_count, groups); + auto indices = rmm::device_uvector(input.size() * groups, stream); + auto d_indices = + cudf::detail::device_2dspan(indices.data(), input.size(), groups); - auto const regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } else 
if (regex_insts <= RX_MEDIUM_INSTS) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } else if (regex_insts <= RX_LARGE_INSTS) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } else { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } + auto const d_strings = column_device_view::create(input.parent(), stream); + regex_dispatcher(*d_prog, extract_dispatch_fn{*d_prog}, *d_strings, d_indices, stream); // build a result column for each group std::vector> results(groups); @@ -135,7 +127,7 @@ std::unique_ptr
extract( 0, [column_index, groups] __device__(size_type idx) { return (idx * groups) + column_index; })); - return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + return make_strings_column(indices_itr, indices_itr + input.size(), stream, mr); }; std::transform(thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index e27dccb9338..1f1474c777b 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -86,6 +87,28 @@ struct extract_fn { } } }; + +struct extract_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(column_device_view const& d_strings, + size_type total_groups, + offset_type const* d_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + rmm::device_uvector indices(total_groups, stream); + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + extract_fn{d_strings, d_prog, d_offsets, indices.data()}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); + } +}; + } // namespace /** @@ -94,14 +117,14 @@ struct extract_fn { * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_all_record( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const strings_count = strings.size(); - auto const d_strings = column_device_view::create(strings.parent(), stream); + auto const strings_count = input.size(); + auto const d_strings = column_device_view::create(input.parent(), stream); // Compile regex into device object. 
auto d_prog = @@ -143,29 +166,8 @@ std::unique_ptr extract_all_record( auto const total_groups = cudf::detail::get_value(offsets->view(), strings_count, stream); - // Create an indices vector with the total number of groups that will be extracted. - rmm::device_uvector indices(total_groups, stream); - auto d_indices = indices.data(); - auto begin = thrust::make_counting_iterator(0); - - // Call the extract functor to fill in the indices vector. - auto const regex_insts = d_prog->insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_LARGE_INSTS) { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } - - // Build the child strings column from the indices. - auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto strings_output = regex_dispatcher( + *d_prog, extract_dispatch_fn{*d_prog}, *d_strings, total_groups, d_offsets, stream, mr); // Build the lists column from the offsets and the strings. return make_lists_column(strings_count, diff --git a/cpp/src/strings/regex/dispatcher.hpp b/cpp/src/strings/regex/dispatcher.hpp new file mode 100644 index 00000000000..9ff51d1c979 --- /dev/null +++ b/cpp/src/strings/regex/dispatcher.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * The stack is used to keep progress (state) on evaluating the regex instructions on each string. + * So the size of the stack is in proportion to the number of instructions in the given regex + * pattern. + * + * There are four call types based on the number of regex instructions in the given pattern. + * Small, medium, and large instruction counts can use the stack effectively. + * Smaller stack sizes execute faster. + * + * Patterns with instruction counts bigger than large use global memory rather than the stack + * for managing the evaluation state data. + * + * @tparam Functor The functor to invoke with stack size templated value. + * @tparam Ts Parameter types for the functor call. + */ +template +constexpr decltype(auto) regex_dispatcher(reprog_device d_prog, Functor f, Ts&&... 
args) +{ + auto const num_regex_insts = d_prog.insts_counts(); + if (num_regex_insts <= RX_SMALL_INSTS) { + return f.template operator()(std::forward(args)...); + } + if (num_regex_insts <= RX_MEDIUM_INSTS) { + return f.template operator()(std::forward(args)...); + } + if (num_regex_insts <= RX_LARGE_INSTS) { + return f.template operator()(std::forward(args)...); + } + + return f.template operator()(std::forward(args)...); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index ff86d7aa552..27e0bd4fac9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "backref_re.cuh" +#include #include #include @@ -95,27 +96,54 @@ std::pair> parse_backrefs(std::string con return {rtn, backrefs}; } +template +struct replace_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(strings_column_view const& input, + string_view const& d_repl_template, + Iterator backrefs_begin, + Iterator backrefs_end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_strings = column_device_view::create(input.parent(), stream); + + auto children = make_strings_children( + backrefs_fn{ + *d_strings, d_prog, d_repl_template, backrefs_begin, backrefs_end}, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } +}; + } // namespace // std::unique_ptr replace_with_backrefs( - strings_column_view const& strings, + strings_column_view const& input, 
std::string const& pattern, std::string const& replacement, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); + if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); - auto d_strings = column_device_view::create(strings.parent(), stream); // compile regex into device object auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings.size(), stream); - auto const regex_insts = d_prog->insts_counts(); + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); // parse the repl string for back-ref indicators auto const parse_result = parse_backrefs(replacement); @@ -125,45 +153,14 @@ std::unique_ptr replace_with_backrefs( string_view const d_repl_template = repl_scalar.value(); using BackRefIterator = decltype(backrefs.begin()); - - // create child columns - auto [offsets, chars] = [&] { - if (regex_insts <= RX_SMALL_INSTS) { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } else if (regex_insts <= RX_LARGE_INSTS) { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } else { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } - }(); - - return 
make_strings_column(strings.size(), - std::move(offsets), - std::move(chars), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + return regex_dispatcher(*d_prog, + replace_dispatch_fn{*d_prog}, + input, + d_repl_template, + backrefs.begin(), + backrefs.end(), + stream, + mr); } } // namespace detail diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 2b5380b76dd..22f6d2cba39 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -30,6 +31,8 @@ #include +#include + namespace cudf { namespace strings { namespace detail { @@ -40,16 +43,6 @@ using found_range = thrust::pair; /** * @brief This functor handles replacing strings by applying the compiled regex patterns * and inserting the corresponding new string within the matched range of characters. - * - * The logic includes computing the size of each string and also writing the output. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. Shorter patterns are common in data cleaning. 
*/ template struct replace_multi_regex_fn { @@ -127,69 +120,76 @@ struct replace_multi_regex_fn { } }; +struct replace_dispatch_fn { + template + std::unique_ptr operator()(strings_column_view const& input, + device_span d_progs, + strings_column_view const& replacements, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_repls = column_device_view::create(replacements.parent(), stream); + + auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); + + auto children = make_strings_children( + replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } +}; + } // namespace std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); - if (patterns.empty()) // no patterns; just return a copy - return std::make_unique(strings.parent(), stream, mr); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } + if (patterns.empty()) { // if no patterns; just return a copy + return std::make_unique(input.parent(), stream, mr); + } CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have any nulls"); - auto d_strings = column_device_view::create(strings.parent(), stream); - auto d_repls = column_device_view::create(replacements.parent(), stream); - auto d_char_table = get_character_flags_table(); - // compile regexes into 
device objects - size_type regex_insts = 0; - std::vector>> h_progs; - std::vector progs; - for (auto itr = patterns.begin(); itr != patterns.end(); ++itr) { - auto prog = reprog_device::create(*itr, flags, d_char_table, strings_count, stream); - regex_insts = std::max(regex_insts, prog->insts_counts()); - progs.push_back(*prog); - h_progs.emplace_back(std::move(prog)); - } + auto const d_char_table = get_character_flags_table(); + auto h_progs = std::vector>>( + patterns.size()); + std::transform(patterns.begin(), + patterns.end(), + h_progs.begin(), + [flags, d_char_table, input, stream](auto const& ptn) { + return reprog_device::create(ptn, flags, d_char_table, input.size(), stream); + }); + + // get the longest regex for the dispatcher + auto const max_prog = + std::max_element(h_progs.begin(), h_progs.end(), [](auto const& lhs, auto const& rhs) { + return lhs->insts_counts() < rhs->insts_counts(); + }); // copy all the reprog_device instances to a device memory array + std::vector progs; + std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), [](auto const& d_prog) { + return *d_prog; + }); auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); - // create working buffer for ranges pairs - rmm::device_uvector found_ranges(patterns.size() * strings_count, stream); - auto d_found_ranges = found_ranges.data(); - - // create child columns - auto children = [&] { - // Each invocation is predicated on the stack size which is dependent on the number of regex - // instructions - if (regex_insts <= RX_SMALL_INSTS) { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_LARGE_INSTS) { - replace_multi_regex_fn fn{*d_strings, d_progs, 
d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } else { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } - }(); - - return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + return regex_dispatcher( + **max_prog, replace_dispatch_fn{}, input, d_progs, replacements, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 2c594bb86a8..d42359deeac 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -36,16 +37,6 @@ namespace { /** * @brief This functor handles replacing strings by applying the compiled regex pattern * and inserting the new string within the matched range of characters. - * - * The logic includes computing the size of each string and also writing the output. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. Shorter patterns are common in data cleaning. 
*/ template struct replace_regex_fn { @@ -108,11 +99,37 @@ struct replace_regex_fn { } }; +struct replace_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(strings_column_view const& input, + string_view const& d_replacement, + size_type max_replace_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_strings = column_device_view::create(input.parent(), stream); + + auto children = make_strings_children( + replace_regex_fn{*d_strings, d_prog, d_replacement, max_replace_count}, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } +}; + } // namespace // std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, string_scalar const& replacement, std::optional max_replace_count, @@ -120,49 +137,19 @@ std::unique_ptr replace_re( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); string_view d_repl(replacement.data(), replacement.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - auto const regex_insts = d_prog.insts_counts(); - - // copy null mask - auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - auto const null_count = strings.null_count(); - auto const 
maxrepl = max_replace_count.value_or(-1); - - // create child columns - auto children = [&] { - // Each invocation is predicated on the stack size which is dependent on the number of regex - // instructions - if (regex_insts <= RX_SMALL_INSTS) { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_LARGE_INSTS) { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } else { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } - }(); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + auto const maxrepl = max_replace_count.value_or(-1); - return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), - null_count, - std::move(null_mask)); + return regex_dispatcher( + *d_prog, replace_dispatch_fn{*d_prog}, input, d_repl, maxrepl, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 810e44cc27d..201556033ad 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -14,6 +14,11 @@ * limitations under the License. 
*/ +#include +#include +#include +#include + #include #include #include @@ -24,19 +29,16 @@ #include #include -#include -#include - #include #include -#include +#include +#include namespace cudf { namespace strings { namespace detail { using string_index_pair = thrust::pair; -using findall_result = thrust::pair; namespace { /** @@ -47,27 +49,20 @@ template struct findall_fn { column_device_view const d_strings; reprog_device prog; - size_type column_index; + size_type const column_index; size_type const* d_counts; - findall_fn(column_device_view const& d_strings, - reprog_device& prog, - size_type column_index = -1, - size_type const* d_counts = nullptr) - : d_strings(d_strings), prog(prog), column_index(column_index), d_counts(d_counts) + __device__ string_index_pair operator()(size_type idx) { - } + if (d_strings.is_null(idx) || (column_index >= d_counts[idx])) { + return string_index_pair{nullptr, 0}; + } + + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); + int32_t spos = 0; + auto epos = static_cast(nchars); - // this will count columns as well as locate a specific string for a column - __device__ findall_result findall(size_type idx) - { - string_index_pair result{nullptr, 0}; - if (d_strings.is_null(idx) || (d_counts && (column_index >= d_counts[idx]))) - return findall_result{0, result}; - string_view d_str = d_strings.element(idx); - auto const nchars = d_str.length(); - int32_t spos = 0; - auto epos = static_cast(nchars); size_type column_count = 0; while (spos <= nchars) { if (prog.find(idx, d_str, spos, epos) <= 0) break; // no more matches found @@ -76,36 +71,40 @@ struct findall_fn { epos = static_cast(nchars); ++column_count; } - if (spos <= epos) { - spos = d_str.byte_offset(spos); // convert - epos = d_str.byte_offset(epos); // to bytes - result = string_index_pair{d_str.data() + spos, (epos - spos)}; - } - // return the strings location and the column count - return findall_result{column_count, result}; - } - 
__device__ string_index_pair operator()(size_type idx) - { - // this one only cares about the string - return findall(idx).second; + auto const result = [&] { + if (spos > epos) { return string_index_pair{nullptr, 0}; } + // convert character positions to byte positions + spos = d_str.byte_offset(spos); + epos = d_str.byte_offset(epos); + return string_index_pair{d_str.data() + spos, (epos - spos)}; + }(); + + return result; } }; -template -struct findall_count_fn : public findall_fn { - findall_count_fn(column_device_view const& strings, reprog_device& prog) - : findall_fn{strings, prog} - { - } +struct findall_dispatch_fn { + reprog_device d_prog; - __device__ size_type operator()(size_type idx) + template + std::unique_ptr operator()(column_device_view const& d_strings, + size_type column_index, + size_type const* d_find_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // this one only cares about the column count - return findall_fn::findall(idx).first; + rmm::device_uvector indices(d_strings.size(), stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_strings.size()), + indices.begin(), + findall_fn{d_strings, d_prog, column_index, d_find_counts}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); } }; - } // namespace // @@ -124,38 +123,15 @@ std::unique_ptr
findall( reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto const regex_insts = d_prog->insts_counts(); - rmm::device_uvector find_counts(strings_count, stream); - auto d_find_counts = find_counts.data(); - - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); + auto find_counts = + count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto d_find_counts = find_counts->mutable_view().data(); std::vector> results; size_type const columns = thrust::reduce( - rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{}); + rmm::exec_policy(stream), d_find_counts, d_find_counts + strings_count, 0, thrust::maximum{}); + // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) results.emplace_back(std::make_unique( @@ -166,39 +142,10 @@ std::unique_ptr
findall( strings_count)); for (int32_t column_index = 0; column_index < columns; ++column_index) { - rmm::device_uvector indices(strings_count, stream); - - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - - // - results.emplace_back(make_strings_column(indices.begin(), indices.end(), stream, mr)); + results.emplace_back(regex_dispatcher( + *d_prog, findall_dispatch_fn{*d_prog}, *d_strings, column_index, d_find_counts, stream, mr)); } + return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu index c93eb0c17db..95e347a7c35 100644 --- a/cpp/src/strings/search/findall_record.cu +++ b/cpp/src/strings/search/findall_record.cu @@ -15,6 +15,9 @@ */ #include +#include +#include +#include #include #include @@ -26,9 +29,6 @@ #include #include -#include -#include - #include #include @@ -75,6 +75,27 @@ struct findall_fn { } }; +struct findall_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(column_device_view const& d_strings, + size_type total_matches, + offset_type const* d_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + rmm::device_uvector indices(total_matches, stream); + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + findall_fn{d_strings, d_prog, d_offsets, indices.data()}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); + } +}; + } // namespace // @@ -121,30 +142,11 @@ std::unique_ptr findall_record( rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // Create indices vector with the total number of groups that will be extracted - auto total_matches = cudf::detail::get_value(offsets->view(), strings_count, stream); - - rmm::device_uvector indices(total_matches, stream); - auto d_indices = indices.data(); - auto begin = thrust::make_counting_iterator(0); - - // Build the string indices - auto const regex_insts = d_prog->insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_LARGE_INSTS) { - findall_fn fn{*d_strings, 
*d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } + auto const total_matches = + cudf::detail::get_value(offsets->view(), strings_count, stream); - // Build the child strings column from the resulting indices - auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto strings_output = regex_dispatcher( + *d_prog, findall_dispatch_fn{*d_prog}, *d_strings, total_matches, d_offsets, stream, mr); // Build the lists column from the offsets and the strings return make_lists_column(strings_count, diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index d80148f2fe6..a8a2467dd76 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -110,6 +111,28 @@ struct token_reader_fn { } }; +struct generate_dispatch_fn { + reprog_device d_prog; + + template + rmm::device_uvector operator()(column_device_view const& d_strings, + size_type total_tokens, + split_direction direction, + offset_type const* d_offsets, + rmm::cuda_stream_view stream) + { + rmm::device_uvector tokens(total_tokens, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + token_reader_fn{d_strings, d_prog, direction, d_offsets, tokens.data()}); + + return tokens; + } +}; + /** * @brief Call regex to split each input string into tokens. 
* @@ -148,24 +171,8 @@ rmm::device_uvector generate_tokens(column_device_view const& // the last offset entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); - // generate tokens for each string - rmm::device_uvector tokens(total_tokens, stream); - auto const regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } else if (regex_insts <= RX_LARGE_INSTS) { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } else { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } - - return tokens; + return regex_dispatcher( + d_prog, generate_dispatch_fn{d_prog}, d_strings, total_tokens, direction, d_offsets, stream); } /** From 71206944b0094fd440d4a5d7fa9fa1aaa3089583 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 2 Mar 2022 09:21:27 -0800 Subject: [PATCH 4/9] Implement a mixin for binops (#10360) This PR builds on the framework introduced in https://github.com/rapidsai/cudf/pull/9925 to implement binary operations. I plan to apply this mixin to ColumnBase as well, but that will require more work to clean up binary operations for column types and it is large enough to merit a separate PR. Contributes to #10177. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10360 --- python/cudf/cudf/core/frame.py | 131 ++++-------------- python/cudf/cudf/core/index.py | 54 ++------ python/cudf/cudf/core/indexed_frame.py | 10 +- python/cudf/cudf/core/mixins/__init__.py | 3 +- python/cudf/cudf/core/mixins/binops.py | 56 ++++++++ python/cudf/cudf/core/mixins/binops.pyi | 88 ++++++++++++ python/cudf/cudf/core/mixins/mixin_factory.py | 12 +- python/cudf/cudf/core/mixins/reductions.pyi | 4 +- python/cudf/cudf/core/scalar.py | 70 +--------- python/cudf/cudf/core/series.py | 6 +- python/cudf/cudf/utils/utils.py | 6 - 11 files changed, 212 insertions(+), 228 deletions(-) create mode 100644 python/cudf/cudf/core/mixins/binops.py create mode 100644 python/cudf/cudf/core/mixins/binops.pyi diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0fd7848c7d1..3d36d3bd893 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,6 +46,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi +from cudf.core.mixins import BinaryOperand from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring @@ -97,7 +98,7 @@ } -class Frame: +class Frame(BinaryOperand): """A collection of Column objects with an optional index. 
Parameters @@ -114,6 +115,8 @@ class Frame: _index: Optional[cudf.core.index.BaseIndex] _names: Optional[List] + _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + def __init__(self, data=None, index=None): if data is None: data = {} @@ -3555,13 +3558,7 @@ def _unaryop(self, op): ) def _binaryop( - self, - other: T, - fn: str, - fill_value: Any = None, - reflect: bool = False, - *args, - **kwargs, + self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> Frame: """Perform a binary operation between two frames. @@ -3569,15 +3566,11 @@ def _binaryop( ---------- other : Frame The second operand. - fn : str + op : str The operation to perform. fill_value : Any, default None The value to replace null values with. If ``None``, nulls are not filled before the operation. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. Returns ------- @@ -3617,6 +3610,7 @@ def _colwise_binop( A dict of columns constructed from the result of performing the requested operation on the operands. """ + fn = fn[2:-2] # Now actually perform the binop on the columns in left and right. output = {} @@ -3899,83 +3893,12 @@ def dot(self, other, reflect=False): return cudf.DataFrame(result) return result.item() - # Binary arithmetic operations. 
- def __add__(self, other): - return self._binaryop(other, "add") - - def __radd__(self, other): - return self._binaryop(other, "add", reflect=True) - - def __sub__(self, other): - return self._binaryop(other, "sub") - - def __rsub__(self, other): - return self._binaryop(other, "sub", reflect=True) - def __matmul__(self, other): return self.dot(other) def __rmatmul__(self, other): return self.dot(other, reflect=True) - def __mul__(self, other): - return self._binaryop(other, "mul") - - def __rmul__(self, other): - return self._binaryop(other, "mul", reflect=True) - - def __mod__(self, other): - return self._binaryop(other, "mod") - - def __rmod__(self, other): - return self._binaryop(other, "mod", reflect=True) - - def __pow__(self, other): - return self._binaryop(other, "pow") - - def __rpow__(self, other): - return self._binaryop(other, "pow", reflect=True) - - def __floordiv__(self, other): - return self._binaryop(other, "floordiv") - - def __rfloordiv__(self, other): - return self._binaryop(other, "floordiv", reflect=True) - - def __truediv__(self, other): - return self._binaryop(other, "truediv") - - def __rtruediv__(self, other): - return self._binaryop(other, "truediv", reflect=True) - - def __and__(self, other): - return self._binaryop(other, "and") - - def __or__(self, other): - return self._binaryop(other, "or") - - def __xor__(self, other): - return self._binaryop(other, "xor") - - # Binary rich comparison operations. 
- def __eq__(self, other): - return self._binaryop(other, "eq") - - def __ne__(self, other): - return self._binaryop(other, "ne") - - def __lt__(self, other): - return self._binaryop(other, "lt") - - def __le__(self, other): - return self._binaryop(other, "le") - - def __gt__(self, other): - return self._binaryop(other, "gt") - - def __ge__(self, other): - return self._binaryop(other, "ge") - # Unary logical operators def __neg__(self): return -1 * self @@ -5185,7 +5108,7 @@ def add(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "add", fill_value) + return self._binaryop(other, "__add__", fill_value) @annotate("FRAME_RADD", color="green", domain="cudf_python") def radd(self, other, axis, level=None, fill_value=None): @@ -5265,7 +5188,7 @@ def radd(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "add", fill_value, reflect=True) + return self._binaryop(other, "__radd__", fill_value) @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") def subtract(self, other, axis, level=None, fill_value=None): @@ -5346,7 +5269,7 @@ def subtract(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "sub", fill_value) + return self._binaryop(other, "__sub__", fill_value) sub = subtract @@ -5432,7 +5355,7 @@ def rsub(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "sub", fill_value, reflect=True) + return self._binaryop(other, "__rsub__", fill_value) @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") def multiply(self, other, axis, level=None, fill_value=None): @@ -5515,7 +5438,7 @@ def 
multiply(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mul", fill_value) + return self._binaryop(other, "__mul__", fill_value) mul = multiply @@ -5602,7 +5525,7 @@ def rmul(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mul", fill_value, reflect=True) + return self._binaryop(other, "__rmul__", fill_value) @annotate("FRAME_MOD", color="green", domain="cudf_python") def mod(self, other, axis, level=None, fill_value=None): @@ -5673,7 +5596,7 @@ def mod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mod", fill_value) + return self._binaryop(other, "__mod__", fill_value) @annotate("FRAME_RMOD", color="green", domain="cudf_python") def rmod(self, other, axis, level=None, fill_value=None): @@ -5756,7 +5679,7 @@ def rmod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mod", fill_value, reflect=True) + return self._binaryop(other, "__rmod__", fill_value) @annotate("FRAME_POW", color="green", domain="cudf_python") def pow(self, other, axis, level=None, fill_value=None): @@ -5836,7 +5759,7 @@ def pow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "pow", fill_value) + return self._binaryop(other, "__pow__", fill_value) @annotate("FRAME_RPOW", color="green", domain="cudf_python") def rpow(self, other, axis, level=None, fill_value=None): @@ -5916,7 +5839,7 @@ def rpow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level 
parameter is not supported yet.") - return self._binaryop(other, "pow", fill_value, reflect=True) + return self._binaryop(other, "__rpow__", fill_value) @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") def floordiv(self, other, axis, level=None, fill_value=None): @@ -5996,7 +5919,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "floordiv", fill_value) + return self._binaryop(other, "__floordiv__", fill_value) @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") def rfloordiv(self, other, axis, level=None, fill_value=None): @@ -6093,7 +6016,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "floordiv", fill_value, reflect=True) + return self._binaryop(other, "__rfloordiv__", fill_value) @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") def truediv(self, other, axis, level=None, fill_value=None): @@ -6178,7 +6101,7 @@ def truediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "truediv", fill_value) + return self._binaryop(other, "__truediv__", fill_value) # Alias for truediv div = truediv @@ -6272,7 +6195,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "truediv", fill_value, reflect=True) + return self._binaryop(other, "__rtruediv__", fill_value) # Alias for rtruediv rdiv = rtruediv @@ -6350,7 +6273,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ return self._binaryop( - other=other, fn="eq", fill_value=fill_value, can_reindex=True + other=other, op="__eq__", 
fill_value=fill_value, can_reindex=True ) @annotate("FRAME_NE", color="green", domain="cudf_python") @@ -6426,7 +6349,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="ne", fill_value=fill_value, can_reindex=True + other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) @annotate("FRAME_LT", color="green", domain="cudf_python") @@ -6502,7 +6425,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="lt", fill_value=fill_value, can_reindex=True + other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) @annotate("FRAME_LE", color="green", domain="cudf_python") @@ -6578,7 +6501,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="le", fill_value=fill_value, can_reindex=True + other=other, op="__le__", fill_value=fill_value, can_reindex=True ) @annotate("FRAME_GT", color="green", domain="cudf_python") @@ -6654,7 +6577,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="gt", fill_value=fill_value, can_reindex=True + other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) @annotate("FRAME_GE", color="green", domain="cudf_python") @@ -6730,7 +6653,7 @@ def ge(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="ge", fill_value=fill_value, can_reindex=True + other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) def nunique(self, method: builtins.str = "sort", dropna: bool = True): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5aab834d452..343ba33ece1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,6 +52,7 @@ from 
cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame +from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type @@ -122,7 +123,7 @@ def _index_from_columns( return _index_from_data(dict(zip(range(len(columns)), columns)), name=name) -class RangeIndex(BaseIndex): +class RangeIndex(BaseIndex, BinaryOperand): """ Immutable Index implementing a monotonic integer range. @@ -155,6 +156,8 @@ class RangeIndex(BaseIndex): RangeIndex(start=1, stop=10, step=1, name='a') """ + _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + _range: range def __init__( @@ -698,43 +701,16 @@ def _apply_boolean_mask(self, boolean_mask): [self._values.apply_boolean_mask(boolean_mask)], [self.name] ) + def _binaryop(self, other, op: str): + return self._as_int64()._binaryop(other, op=op) + # Patch in all binops and unary ops, which bypass __getattr__ on the instance # and prevent the above overload from working. 
-for binop in ( - "__add__", - "__radd__", - "__sub__", - "__rsub__", - "__mod__", - "__rmod__", - "__pow__", - "__rpow__", - "__floordiv__", - "__rfloordiv__", - "__truediv__", - "__rtruediv__", - "__and__", - "__or__", - "__xor__", - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", -): - setattr( - RangeIndex, - binop, - lambda self, other, op=binop: getattr(self._as_int64(), op)(other), - ) - - for unaop in ("__neg__", "__pos__", "__abs__"): setattr( RangeIndex, - binop, + unaop, lambda self, op=unaop: getattr(self._as_int64(), op)(), ) @@ -814,19 +790,15 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented def _binaryop( - self, - other: T, - fn: str, - fill_value: Any = None, - reflect: bool = False, - *args, - **kwargs, + self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> SingleColumnFrame: - # Specialize binops to generate the appropriate output index type. + reflect = self._is_reflected_op(op) + if reflect: + op = op[:2] + op[3:] operands = self._make_operands_for_binop(other, fill_value, reflect) if operands is NotImplemented: return NotImplemented - ret = _index_from_data(self._colwise_binop(operands, fn)) + ret = _index_from_data(self._colwise_binop(operands, op)) # pandas returns numpy arrays when the outputs are boolean. 
We # explicitly _do not_ use isinstance here: we want only boolean diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3ae0a838873..331457d17ae 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1697,21 +1697,23 @@ def last(self, offset): def _binaryop( self, other: Any, - fn: str, + op: str, fill_value: Any = None, - reflect: bool = False, can_reindex: bool = False, *args, **kwargs, ): + reflect = self._is_reflected_op(op) + if reflect: + op = op[:2] + op[3:] operands, out_index = self._make_operands_and_index_for_binop( - other, fn, fill_value, reflect, can_reindex + other, op, fill_value, reflect, can_reindex ) if operands is NotImplemented: return NotImplemented return self._from_data( - ColumnAccessor(type(self)._colwise_binop(operands, fn)), + ColumnAccessor(type(self)._colwise_binop(operands, op)), index=out_index, ) diff --git a/python/cudf/cudf/core/mixins/__init__.py b/python/cudf/cudf/core/mixins/__init__.py index cecf4c1c7ed..dd3dcd6d388 100644 --- a/python/cudf/cudf/core/mixins/__init__.py +++ b/python/cudf/cudf/core/mixins/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2022, NVIDIA CORPORATION. from .reductions import Reducible +from .binops import BinaryOperand -__all__ = ["Reducible"] +__all__ = ["Reducible", "BinaryOperand"] diff --git a/python/cudf/cudf/core/mixins/binops.py b/python/cudf/cudf/core/mixins/binops.py new file mode 100644 index 00000000000..773b47b62b2 --- /dev/null +++ b/python/cudf/cudf/core/mixins/binops.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .mixin_factory import _create_delegating_mixin + +BinaryOperand = _create_delegating_mixin( + "BinaryOperand", + "Mixin encapsulating binary operations.", + "BINARY_OPERATION", + "_binaryop", + { + # Numeric operations. 
+ "__add__", + "__sub__", + "__mul__", + "__matmul__", + "__truediv__", + "__floordiv__", + "__mod__", + # "__divmod__", # Not yet implemented + "__pow__", + # "__lshift__", # Not yet implemented + # "__rshift__", # Not yet implemented + "__and__", + "__xor__", + "__or__", + # Reflected numeric operations. + "__radd__", + "__rsub__", + "__rmul__", + "__rmatmul__", + "__rtruediv__", + "__rfloordiv__", + "__rmod__", + # "__rdivmod__", # Not yet implemented + "__rpow__", + # "__rlshift__", # Not yet implemented + # "__rrshift__", # Not yet implemented + "__rand__", + "__rxor__", + "__ror__", + # Rich comparison operations. + "__lt__", + "__le__", + "__eq__", + "__ne__", + "__gt__", + "__ge__", + }, +) + + +def _is_reflected_op(op): + return op[2] == "r" and op != "__rshift__" + + +BinaryOperand._is_reflected_op = staticmethod(_is_reflected_op) diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi new file mode 100644 index 00000000000..45093cd04d4 --- /dev/null +++ b/python/cudf/cudf/core/mixins/binops.pyi @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from typing import Set + +class BinaryOperand: + _SUPPORTED_BINARY_OPERATIONS: Set + + def __add__(self, other): + ... + + def __sub__(self, other): + ... + + def __mul__(self, other): + ... + + def __truediv__(self, other): + ... + + def __floordiv__(self, other): + ... + + def __mod__(self, other): + ... + + def __pow__(self, other): + ... + + def __and__(self, other): + ... + + def __xor__(self, other): + ... + + def __or__(self, other): + ... + + def __radd__(self, other): + ... + + def __rsub__(self, other): + ... + + def __rmul__(self, other): + ... + + def __rtruediv__(self, other): + ... + + def __rfloordiv__(self, other): + ... + + def __rmod__(self, other): + ... + + def __rpow__(self, other): + ... + + def __rand__(self, other): + ... + + def __rxor__(self, other): + ... + + def __ror__(self, other): + ... + + def __lt__(self, other): + ... 
+ + def __le__(self, other): + ... + + def __eq__(self, other): + ... + + def __ne__(self, other): + ... + + def __gt__(self, other): + ... + + def __ge__(self, other): + ... + + @staticmethod + def _is_reflected_op(op) -> bool: + ... diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index ecb18f61830..7bbb299d643 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -86,14 +86,18 @@ def _should_define_operation(cls, operation, base_operation_name): # At this point we know that the class has the operation defined but it # also overrides the base operation. Since this function is called before # the operation is defined on the current class, we know that it inherited - # the operation from a parent. We therefore have two possibilities: + # the operation from a parent. We therefore have three possibilities: # 1. A parent class manually defined the operation. That override takes # precedence even if the current class defined the base operation. # 2. A parent class has an auto-generated operation, i.e. it is of type # Operation and was created by OperationMixin.__init_subclass__. The # current class must override it so that its base operation is used # rather than the parent's base operation. + # 3. The method is defined for all classes, i.e. it is a method of object. for base_cls in cls.__mro__: + # We always override methods defined for object. + if base_cls is object: + return True # The first attribute in the MRO is the one that will be used. if operation in base_cls.__dict__: return isinstance(base_cls.__dict__[operation], Operation) @@ -216,6 +220,7 @@ def __init_subclass__(cls): # Only add the valid set of operations for a particular class. valid_operations = set() for base_cls in cls.__mro__: + # Check for sentinel indicating that all operations are valid. 
valid_operations |= getattr(base_cls, validity_attr, set()) invalid_operations = valid_operations - supported_operations @@ -251,9 +256,8 @@ def _operation(self, op: str, *args, **kwargs): ) setattr(OperationMixin, base_operation_name, _operation) - # This attribute is set in case lookup is convenient at a later point, but - # it is not strictly necessary since `supported_operations` is part of the - # closure associated with the class's creation. + # Making this attribute available makes it easy for subclasses to indicate + # that all supported operations for this mixin are valid. setattr(OperationMixin, supported_attr, supported_operations) return OperationMixin diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index 600f30e9372..3769b7c360e 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -1,8 +1,10 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from __future__ import annotations +from typing import Set class Reducible: + _SUPPORTED_REDUCTIONS: Set + def sum(self): ... 
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 134b94bf0f2..1c81803ed98 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -10,6 +10,7 @@ from cudf.core.column.column import ColumnBase from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.index import BaseIndex +from cudf.core.mixins import BinaryOperand from cudf.core.series import Series from cudf.utils.dtypes import ( get_allowed_combinations_for_operator, @@ -17,7 +18,7 @@ ) -class Scalar: +class Scalar(BinaryOperand): """ A GPU-backed scalar object with NumPy scalar like properties May be used in binary operations against other scalars, cuDF @@ -57,6 +58,8 @@ class Scalar: The data type """ + _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + def __init__(self, value, dtype=None): self._host_value = None @@ -211,69 +214,8 @@ def __float__(self): def __bool__(self): return bool(self.value) - # Scalar Binary Operations - def __add__(self, other): - return self._scalar_binop(other, "__add__") - - def __radd__(self, other): - return self._scalar_binop(other, "__radd__") - - def __sub__(self, other): - return self._scalar_binop(other, "__sub__") - - def __rsub__(self, other): - return self._scalar_binop(other, "__rsub__") - - def __mul__(self, other): - return self._scalar_binop(other, "__mul__") - - def __rmul__(self, other): - return self._scalar_binop(other, "__rmul__") - - def __truediv__(self, other): - return self._scalar_binop(other, "__truediv__") - - def __floordiv__(self, other): - return self._scalar_binop(other, "__floordiv__") - - def __rtruediv__(self, other): - return self._scalar_binop(other, "__rtruediv__") - - def __mod__(self, other): - return self._scalar_binop(other, "__mod__") - - def __divmod__(self, other): - return self._scalar_binop(other, "__divmod__") - - def __and__(self, other): - return self._scalar_binop(other, "__and__") - - def __xor__(self, other): - return 
self._scalar_binop(other, "__or__") - - def __pow__(self, other): - return self._scalar_binop(other, "__pow__") - - def __gt__(self, other): - return self._scalar_binop(other, "__gt__") - - def __lt__(self, other): - return self._scalar_binop(other, "__lt__") - - def __ge__(self, other): - return self._scalar_binop(other, "__ge__") - - def __le__(self, other): - return self._scalar_binop(other, "__le__") - - def __eq__(self, other): - return self._scalar_binop(other, "__eq__") - - def __ne__(self, other): - return self._scalar_binop(other, "__ne__") - def __round__(self, n): - return self._scalar_binop(n, "__round__") + return self._binaryop(n, "__round__") # Scalar Unary Operations def __abs__(self): @@ -330,7 +272,7 @@ def _binop_result_dtype_or_error(self, other, op): return cudf.dtype(out_dtype) - def _scalar_binop(self, other, op): + def _binaryop(self, other, op: str): if isinstance(other, (ColumnBase, Series, BaseIndex, np.ndarray)): # dispatch to column implementation return NotImplemented diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ec87fcdb066..fffce27c89a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1245,21 +1245,21 @@ def logical_and(self, other): "Series.logical_and is deprecated and will be removed.", FutureWarning, ) - return self._binaryop(other, "l_and").astype(np.bool_) + return self._binaryop(other, "__l_and__").astype(np.bool_) def remainder(self, other): warnings.warn( "Series.remainder is deprecated and will be removed.", FutureWarning, ) - return self._binaryop(other, "mod") + return self._binaryop(other, "__mod__") def logical_or(self, other): warnings.warn( "Series.logical_or is deprecated and will be removed.", FutureWarning, ) - return self._binaryop(other, "l_or").astype(np.bool_) + return self._binaryop(other, "__l_or__").astype(np.bool_) def logical_not(self): warnings.warn( diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py 
index 4dadfede866..315da4d8dd6 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -24,12 +24,6 @@ _EQUALITY_OPS = { - "eq", - "ne", - "lt", - "gt", - "le", - "ge", "__eq__", "__ne__", "__lt__", From b4d262d7d5886495614a9257b54c43b881a40a96 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 2 Mar 2022 10:35:03 -0800 Subject: [PATCH 5/9] Refactor array function (#10364) This PR cleans up the implementation of `__array_function__` for `Series` and `DataFrame` to bring them further into alignment. It also inlines a number of functions defined in `utils/utils.py` that were previously used only in `Series.__array_ufunc__`, building on the improvements in #10217, #10287, and #10346 to clear out methods related to the old `__array_ufunc__` dispatch that are now only used by this `__array_function__` implementation. Inlining these methods also allows significant simplification since they were handling cases that are no longer relevant or possible. Unlike those previous PRs, this one does not actually enable any new features. Although it should marginally accelerate array functions by simplifying the dispatch logic, the fact that this API makes few promises about the nature of the function being applied and our desire to have it "just work" as much as possible means that we must simply adopt an EAFP approach and return `NotImplemented` if any part of the process fails. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10364 --- python/cudf/cudf/core/dataframe.py | 58 ++++++------- python/cudf/cudf/core/series.py | 72 +++++++++++----- python/cudf/cudf/utils/utils.py | 131 ----------------------------- 3 files changed, 76 insertions(+), 185 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 287fd3796f4..4af8545e316 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1337,42 +1337,32 @@ def memory_usage(self, index=True, deep=False): @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): - - cudf_df_module = DataFrame - cudf_series_module = Series - - for submodule in func.__module__.split(".")[1:]: - # point cudf to the correct submodule - if hasattr(cudf_df_module, submodule): - cudf_df_module = getattr(cudf_df_module, submodule) - else: - return NotImplemented - - fname = func.__name__ - - handled_types = [cudf_df_module, cudf_series_module] - - for t in types: - if t not in handled_types: - return NotImplemented - - if hasattr(cudf_df_module, fname): - cudf_func = getattr(cudf_df_module, fname) - # Handle case if cudf_func is same as numpy function - if cudf_func is func: - return NotImplemented - # numpy returns an array from the dot product of two dataframes - elif ( - func is np.dot - and isinstance(args[0], (DataFrame, pd.DataFrame)) - and isinstance(args[1], (DataFrame, pd.DataFrame)) - ): - return cudf_func(*args, **kwargs).values - else: - return cudf_func(*args, **kwargs) - else: + if "out" in kwargs or not all( + issubclass(t, (Series, DataFrame)) for t in types + ): return NotImplemented + try: + if cudf_func := getattr(self.__class__, func.__name__, None): + out = cudf_func(*args, **kwargs) + # The dot product of two 
DataFrames returns an array in pandas. + if ( + func is np.dot + and isinstance(args[0], (DataFrame, pd.DataFrame)) + and isinstance(args[1], (DataFrame, pd.DataFrame)) + ): + return out.values + return out + except Exception: + # The rare instance where a "silent" failure is preferable. Except + # in the (highly unlikely) case that some other library + # interoperates with cudf objects, the result will be that numpy + # raises a TypeError indicating that the operation is not + # implemented, which is much friendlier than an arbitrary internal + # cudf error. + pass + return NotImplemented + # The _get_numeric_data method is necessary for dask compatibility. @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") def _get_numeric_data(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fffce27c89a..740be91eb9d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -76,11 +76,7 @@ is_mixed_with_object_dtype, min_scalar_type, ) -from cudf.utils.utils import ( - get_appropriate_dispatched_func, - get_relevant_submodule, - to_cudf_compatible_scalar, -) +from cudf.utils.utils import to_cudf_compatible_scalar def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): @@ -960,23 +956,59 @@ def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) def __array_function__(self, func, types, args, kwargs): - handled_types = [cudf.Series] - for t in types: - if t not in handled_types: + if "out" in kwargs or not all(issubclass(t, Series) for t in types): + return NotImplemented + + try: + # Apply a Series method if one exists. + if cudf_func := getattr(Series, func.__name__, None): + return cudf_func(*args, **kwargs) + + # Assume that cupy subpackages match numpy and search the + # corresponding cupy submodule based on the func's __module__. 
+ numpy_submodule = func.__module__.split(".")[1:] + cupy_func = cupy + for name in (*numpy_submodule, func.__name__): + cupy_func = getattr(cupy_func, name, None) + + # Handle case if cupy does not implement the function or just + # aliases the numpy function. + if not cupy_func or cupy_func is func: return NotImplemented - cudf_submodule = get_relevant_submodule(func, cudf) - cudf_ser_submodule = get_relevant_submodule(func, cudf.Series) - cupy_submodule = get_relevant_submodule(func, cupy) - - return get_appropriate_dispatched_func( - cudf_submodule, - cudf_ser_submodule, - cupy_submodule, - func, - args, - kwargs, - ) + # For now just fail on cases with mismatched indices. There is + # almost certainly no general solution for all array functions. + index = args[0].index + if not all(s.index.equals(index) for s in args): + return NotImplemented + out = cupy_func(*(s.values for s in args), **kwargs) + + # Return (host) scalar values immediately. + if not isinstance(out, cupy.ndarray): + return out + + # 0D array (scalar) + if out.ndim == 0: + return to_cudf_compatible_scalar(out) + # 1D array + elif ( + # Only allow 1D arrays + ((out.ndim == 1) or (out.ndim == 2 and out.shape[1] == 1)) + # If we have an index, it must be the same length as the + # output for cupy dispatching to be well-defined. + and len(index) == len(out) + ): + return Series(out, index=index) + except Exception: + # The rare instance where a "silent" failure is preferable. Except + # in the (highly unlikely) case that some other library + # interoperates with cudf objects, the result will be that numpy + # raises a TypeError indicating that the operation is not + # implemented, which is much friendlier than an arbitrary internal + # cudf error. 
+ pass + + return NotImplemented def map(self, arg, na_action=None) -> "Series": """ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 315da4d8dd6..4fa6b7d934c 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -4,7 +4,6 @@ import functools import os import traceback -from collections.abc import Sequence from typing import FrozenSet, Set, Union import cupy as cp @@ -317,136 +316,6 @@ def search_range(start, stop, x, step=1, side="left"): return max(min(length, i), 0) -_UFUNC_ALIASES = { - "power": "pow", - "equal": "eq", - "not_equal": "ne", - "less": "lt", - "less_equal": "le", - "greater": "gt", - "greater_equal": "ge", - "absolute": "abs", -} -# For op(., cudf.Series) -> cudf.Series.__r{op}__ -_REVERSED_NAMES = { - "lt": "__gt__", - "le": "__ge__", - "gt": "__lt__", - "ge": "__le__", - "eq": "__eq__", - "ne": "__ne__", -} - - -# todo: can probably be used to remove cudf/core/ops.py -def _get_cudf_series_ufunc(fname, args, kwargs, cudf_ser_submodule): - if isinstance(args[0], cudf.Series): - cudf_ser_func = getattr(cudf_ser_submodule, fname) - return cudf_ser_func(*args, **kwargs) - elif len(args) == 2 and isinstance(args[1], cudf.Series): - rev_name = _REVERSED_NAMES.get(fname, f"__r{fname}__") - cudf_ser_func = getattr(cudf_ser_submodule, rev_name) - return cudf_ser_func(args[1], args[0], **kwargs) - return NotImplemented - - -# Utils for using appropriate dispatch for array functions -def get_appropriate_dispatched_func( - cudf_submodule, cudf_ser_submodule, cupy_submodule, func, args, kwargs -): - if kwargs.get("out") is None: - fname = func.__name__ - # Dispatch these functions to appropiate alias from the _UFUNC_ALIASES - is_ufunc = fname in _UFUNC_ALIASES - fname = _UFUNC_ALIASES.get(fname, fname) - - if hasattr(cudf_submodule, fname): - cudf_func = getattr(cudf_submodule, fname) - return cudf_func(*args, **kwargs) - - elif hasattr(cudf_ser_submodule, fname): - if is_ufunc: - return 
_get_cudf_series_ufunc( - fname, args, kwargs, cudf_ser_submodule - ) - else: - cudf_ser_func = getattr(cudf_ser_submodule, fname) - return cudf_ser_func(*args, **kwargs) - - elif hasattr(cupy_submodule, fname): - cupy_func = getattr(cupy_submodule, fname) - # Handle case if cupy implements it as a numpy function - # Unsure if needed - if cupy_func is func: - return NotImplemented - - cupy_compatible_args, index = _get_cupy_compatible_args_index(args) - if cupy_compatible_args: - cupy_output = cupy_func(*cupy_compatible_args, **kwargs) - if isinstance(cupy_output, cp.ndarray): - return _cast_to_appropriate_cudf_type(cupy_output, index) - else: - return cupy_output - - return NotImplemented - - -def _cast_to_appropriate_cudf_type(val, index=None): - # Handle scalar - if val.ndim == 0: - return to_cudf_compatible_scalar(val) - # 1D array - elif (val.ndim == 1) or (val.ndim == 2 and val.shape[1] == 1): - # if index is not None and is of a different length - # than the index, cupy dispatching behaviour is undefined - # so we don't implement it - if (index is None) or (len(index) == len(val)): - return cudf.Series(val, index=index) - - return NotImplemented - - -def _get_cupy_compatible_args_index(args, ser_index=None): - """ - This function returns cupy compatible arguments and output index - if conversion is not possible it returns None - """ - - casted_ls = [] - for arg in args: - if isinstance(arg, cp.ndarray): - casted_ls.append(arg) - elif isinstance(arg, cudf.Series): - # check if indexes can be aligned - if (ser_index is None) or (ser_index.equals(arg.index)): - ser_index = arg.index - casted_ls.append(arg.values) - else: - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy dispatching - raise ValueError( - "Can only compare identically-labeled Series objects" - ) - elif isinstance(arg, Sequence): - # we dont handle list of inputs for functions as - # these form inputs for functions like - # np.concatenate, 
vstack have ambiguity around index alignment - return None, ser_index - else: - casted_ls.append(arg) - return casted_ls, ser_index - - -def get_relevant_submodule(func, module): - # point to the correct submodule - for submodule in func.__module__.split(".")[1:]: - if hasattr(module, submodule): - module = getattr(module, submodule) - else: - return None - return module - - def _categorical_scalar_broadcast_to(cat_scalar, size): if isinstance(cat_scalar, (cudf.Series, pd.Series)): cats = cat_scalar.cat.categories From fbac0acb2999f0396c46fd7d6aee3c2ec2442d8e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 2 Mar 2022 14:18:08 -0600 Subject: [PATCH 6/9] Include in multibyte split. (#10385) #10150 broke compiler support for GCC 11 (built locally) because it was missing `#include ` in a couple files. This fixes it. cc: @cwharris Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/10385 --- cpp/include/cudf/io/text/multibyte_split.hpp | 1 + cpp/src/io/text/multibyte_split.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 25f7ef98a81..77affa95ca8 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -23,6 +23,7 @@ #include #include +#include namespace cudf { namespace io { diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 99f3bde3bf6..51622747831 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -40,6 +40,7 @@ #include #include +#include namespace { From 6bcfc104051f926f46467fc55c456a9b012fc4af Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Mar 2022 15:58:47 -0600 Subject: [PATCH 7/9] Fix issue with column and scalar 
re-assignment (#10377) Fixes: #10305 This PR fixes issues where the `scalar` that is being assigned was being type-casted to the column's type. Added test coverage for the same. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10377 --- python/cudf/cudf/core/dataframe.py | 4 +++- python/cudf/cudf/tests/test_setitem.py | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4af8545e316..a820b527d3f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1161,7 +1161,9 @@ def __setitem__(self, arg, value): allow_non_unique=True, ) if is_scalar(value): - self._data[arg][:] = value + self._data[arg] = utils.scalar_broadcast_to( + value, len(self) + ) else: value = as_column(value) self._data[arg] = value diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 1fce7853fdf..fd3f2732556 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -33,9 +33,14 @@ def test_dataframe_setitem_scaler_bool(): assert_eq(df, gdf) -@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) +@pytest.mark.parametrize( + "df", + [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], +) @pytest.mark.parametrize("arg", [["a"], "a", "b"]) -@pytest.mark.parametrize("value", [-10, pd.DataFrame({"a": [-1, -2, -3]})]) +@pytest.mark.parametrize( + "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] +) def test_dataframe_setitem_columns(df, arg, value): gdf = cudf.from_pandas(df) cudf_replace_value = value From b5337d7dc695275b3a72e19c7bc99a69c54d2d2c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 3 Mar 2022 09:42:37 -0600 Subject: [PATCH 8/9] Add `nvtx` annotations for `Series` and `Index` (#10374) This PR 
adds some missing `nvtx` annotations for `Series` and `Index`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10374 --- python/cudf/cudf/core/frame.py | 6 +- python/cudf/cudf/core/index.py | 190 ++++++++++++++++--- python/cudf/cudf/core/multiindex.py | 109 +++++++++-- python/cudf/cudf/core/series.py | 186 ++++++++++++++---- python/cudf/cudf/core/single_column_frame.py | 91 +++++++-- 5 files changed, 485 insertions(+), 97 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3d36d3bd893..87f4ed0bbc4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6921,12 +6921,12 @@ def _drop_rows_by_labels( raise KeyError("One or more values not found in axis") key_df = cudf.DataFrame(index=labels) - if isinstance(obj, cudf.Series): + if isinstance(obj, cudf.DataFrame): + return obj.join(key_df, how="leftanti") + else: res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] res.name = obj.name return res - else: - return obj.join(key_df, how="leftanti") def _apply_inverse_column(col: ColumnBase) -> ColumnBase: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 343ba33ece1..cf58a453bb6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -160,6 +160,7 @@ class RangeIndex(BaseIndex, BinaryOperand): _range: range + @annotate("RANGEINDEX_INIT", color="green", domain="cudf_python") def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -190,43 +191,50 @@ def _copy_type_metadata( # have an underlying column. return self - @property + @property # type: ignore + @annotate("RANGEINDEX_NAME", color="green", domain="cudf_python") def name(self): """ Returns the name of the Index. 
""" return self._name - @name.setter + @name.setter # type: ignore + @annotate("RANGEINDEX_INIT_SETTER", color="green", domain="cudf_python") def name(self, value): self._name = value - @property + @property # type: ignore + @annotate("RANGEINDEX_START", color="green", domain="cudf_python") def start(self): """ The value of the `start` parameter (0 if this was not supplied). """ return self._start - @property + @property # type: ignore + @annotate("RANGEINDEX_STOP", color="green", domain="cudf_python") def stop(self): """ The value of the stop parameter. """ return self._stop - @property + @property # type: ignore + @annotate("RANGEINDEX_STEP", color="green", domain="cudf_python") def step(self): """ The value of the step parameter. """ return self._step - @property + @property # type: ignore + @annotate("RANGEINDEX_NUM_ROWS", color="green", domain="cudf_python") def _num_rows(self): return len(self) @cached_property + @annotate("RANGEINDEX_VALUES", color="green", domain="cudf_python") def _values(self): if len(self) > 0: return column.arange( @@ -256,12 +264,14 @@ def is_categorical(self): def is_interval(self): return False - @property + @property # type: ignore + @annotate("RANGEINDEX_DATA", color="green", domain="cudf_python") def _data(self): return cudf.core.column_accessor.ColumnAccessor( {self.name: self._values} ) + @annotate("RANGEINDEX_CONTAINS", color="green", domain="cudf_python") def __contains__(self, item): if not isinstance( item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float]) @@ -271,6 +281,7 @@ def __contains__(self, item): return False return item in range(self._start, self._stop, self._step) + @annotate("RANGEINDEX_COPY", color="green", domain="cudf_python") def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. 
@@ -301,9 +312,13 @@ def copy(self, name=None, deep=False, dtype=None, names=None): start=self._start, stop=self._stop, step=self._step, name=name ) + @annotate( + "RANGEINDEX_DROP_DUPLICATES", color="green", domain="cudf_python" + ) def drop_duplicates(self, keep="first"): return self + @annotate("RANGEINDEX_REPR", color="green", domain="cudf_python") def __repr__(self): return ( f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" @@ -316,9 +331,11 @@ def __repr__(self): + ")" ) + @annotate("RANGEINDEX_LEN", color="green", domain="cudf_python") def __len__(self): return len(range(self._start, self._stop, self._step)) + @annotate("RANGEINDEX_GETITEM", color="green", domain="cudf_python") def __getitem__(self, index): len_self = len(self) if isinstance(index, slice): @@ -344,6 +361,7 @@ def __getitem__(self, index): return as_index(self._values[index], name=self.name) + @annotate("RangeIndex_EQUALS", color="green", domain="cudf_python") def equals(self, other): if isinstance(other, RangeIndex): if (self._start, self._stop, self._step) == ( @@ -354,6 +372,7 @@ def equals(self, other): return True return Int64Index._from_data(self._data).equals(other) + @annotate("RANGEINDEX_SERIALIZE", color="green", domain="cudf_python") def serialize(self): header = {} header["index_column"] = {} @@ -374,6 +393,7 @@ def serialize(self): return header, frames @classmethod + @annotate("RANGEINDEX_DESERIALIZE", color="green", domain="cudf_python") def deserialize(cls, header, frames): h = header["index_column"] name = pickle.loads(header["name"]) @@ -382,13 +402,17 @@ def deserialize(cls, header, frames): step = h.get("step", 1) return RangeIndex(start=start, stop=stop, step=step, name=name) - @property + @property # type: ignore + @annotate("RANGEINDEX_DTYPE", color="green", domain="cudf_python") def dtype(self): """ `dtype` of the range of values in RangeIndex. 
""" return cudf.dtype(np.int64) + @annotate( + "RANGEINDEX_FIND_LABEL_RANGE", color="green", domain="cudf_python" + ) def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -428,6 +452,7 @@ def find_label_range(self, first=None, last=None): return begin, end + @annotate("RANGEINDEX_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self): return pd.RangeIndex( start=self._start, @@ -444,14 +469,27 @@ def is_unique(self): """ return True - @property + @property # type: ignore + @annotate( + "RANGEINDEX_IS_MONOTONIC_INCREASING", + color="green", + domain="cudf_python", + ) def is_monotonic_increasing(self): return self._step > 0 or len(self) <= 1 - @property + @property # type: ignore + @annotate( + "RANGEINDEX_IS_MONOTONIC_DECREASING", + color="green", + domain="cudf_python", + ) def is_monotonic_decreasing(self): return self._step < 0 or len(self) <= 1 + @annotate( + "RANGEINDEX_GET_SLICE_BOUND", color="green", domain="cudf_python" + ) def get_slice_bound(self, label, side, kind=None): """ Calculate slice bound that corresponds to given label. @@ -486,6 +524,7 @@ def get_slice_bound(self, label, side, kind=None): pos = search_range(start, stop, label, step, side=side) return pos + @annotate("RANGEINDEX_MEMORY_USAGE", color="green", domain="cudf_python") def memory_usage(self, deep=False): if deep: warnings.warn( @@ -498,6 +537,7 @@ def unique(self): # RangeIndex always has unique values return self + @annotate("RANGEINDEX_MUL", color="green", domain="cudf_python") def __mul__(self, other): # Multiplication by raw ints must return a RangeIndex to match pandas. 
if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu": @@ -514,20 +554,24 @@ def __mul__(self, other): ) return self._as_int64().__mul__(other) + @annotate("RANGEINDEX_RMUL", color="green", domain="cudf_python") def __rmul__(self, other): # Multiplication is commutative. return self.__mul__(other) + @annotate("RANGEINDEX_AS_INT64", color="green", domain="cudf_python") def _as_int64(self): # Convert self to an Int64Index. This method is used to perform ops # that are not defined directly on RangeIndex. return Int64Index._from_data(self._data) + @annotate("RANGEINDEX_ARRAY_UFUNC", color="green", domain="cudf_python") def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return self._as_int64().__array_ufunc__( ufunc, method, *inputs, **kwargs ) + @annotate("RANGEINDEX_GETATTR", color="green", domain="cudf_python") def __getattr__(self, key): # For methods that are not defined for RangeIndex we attempt to operate # on the corresponding integer index if possible. @@ -538,6 +582,7 @@ def __getattr__(self, key): f"'{type(self)}' object has no attribute {key}" ) + @annotate("RANGEINDEX_GET_LOC", color="green", domain="cudf_python") def get_loc(self, key, method=None, tolerance=None): # Given an actual integer, idx = (key - self._start) / self._step @@ -571,6 +616,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + @annotate("RANGEINDEX_UNION_INTERNAL", color="green", domain="cudf_python") def _union(self, other, sort=None): if isinstance(other, RangeIndex): # Variable suffixes are of the @@ -645,6 +691,9 @@ def _union(self, other, sort=None): # then perform `union`. 
return Int64Index(self._values)._union(other, sort=sort) + @annotate( + "RANGEINDEX_INTERSECTION_INTERNAL", color="green", domain="cudf_python" + ) def _intersection(self, other, sort=False): if not isinstance(other, RangeIndex): return super()._intersection(other, sort=sort) @@ -690,12 +739,18 @@ def _intersection(self, other, sort=False): return new_index + @annotate( + "RANGEINDEX_GATHER_INTERNAL", color="green", domain="cudf_python" + ) def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) + @annotate( + "RANGEINDEX_APPLY_BOOLEAN_MASK", color="green", domain="cudf_python" + ) def _apply_boolean_mask(self, boolean_mask): return Int64Index._from_columns( [self._values.apply_boolean_mask(boolean_mask)], [self.name] @@ -734,6 +789,7 @@ class GenericIndex(SingleColumnFrame, BaseIndex): Column's, the data Column will be cloned to adopt this name. 
""" + @annotate("GENERICINDEX_INIT", color="green", domain="cudf_python") def __init__(self, data, **kwargs): kwargs = _setdefault_name(data, **kwargs) @@ -754,6 +810,7 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + @annotate("GENERICINDEX_ARRAY_UFUNC", color="green", domain="cudf_python") def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) @@ -789,6 +846,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented + @annotate("GENERICINDEX_BINARYOP", color="green", domain="cudf_python") def _binaryop( self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> SingleColumnFrame: @@ -807,6 +865,9 @@ def _binaryop( return ret.values return ret + @annotate( + "GENERICINDEX_COPY_TYPE_METADATA", color="green", domain="cudf_python" + ) def _copy_type_metadata( self, other: Frame, include_index: bool = True ) -> GenericIndex: @@ -823,11 +884,13 @@ def _copy_type_metadata( ) return self - @property + @property # type: ignore + @annotate("GENERICINDEX_VALUES", color="green", domain="cudf_python") def _values(self): return self._column @classmethod + @annotate("GENERICINDEX_CONCAT", color="green", domain="cudf_python") def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): result = _concat_range_index(objs) @@ -844,6 +907,7 @@ def _concat(cls, objs): result.name = name return result + @annotate("GENERICINDEX_MEMORY_USAGE", color="green", domain="cudf_python") def memory_usage(self, deep=False): return sum(super().memory_usage(deep=deep).values()) @@ -877,6 +941,7 @@ def equals(self, other, **kwargs): except TypeError: return False + @annotate("GENERICINDEX_COPY", color="green", domain="cudf_python") def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. 
@@ -904,6 +969,7 @@ def copy(self, name=None, deep=False, dtype=None, names=None): col = self._values.astype(dtype) return _index_from_data({name: col.copy(True) if deep else col}) + @annotate("GENERICINDEX_GET_LOC", color="green", domain="cudf_python") def get_loc(self, key, method=None, tolerance=None): """Get integer location, slice or boolean mask for requested label. @@ -1022,6 +1088,7 @@ def get_loc(self, key, method=None, tolerance=None): mask[true_inds] = True return mask + @annotate("GENERICINDEX_REPR", color="green", domain="cudf_python") def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) mr = 0 @@ -1098,6 +1165,7 @@ def __repr__(self): return "\n".join(lines) + @annotate("GENERICINDEX_GETITEM", color="green", domain="cudf_python") def __getitem__(self, index): if type(self) == IntervalIndex: raise NotImplementedError( @@ -1109,13 +1177,17 @@ def __getitem__(self, index): res.name = self.name return res - @property + @property # type: ignore + @annotate("GENERICINDEX_DTYPE", color="green", domain="cudf_python") def dtype(self): """ `dtype` of the underlying values in GenericIndex. """ return self._values.dtype + @annotate( + "GENERICINDEX_FIND_LABEL_RANGE", color="green", domain="cudf_python" + ) def find_label_range(self, first, last): """Find range that starts with *first* and ends with *last*, inclusively. @@ -1135,6 +1207,9 @@ def find_label_range(self, first, last): end += 1 return begin, end + @annotate( + "GENERICINDEX_GET_SLICE_BOUND", color="green", domain="cudf_python" + ) def get_slice_bound(self, label, side, kind=None): return self._values.get_slice_bound(label, side, kind) @@ -1159,6 +1234,7 @@ def is_categorical(self): def is_interval(self): return False + @annotate("GENERICINDEX_ARGSORT", color="green", domain="cudf_python") def argsort( self, axis=0, @@ -1220,6 +1296,7 @@ class NumericIndex(GenericIndex): # Subclasses must define the dtype they are associated with. 
_dtype: Union[None, Type[np.number]] = None + @annotate("NUMERICINDEX_INIT", color="green", domain="cudf_python") def __init__(self, data=None, dtype=None, copy=False, name=None): dtype = type(self)._dtype @@ -1557,6 +1634,7 @@ class DatetimeIndex(GenericIndex): dtype='datetime64[ns]', name='a') """ + @annotate("DATETIMEINDEX_INIT", color="green", domain="cudf_python") def __init__( self, data=None, @@ -1611,7 +1689,8 @@ def __init__( data = column.as_column(np.array(data, dtype=dtype)) super().__init__(data, **kwargs) - @property + @property # type: ignore + @annotate("DATETIMEINDEX_YEAR", color="green", domain="cudf_python") def year(self): """ The year of the datetime. @@ -1629,7 +1708,8 @@ def year(self): """ # noqa: E501 return self._get_dt_field("year") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_MONTH", color="green", domain="cudf_python") def month(self): """ The month as January=1, December=12. @@ -1647,7 +1727,8 @@ def month(self): """ # noqa: E501 return self._get_dt_field("month") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_DAY", color="green", domain="cudf_python") def day(self): """ The day of the datetime. @@ -1665,7 +1746,8 @@ def day(self): """ # noqa: E501 return self._get_dt_field("day") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_HOUR", color="green", domain="cudf_python") def hour(self): """ The hours of the datetime. @@ -1685,7 +1767,8 @@ def hour(self): """ return self._get_dt_field("hour") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_MINUTE", color="green", domain="cudf_python") def minute(self): """ The minutes of the datetime. @@ -1705,7 +1788,8 @@ def minute(self): """ return self._get_dt_field("minute") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_SECOND", color="green", domain="cudf_python") def second(self): """ The seconds of the datetime. 
@@ -1725,7 +1809,8 @@ def second(self): """ return self._get_dt_field("second") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_WEEKDAY", color="green", domain="cudf_python") def weekday(self): """ The day of the week with Monday=0, Sunday=6. @@ -1746,7 +1831,8 @@ def weekday(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_DAYOFWEEK", color="green", domain="cudf_python") def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -1767,7 +1853,8 @@ def dayofweek(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_DAYOFYEAR", color="green", domain="cudf_python") def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -1789,7 +1876,8 @@ def dayofyear(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @annotate("DATETIMEINDEX_DAY_OF_YEAR", color="green", domain="cudf_python") def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -1811,7 +1899,10 @@ def day_of_year(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @annotate( + "DATETIMEINDEX_IS_LEAP_YEAR", color="green", domain="cudf_python" + ) def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -1829,7 +1920,8 @@ def is_leap_year(self): res = is_leap_year(self._values).fillna(False) return cupy.asarray(res) - @property + @property # type: ignore + @annotate("DATETIMEINDEX_QUARTER", color="green", domain="cudf_python") def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. 
@@ -1854,6 +1946,7 @@ def quarter(self): res = extract_quarter(self._values) return Int8Index(res, dtype="int8") + @annotate("DATETIMEINDEX_ISOCALENDAR", color="green", domain="cudf_python") def isocalendar(self): """ Returns a DataFrame with the year, week, and day @@ -1875,10 +1968,14 @@ def isocalendar(self): """ return cudf.core.tools.datetimes._to_iso_calendar(self) + @annotate("DATETIMEINDEX_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) + @annotate( + "DATETIMEINDEX_GET_DT_FIELD", color="green", domain="cudf_python" + ) def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object @@ -1895,6 +1992,7 @@ def _get_dt_field(self, field): def is_boolean(self): return False + @annotate("DATETIMEINDEX_CEIL", color="green", domain="cudf_python") def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -1927,6 +2025,7 @@ def ceil(self, freq): return self.__class__._from_data({self.name: out_column}) + @annotate("DATETIMEINDEX_FLOOR", color="green", domain="cudf_python") def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -1959,6 +2058,7 @@ def floor(self, freq): return self.__class__._from_data({self.name: out_column}) + @annotate("DATETIMEINDEX_ROUND", color="green", domain="cudf_python") def round(self, freq): """ Perform round operation on the data to the specified freq. 
@@ -2041,6 +2141,7 @@ class TimedeltaIndex(GenericIndex): dtype='timedelta64[s]', name='delta-index') """ + @annotate("TIMEDELTAINDEX_INIT", color="green", domain="cudf_python") def __init__( self, data=None, @@ -2072,6 +2173,7 @@ def __init__( data = column.as_column(np.array(data, dtype=dtype)) super().__init__(data, **kwargs) + @annotate("TIMEDELTAINDEX_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self): return pd.TimedeltaIndex( self._values.to_pandas(), @@ -2079,28 +2181,36 @@ def to_pandas(self): unit=self._values.time_unit, ) - @property + @property # type: ignore + @annotate("TIMEDELTAINDEX_DAYS", color="green", domain="cudf_python") def days(self): """ Number of days for each element. """ return as_index(arbitrary=self._values.days, name=self.name) - @property + @property # type: ignore + @annotate("TIMEDELTAINDEX_SECONDS", color="green", domain="cudf_python") def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ return as_index(arbitrary=self._values.seconds, name=self.name) - @property + @property # type: ignore + @annotate( + "TIMEDELTAINDEX_MICROSECONDS", color="green", domain="cudf_python" + ) def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. 
""" return as_index(arbitrary=self._values.microseconds, name=self.name) - @property + @property # type: ignore + @annotate( + "TIMEDELTAINDEX_NANOSECONDS", color="green", domain="cudf_python" + ) def nanoseconds(self): """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each @@ -2108,7 +2218,8 @@ def nanoseconds(self): """ return as_index(arbitrary=self._values.nanoseconds, name=self.name) - @property + @property # type: ignore + @annotate("TIMEDELTAINDEX_COMPONENTS", color="green", domain="cudf_python") def components(self): """ Return a dataframe of the components (days, hours, minutes, @@ -2174,6 +2285,7 @@ class CategoricalIndex(GenericIndex): CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 + @annotate("CATEGORICALINDEX_INIT", color="green", domain="cudf_python") def __init__( self, data=None, @@ -2228,14 +2340,18 @@ def __init__( super().__init__(data, **kwargs) - @property + @property # type: ignore + @annotate("CATEGORICALINDEX_CODES", color="green", domain="cudf_python") def codes(self): """ The category codes of this categorical. """ return as_index(self._values.codes) - @property + @property # type: ignore + @annotate( + "CATEGORICALINDEX_CATEGORIES", color="green", domain="cudf_python" + ) def categories(self): """ The categories of this categorical. 
@@ -2249,6 +2365,7 @@ def is_categorical(self): return True +@annotate("INDEX_INTERVAL_RANGE", color="green", domain="cudf_python") def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right", ) -> "IntervalIndex": @@ -2411,6 +2528,7 @@ class IntervalIndex(GenericIndex): IntervalIndex """ + @annotate("INTERVALINDEX_INIT", color="green", domain="cudf_python") def __init__( self, data, closed=None, dtype=None, copy=False, name=None, ): @@ -2435,6 +2553,7 @@ def __init__( self.closed = closed super().__init__(data, **kwargs) + @annotate("INTERVALINDEX_FROM_BREAKS", color="green", domain="cudf_python") def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): """ Construct an IntervalIndex from an array of splits. @@ -2491,6 +2610,7 @@ class StringIndex(GenericIndex): name: A string """ + @annotate("STRINGINDEX_INIT", color="green", domain="cudf_python") def __init__(self, values, copy=False, **kwargs): kwargs = _setdefault_name(values, **kwargs) if isinstance(values, StringColumn): @@ -2506,11 +2626,13 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) + @annotate("STRINGINDEX_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self): return pd.Index( self.to_numpy(na_value=None), name=self.name, dtype="object" ) + @annotate("STRINGINDEX_REPR", color="green", domain="cudf_python") def __repr__(self): return ( f"{self.__class__.__name__}({self._values.values_host}," @@ -2525,6 +2647,7 @@ def __repr__(self): @copy_docstring(StringMethods) # type: ignore @property + @annotate("STRINGINDEX_STR", color="green", domain="cudf_python") def str(self): return StringMethods(parent=self) @@ -2545,6 +2668,7 @@ def is_object(self): return True +@annotate("INDEX_AS_INDEX", color="green", domain="cudf_python") def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -2673,6 +2797,7 @@ class Index(BaseIndex, metaclass=IndexMeta): 
names=['a', 'b']) """ + @annotate("INDEX_INIT", color="green", domain="cudf_python") def __new__( cls, data=None, @@ -2701,6 +2826,7 @@ def __new__( ) @classmethod + @annotate("INDEX_FROM_ARROW", color="green", domain="cudf_python") def from_arrow(cls, obj): try: return cls(ColumnBase.from_arrow(obj)) @@ -2709,6 +2835,7 @@ def from_arrow(cls, obj): return cudf.MultiIndex.from_arrow(obj) +@annotate("INDEX_CONCAT_RANGE_INDEX", color="green", domain="cudf_python") def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. @@ -2749,6 +2876,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: return RangeIndex(start, stop, step) +@annotate("INDEX_EXTENDED_GCD", color="green", domain="cudf_python") def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b09a2d39c14..4864ca5bae1 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -13,6 +13,7 @@ import cupy import numpy as np import pandas as pd +from nvtx import annotate from pandas._config import get_option import cudf @@ -63,6 +64,7 @@ class MultiIndex(Frame, BaseIndex, NotIterable): ) """ + @annotate("MULTIINDEX_INIT", color="green", domain="cudf_python") def __init__( self, levels=None, @@ -147,11 +149,13 @@ def __init__( self._name = None self.names = names - @property + @property # type: ignore + @annotate("MULTIINDEX_NAMES_GETTER", color="green", domain="cudf_python") def names(self): return self._names - @names.setter + @names.setter # type: ignore + @annotate("MULTIINDEX_NAMES_SETTER", color="green", domain="cudf_python") def names(self, value): value = [None] * self.nlevels if value is None else value @@ -169,6 +173,7 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) + 
@annotate("MULTIINDEX_RENAME", color="green", domain="cudf_python") def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -215,6 +220,7 @@ def rename(self, names, inplace=False): """ return self.set_names(names, level=None, inplace=inplace) + @annotate("MULTIINDEX_SET_NAMES", color="green", domain="cudf_python") def set_names(self, names, level=None, inplace=False): names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -252,6 +258,7 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) @classmethod + @annotate("MULTIINDEX_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -264,14 +271,17 @@ def _from_data( obj.name = name return obj - @property + @property # type: ignore + @annotate("MULTIINDEX_NAME_GETTER", color="green", domain="cudf_python") def name(self): return self._name - @name.setter + @name.setter # type: ignore + @annotate("MULTIINDEX_NAME_SETTER", color="green", domain="cudf_python") def name(self, value): self._name = value + @annotate("MULTIINDEX_COPY", color="green", domain="cudf_python") def copy( self, names=None, @@ -367,6 +377,7 @@ def copy( return mi + @annotate("MULTIINDEX_REPR", color="green", domain="cudf_python") def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -443,7 +454,8 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @property + @property # type: ignore + @annotate("MULTIINDEX_CODES", color="green", domain="cudf_python") def codes(self): """ Returns the codes of the underlying MultiIndex. 
@@ -473,12 +485,14 @@ def codes(self): self._compute_levels_and_codes() return self._codes - @property + @property # type: ignore + @annotate("MULTIINDEX_NLEVELS", color="green", domain="cudf_python") def nlevels(self): """Integer number of levels in this MultiIndex.""" return len(self._data) - @property + @property # type: ignore + @annotate("MULTIINDEX_LEVELS", color="green", domain="cudf_python") def levels(self): """ Returns list of levels in the MultiIndex @@ -515,11 +529,15 @@ def levels(self): self._compute_levels_and_codes() return self._levels - @property + @property # type: ignore + @annotate("MULTIINDEX_NDIM", color="green", domain="cudf_python") def ndim(self): """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 + @annotate( + "MULTIINDEX_GET_LEVEL_LABEL", color="green", domain="cudf_python" + ) def _get_level_label(self, level): """Get name of the level. @@ -536,6 +554,7 @@ def _get_level_label(self, level): else: return self._data.names[level] + @annotate("MULTIINDEX_ISIN", color="green", domain="cudf_python") def isin(self, values, level=None): """Return a boolean array where the index values are in values. 
@@ -640,6 +659,11 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) + @annotate( + "MULTIINDEX_COMPUTE_LEVELS_AND_CODES", + color="green", + domain="cudf_python", + ) def _compute_levels_and_codes(self): levels = [] @@ -652,6 +676,9 @@ def _compute_levels_and_codes(self): self._levels = levels self._codes = cudf.DataFrame._from_data(codes) + @annotate( + "MULTIINDEX_COMPUTE_VALIDITY_MASK", color="green", domain="cudf_python" + ) def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() @@ -680,6 +707,11 @@ def _compute_validity_mask(self, index, row_tuple, max_length): raise KeyError(row) return result + @annotate( + "MULTIINDEX_GET_VALID_INDICES_BY_TUPLE", + color="green", + domain="cudf_python", + ) def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): # Instructions for Slicing # if tuple, get first and last elements of tuple @@ -707,6 +739,9 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) + @annotate( + "MULTIINDEX_INDEX_AND_DOWNCAST", color="green", domain="cudf_python" + ) def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): @@ -775,6 +810,7 @@ def _index_and_downcast(self, result, index, index_key): result.index = index return result + @annotate("MULTIINDEX_GET_ROW_MAJOR", color="green", domain="cudf_python") def _get_row_major( self, df: DataFrameOrSeries, @@ -800,6 +836,9 @@ def _get_row_major( final = self._index_and_downcast(result, result.index, row_tuple) return final + @annotate( + "MULTIINDEX_VALIDATE_INDEXER", color="green", domain="cudf_python" + ) def _validate_indexer( self, indexer: Union[ @@ -826,6 +865,7 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) + @annotate("MULTIINDEX_EQ", color="green", 
domain="cudf_python") def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -836,11 +876,13 @@ def __eq__(self, other): return self.names == other.names return NotImplemented - @property + @property # type: ignore + @annotate("MULTIINDEX_SIZE", color="green", domain="cudf_python") def size(self): # The size of a MultiIndex is only dependent on the number of rows. return self._num_rows + @annotate("MULTIINDEX_TAKE", color="green", domain="cudf_python") def take(self, indices): if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") @@ -848,6 +890,7 @@ def take(self, indices): obj.names = self.names return obj + @annotate("MULTIINDEX_SERIALIZE", color="green", domain="cudf_python") def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. @@ -855,6 +898,7 @@ def serialize(self): return header, frames @classmethod + @annotate("MULTIINDEX_DESERIALIZE", color="green", domain="cudf_python") def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. column_names = pickle.loads(header["column_names"]) @@ -862,6 +906,7 @@ def deserialize(cls, header, frames): obj = super().deserialize(header, frames) return obj._set_names(column_names) + @annotate("MULTIINDEX_GETITEM", color="green", domain="cudf_python") def __getitem__(self, index): flatten = isinstance(index, int) @@ -884,6 +929,7 @@ def __getitem__(self, index): result.names = self.names return result + @annotate("MULTIINDEX_TO_FRAME", color="green", domain="cudf_python") def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is # incorrect. 
We want to make a deep copy, otherwise further @@ -900,6 +946,9 @@ def to_frame(self, index=True, name=None): df.columns = name return df + @annotate( + "MULTIINDEX_GET_LEVEL_VALUES", color="green", domain="cudf_python" + ) def get_level_values(self, level): """ Return the values at the requested level @@ -953,6 +1002,7 @@ def is_interval(self): return False @classmethod + @annotate("MULTIINDEX_CONCAT", color="green", domain="cudf_python") def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] @@ -973,6 +1023,7 @@ def _concat(cls, objs): return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod + @annotate("MULTIINDEX_FROM_TUPLES", color="green", domain="cudf_python") def from_tuples(cls, tuples, names=None): """ Convert list of tuples to MultiIndex. @@ -1009,7 +1060,8 @@ def from_tuples(cls, tuples, names=None): pdi = pd.MultiIndex.from_tuples(tuples, names=names) return cls.from_pandas(pdi) - @property + @property # type: ignore + @annotate("MULTIINDEX_VALUES_HOST", color="green", domain="cudf_python") def values_host(self): """ Return a numpy representation of the MultiIndex. @@ -1036,7 +1088,8 @@ def values_host(self): """ return self.to_pandas().values - @property + @property # type: ignore + @annotate("MULTIINDEX_VALUES", color="green", domain="cudf_python") def values(self): """ Return a CuPy representation of the MultiIndex. @@ -1068,6 +1121,7 @@ def values(self): return self.to_frame(index=False).values @classmethod + @annotate("MULTIINDEX_FROM_FRAME", color="green", domain="cudf_python") def from_frame(cls, df, names=None): """ Make a MultiIndex from a DataFrame. @@ -1141,6 +1195,7 @@ def from_frame(cls, df, names=None): return obj @classmethod + @annotate("MULTIINDEX_FROM_PRODUCT", color="green", domain="cudf_python") def from_product(cls, arrays, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. 
@@ -1181,6 +1236,7 @@ def from_product(cls, arrays, names=None): pdi = pd.MultiIndex.from_product(arrays, names=names) return cls.from_pandas(pdi) + @annotate("MULTIINDEX_POP_LEVELS", color="green", domain="cudf_python") def _poplevels(self, level): """ Remove and return the specified levels from self. @@ -1231,6 +1287,7 @@ def _poplevels(self, level): return popped + @annotate("MULTIINDEX_DROP_LEVEL", color="green", domain="cudf_python") def droplevel(self, level=-1): """ Removes the specified levels from the MultiIndex. @@ -1293,11 +1350,13 @@ def droplevel(self, level=-1): else: return mi + @annotate("MULTIINDEX_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, nullable=False, **kwargs): result = self.to_frame(index=False).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod + @annotate("MULTIINDEX_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, multiindex, nan_as_null=None): """ Convert from a Pandas MultiIndex @@ -1334,10 +1393,16 @@ def from_pandas(cls, multiindex, nan_as_null=None): return cls.from_frame(df, names=multiindex.names) @cached_property + @annotate("MULTIINDEX_IS_UNIQUE", color="green", domain="cudf_python") def is_unique(self): return len(self) == len(self.unique()) - @property + @property # type: ignore + @annotate( + "MULTIINDEX_IS_MONOTONIC_INCREASING", + color="green", + domain="cudf_python", + ) def is_monotonic_increasing(self): """ Return if the index is monotonic increasing @@ -1345,7 +1410,12 @@ def is_monotonic_increasing(self): """ return self._is_sorted(ascending=None, null_position=None) - @property + @property # type: ignore + @annotate( + "MULTIINDEX_IS_MONOTONIC_DECREASING", + color="green", + domain="cudf_python", + ) def is_monotonic_decreasing(self): """ Return if the index is monotonic decreasing @@ -1355,6 +1425,7 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) + 
@annotate("MULTIINDEX_FILLNA", color="green", domain="cudf_python") def fillna(self, value): """ Fill null values with the specified value. @@ -1395,6 +1466,7 @@ def fillna(self, value): return super().fillna(value=value) + @annotate("MULTIINDEX_UNIQUE", color="green", domain="cudf_python") def unique(self): return self.drop_duplicates(keep="first") @@ -1408,6 +1480,7 @@ def _clean_nulls_from_index(self): index_df._clean_nulls_from_dataframe(index_df), names=self.names ) + @annotate("MULTIINDEX_MEMORY_USAGE", color="green", domain="cudf_python") def memory_usage(self, deep=False): usage = sum(super().memory_usage(deep=deep).values()) if self.levels: @@ -1418,11 +1491,13 @@ def memory_usage(self, deep=False): usage += col.memory_usage return usage + @annotate("MULTIINDEX_DIFFERENCE", color="green", domain="cudf_python") def difference(self, other, sort=None): if hasattr(other, "to_pandas"): other = other.to_pandas() return self.to_pandas().difference(other, sort) + @annotate("MULTIINDEX_APPEND", color="green", domain="cudf_python") def append(self, other): """ Append a collection of MultiIndex objects together @@ -1485,6 +1560,7 @@ def append(self, other): return MultiIndex._concat(to_concat) + @annotate("MULTIINDEX_ARRAY_FUNCTION", color="green", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1531,6 +1607,7 @@ def _level_index_from_level(self, level): ) from None return level + @annotate("MULTIINDEX_GET_LOC", color="green", domain="cudf_python") def get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. 
@@ -1667,6 +1744,7 @@ def _maybe_match_names(self, other): for self_name, other_name in zip(self.names, other.names) ] + @annotate("MULTIINDEX_UNION", color="green", domain="cudf_python") def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common @@ -1692,6 +1770,7 @@ def _union(self, other, sort=None): return midx.sort_values() return midx + @annotate("MULTIINDEX_INTERSECTION", color="green", domain="cudf_python") def _intersection(self, other, sort=None): if self.names != other.names: deep = True @@ -1714,6 +1793,9 @@ def _intersection(self, other, sort=None): return midx.sort_values() return midx + @annotate( + "MULTIINDEX_COPY_TYPE_METADATA", color="green", domain="cudf_python" + ) def _copy_type_metadata( self, other: Frame, include_index: bool = True ) -> Frame: @@ -1721,6 +1803,7 @@ def _copy_type_metadata( res._names = other._names return res + @annotate("MULTIINDEX_SPLIT_LEVELS", color="green", domain="cudf_python") def _split_columns_by_levels(self, levels): # This function assumes that for levels with duplicate names, they are # specified by indices, not name by ``levels``. E.g. [None, None] can diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 740be91eb9d..e315b24851a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,6 +14,7 @@ import cupy import numpy as np import pandas as pd +from nvtx import annotate from pandas._config import get_option import cudf @@ -95,6 +96,7 @@ class _SeriesIlocIndexer(_FrameIndexer): For integer-location based selection. 
""" + @annotate("SERIESILOC_GETITEM", color="green", domain="cudf_python") def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) @@ -110,6 +112,7 @@ def __getitem__(self, arg): {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), ) + @annotate("SERIESILOC_SETITEM", color="green", domain="cudf_python") def __setitem__(self, key, value): from cudf.core.column import column @@ -153,6 +156,7 @@ class _SeriesLocIndexer(_FrameIndexer): Label-based selection """ + @annotate("SERIESLOC_GETITEM", color="green", domain="cudf_python") def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -175,6 +179,7 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: return self._frame.iloc[arg] + @annotate("SERIESLOC_SETITEM", color="green", domain="cudf_python") def __setitem__(self, key, value): try: key = self._loc_to_iloc(key) @@ -298,6 +303,7 @@ def _constructor_expanddim(self): return cudf.DataFrame @classmethod + @annotate("SERIES_FROM_CATEGORICAL", color="green", domain="cudf_python") def from_categorical(cls, categorical, codes=None): """Creates from a pandas.Categorical @@ -338,6 +344,7 @@ def from_categorical(cls, categorical, codes=None): return Series(data=col) @classmethod + @annotate("SERIES_FROM_MASKED_ARRAY", color="green", domain="cudf_python") def from_masked_array(cls, data, mask, null_count=None): """Create a Series with null-mask. 
This is equivalent to: @@ -386,6 +393,7 @@ def from_masked_array(cls, data, mask, null_count=None): col = column.as_column(data).set_mask(mask) return cls(data=col) + @annotate("SERIES_INIT", color="green", domain="cudf_python") def __init__( self, data=None, index=None, dtype=None, name=None, nan_as_null=True, ): @@ -447,6 +455,7 @@ def __init__( self._index = RangeIndex(len(data)) if index is None else index @classmethod + @annotate("SERIES_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -461,10 +470,12 @@ def _from_data( out._index = RangeIndex(out._data.nrows) return out + @annotate("SERIES_CONTAINS", color="green", domain="cudf_python") def __contains__(self, item): return item in self._index @classmethod + @annotate("SERIES_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, s, nan_as_null=None): """ Convert from a Pandas Series. @@ -505,7 +516,8 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) - @property + @property # type: ignore + @annotate("SERIES_DT", color="green", domain="cudf_python") def dt(self): """ Accessor object for datetime-like properties of the Series values. 
@@ -546,6 +558,7 @@ def dt(self): "Can only use .dt accessor with datetimelike values" ) + @annotate("SERIES_SERIALIZE", color="green", domain="cudf_python") def serialize(self): header, frames = super().serialize() @@ -558,6 +571,7 @@ def serialize(self): return header, frames @classmethod + @annotate("SERIES_DESERIALIZE", color="green", domain="cudf_python") def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] obj = super().deserialize( @@ -584,6 +598,7 @@ def _get_columns_by_label(self, labels, downcast=False): else self.__class__(dtype=self.dtype, name=self.name) ) + @annotate("SERIES_DROP", color="green", domain="cudf_python") def drop( self, labels=None, @@ -708,6 +723,7 @@ def drop( if not inplace: return out + @annotate("SERIES_APPEND", color="green", domain="cudf_python") def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. If ``ignore_index=True``, the index is reset. 
@@ -794,6 +810,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): return cudf.concat(to_concat, ignore_index=ignore_index) + @annotate("SERIES_REINDEX", color="green", domain="cudf_python") def reindex(self, index=None, copy=True): """Return a Series that conforms to a new index @@ -829,6 +846,7 @@ def reindex(self, index=None, copy=True): series.name = self.name return series + @annotate("SERIES_RESET_INDEX", color="green", domain="cudf_python") @docutils.doc_apply( doc_reset_index_template.format( klass="Series", @@ -910,6 +928,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): inplace=inplace, ) + @annotate("SERIES_TO_FRAME", color="green", domain="cudf_python") def to_frame(self, name=None): """Convert Series into a DataFrame @@ -952,9 +971,11 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) + @annotate("SERIES_MEMORY_USAGE", color="green", domain="cudf_python") def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) + @annotate("SERIES_ARRAY_FUNCTION", color="green", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all(issubclass(t, Series) for t in types): return NotImplemented @@ -1010,6 +1031,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented + @annotate("SERIES_MAP", color="green", domain="cudf_python") def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. 
@@ -1111,6 +1133,7 @@ def map(self, arg, na_action=None) -> "Series": result = self.applymap(arg) return result + @annotate("SERIES_GETITEM", color="green", domain="cudf_python") def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] @@ -1121,6 +1144,7 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ + @annotate("SERIES_SETITEM", color="green", domain="cudf_python") def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value @@ -1272,6 +1296,7 @@ def _make_operands_and_index_for_binop( operands = lhs._make_operands_for_binop(other, fill_value, reflect) return operands, lhs._index + @annotate("SERIES_LOGICAL_AND", color="green", domain="cudf_python") def logical_and(self, other): warnings.warn( "Series.logical_and is deprecated and will be removed.", @@ -1279,6 +1304,7 @@ def logical_and(self, other): ) return self._binaryop(other, "__l_and__").astype(np.bool_) + @annotate("SERIES_REMAINDER", color="green", domain="cudf_python") def remainder(self, other): warnings.warn( "Series.remainder is deprecated and will be removed.", @@ -1286,6 +1312,7 @@ def remainder(self, other): ) return self._binaryop(other, "__mod__") + @annotate("SERIES_LOGICAL_OR", color="green", domain="cudf_python") def logical_or(self, other): warnings.warn( "Series.logical_or is deprecated and will be removed.", @@ -1293,6 +1320,7 @@ def logical_or(self, other): ) return self._binaryop(other, "__l_or__").astype(np.bool_) + @annotate("SERIES_LOGICAL_NOT", color="green", domain="cudf_python") def logical_not(self): warnings.warn( "Series.logical_not is deprecated and will be removed.", @@ -1302,30 +1330,36 @@ def logical_not(self): @copy_docstring(CategoricalAccessor) # type: ignore @property + @annotate("SERIES_CAT", color="green", domain="cudf_python") def cat(self): return CategoricalAccessor(parent=self) @copy_docstring(StringMethods) # type: ignore @property + @annotate("SERIES_STR", color="green", domain="cudf_python") def 
str(self): return StringMethods(parent=self) @copy_docstring(ListMethods) # type: ignore @property + @annotate("SERIES_LIST", color="green", domain="cudf_python") def list(self): return ListMethods(parent=self) @copy_docstring(StructMethods) # type: ignore @property + @annotate("SERIES_STRUCT", color="green", domain="cudf_python") def struct(self): return StructMethods(parent=self) - @property + @property # type: ignore + @annotate("SERIES_DTYPE", color="green", domain="cudf_python") def dtype(self): """dtype of the Series""" return self._column.dtype @classmethod + @annotate("SERIES_CONCAT", color="green", domain="cudf_python") def _concat(cls, objs, axis=0, index=True): # Concatenate index if not provided if index is True: @@ -1395,22 +1429,26 @@ def _concat(cls, objs, axis=0, index=True): return cls(data=col, index=index, name=name) - @property + @property # type: ignore + @annotate("SERIES_VALID_COUNT", color="green", domain="cudf_python") def valid_count(self): """Number of non-null values""" return self._column.valid_count - @property + @property # type: ignore + @annotate("SERIES_NULL_COUNT", color="green", domain="cudf_python") def null_count(self): """Number of null values""" return self._column.null_count - @property + @property # type: ignore + @annotate("SERIES_NULLABLE", color="green", domain="cudf_python") def nullable(self): """A boolean indicating whether a null-mask is needed""" return self._column.nullable - @property + @property # type: ignore + @annotate("SERIES_HAS_NULLS", color="green", domain="cudf_python") def has_nulls(self): """ Indicator whether Series contains null values. @@ -1439,13 +1477,14 @@ def has_nulls(self): """ return self._column.has_nulls() + @annotate("SERIES_DROPNA", color="green", domain="cudf_python") def dropna(self, axis=0, inplace=False, how=None): """ Return a Series with null values removed. 
Parameters ---------- - axis : {0 or ‘index’}, default 0 + axis : {0 or 'index'}, default 0 There is only one axis to drop values from. inplace : bool, default False If True, do operation inplace and return None. @@ -1518,6 +1557,7 @@ def dropna(self, axis=0, inplace=False, how=None): return self._mimic_inplace(result, inplace=inplace) + @annotate("SERIES_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): """ Return Series with duplicate values removed. @@ -1553,9 +1593,9 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): Name: animal, dtype: object With the `keep` parameter, the selection behaviour of duplicated - values can be changed. The value ‘first’ keeps the first + values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. - The default value of keep is ‘first’. Note that order of + The default value of keep is 'first'. Note that order of the rows being returned is not guaranteed to be sorted. @@ -1566,7 +1606,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): 0 lama Name: animal, dtype: object - The value ‘last’ for parameter `keep` keeps the last occurrence + The value 'last' for parameter `keep` keeps the last occurrence for each set of duplicated entries. 
>>> s.drop_duplicates(keep='last') @@ -1591,6 +1631,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) + @annotate("SERIES_FILLNA", color="green", domain="cudf_python") def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1614,6 +1655,7 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) + @annotate("SERIES_ALL", color="green", domain="cudf_python") def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1621,6 +1663,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): ) return super().all(axis, skipna, level, **kwargs) + @annotate("SERIES_ANY", color="green", domain="cudf_python") def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1628,6 +1671,7 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): ) return super().any(axis, skipna, level, **kwargs) + @annotate("SERIES_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, index=True, nullable=False, **kwargs): """ Convert to a Pandas Series. 
@@ -1691,7 +1735,8 @@ def to_pandas(self, index=True, nullable=False, **kwargs): s.name = self.name return s - @property + @property # type: ignore + @annotate("SERIES_DATA", color="green", domain="cudf_python") def data(self): """The gpu buffer for the data @@ -1717,11 +1762,13 @@ def data(self): """ # noqa: E501 return self._column.data - @property + @property # type: ignore + @annotate("SERIES_NULLMASK", color="green", domain="cudf_python") def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) + @annotate("SERIES_ASTYPE", color="green", domain="cudf_python") def astype(self, dtype, copy=False, errors="raise"): """ Cast the Series to the given dtype @@ -1828,11 +1875,13 @@ def astype(self, dtype, copy=False, errors="raise"): pass return self + @annotate("SERIES_SORT_INDEX", color="green", domain="cudf_python") def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") return super().sort_index(axis=axis, *args, **kwargs) + @annotate("SERIES_SORT_VALUES", color="green", domain="cudf_python") def sort_values( self, axis=0, @@ -1887,6 +1936,7 @@ def sort_values( ignore_index=ignore_index, ) + @annotate("SERIES_NLARGEST", color="green", domain="cudf_python") def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -1949,6 +1999,7 @@ def nlargest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(True, n, [self.name], keep) + @annotate("SERIES_NSMALLEST", color="green", domain="cudf_python") def nsmallest(self, n=5, keep="first"): """ Returns a new Series of the *n* smallest element. 
@@ -2024,6 +2075,7 @@ def nsmallest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(False, n, [self.name], keep) + @annotate("SERIES_ARGSORT", color="green", domain="cudf_python") def argsort( self, axis=0, @@ -2046,6 +2098,7 @@ def argsort( obj.name = self.name return obj + @annotate("SERIES_REPLACE", color="green", domain="cudf_python") def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: raise ValueError( @@ -2055,6 +2108,7 @@ def replace(self, to_replace=None, value=None, *args, **kwargs): return super().replace(to_replace, value, *args, **kwargs) + @annotate("SERIES_UPDATE", color="green", domain="cudf_python") def update(self, other): """ Modify Series in place using values from passed Series. @@ -2159,6 +2213,7 @@ def update(self, other): self.mask(mask, other, inplace=True) + @annotate("SERIES_LABEL_ENCODING", color="green", domain="cudf_python") def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): @@ -2202,6 +2257,7 @@ def _return_sentinel_series(): return codes # UDF related + @annotate("SERIES_APPLY", color="green", domain="cudf_python") def apply(self, func, convert_dtype=True, args=(), **kwargs): """ Apply a scalar function to the values of a Series. @@ -2290,6 +2346,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): raise ValueError("Series.apply only supports convert_dtype=True") return self._apply(func, _get_scalar_kernel, *args, **kwargs) + @annotate("SERIES_APPLY_MAP", color="green", domain="cudf_python") def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. 
@@ -2405,6 +2462,7 @@ def applymap(self, udf, out_dtype=None): # # Stats # + @annotate("SERIES_COUNT", color="green", domain="cudf_python") def count(self, level=None, **kwargs): """ Return number of non-NA/null observations in the Series @@ -2431,6 +2489,7 @@ def count(self, level=None, **kwargs): return self.valid_count + @annotate("SERIES_MODE", color="green", domain="cudf_python") def mode(self, dropna=True): """ Return the mode(s) of the dataset. @@ -2499,6 +2558,7 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) + @annotate("SERIES_ROUND", color="green", domain="cudf_python") def round(self, decimals=0, how="half_even"): if not is_integer(decimals): raise ValueError( @@ -2507,6 +2567,7 @@ def round(self, decimals=0, how="half_even"): decimals = int(decimals) return super().round(decimals, how) + @annotate("SERIES_COV", color="green", domain="cudf_python") def cov(self, other, min_periods=None): """ Compute covariance with Series, excluding missing values. @@ -2556,6 +2617,7 @@ def cov(self, other, min_periods=None): f"{other.dtype}" ) + @annotate("SERIES_TRANSPOSE", color="green", domain="cudf_python") def transpose(self): """Return the transpose, which is by definition self. """ @@ -2564,6 +2626,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @annotate("SERIES_CORR", color="green", domain="cudf_python") def corr(self, other, method="pearson", min_periods=None): """Calculates the sample correlation between two Series, excluding missing values. @@ -2597,6 +2660,7 @@ def corr(self, other, method="pearson", min_periods=None): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) + @annotate("SERIES_AUTOCORR", color="green", domain="cudf_python") def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson correlation between the Series and its shifted self. 
@@ -2622,6 +2686,7 @@ def autocorr(self, lag=1): """ return self.corr(self.shift(lag)) + @annotate("SERIES_ISIN", color="green", domain="cudf_python") def isin(self, values): """Check whether values are contained in Series. @@ -2691,6 +2756,7 @@ def isin(self, values): {self.name: self._column.isin(values)}, index=self.index ) + @annotate("SERIES_UNIQUE", color="green", domain="cudf_python") def unique(self): """ Returns unique values of this Series. @@ -2723,6 +2789,7 @@ def unique(self): res = self._column.unique() return Series(res, name=self.name) + @annotate("SERIES_VALUE_COUNTS", color="green", domain="cudf_python") def value_counts( self, normalize=False, @@ -2845,6 +2912,7 @@ def value_counts( res = res / float(res._column.sum()) return res + @annotate("SERIES_QUANTILE", color="green", domain="cudf_python") def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): @@ -2909,6 +2977,7 @@ def quantile( return Series(result, index=index, name=self.name) @docutils.doc_describe() + @annotate("SERIES_DESCRIBE", color="green", domain="cudf_python") def describe( self, percentiles=None, @@ -3064,6 +3133,7 @@ def _describe_timestamp(self): else: return _describe_categorical(self) + @annotate("SERIES_DIGITIZE", color="green", domain="cudf_python") def digitize(self, bins, right=False): """Return the indices of the bins to which each value in series belongs. @@ -3099,6 +3169,7 @@ def digitize(self, bins, right=False): cudf.core.column.numerical.digitize(self._column, bins, right) ) + @annotate("SERIES_DIFF", color="green", domain="cudf_python") def diff(self, periods=1): """Calculate the difference between values at positions i and i - N in an array and store the output in a new array. 
@@ -3187,6 +3258,7 @@ def diff(self, periods=1): return Series(output_col, name=self.name, index=self.index) @copy_docstring(SeriesGroupBy) + @annotate("SERIES_GROUPBY", color="green", domain="cudf_python") def groupby( self, by=None, @@ -3232,6 +3304,7 @@ def groupby( ) ) + @annotate("SERIES_RENAME", color="green", domain="cudf_python") def rename(self, index=None, copy=True): """ Alter Series name @@ -3277,6 +3350,7 @@ def rename(self, index=None, copy=True): out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) + @annotate("SERIES_MERGE", color="green", domain="cudf_python") def merge( self, other, @@ -3328,18 +3402,21 @@ def merge( return result + @annotate("SERIES_ADD_PREFIX", color="green", domain="cudf_python") def add_prefix(self, prefix): return Series._from_data( data=self._data.copy(deep=True), index=prefix + self.index.astype(str), ) + @annotate("SERIES_ADD_SUFFIX", color="green", domain="cudf_python") def add_suffix(self, suffix): return Series._from_data( data=self._data.copy(deep=True), index=self.index.astype(str) + suffix, ) + @annotate("SERIES_KEYS", color="green", domain="cudf_python") def keys(self): """ Return alias for index. @@ -3383,6 +3460,7 @@ def keys(self): """ return self.index + @annotate("SERIES_EXPLODE", color="green", domain="cudf_python") def explode(self, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -3424,6 +3502,7 @@ def explode(self, ignore_index=False): return super()._explode(self._column_names[0], ignore_index) + @annotate("SERIES_PCT_CHANGE", color="green", domain="cudf_python") def pct_change( self, periods=1, fill_method="ffill", limit=None, freq=None ): @@ -3575,7 +3654,8 @@ class DatetimeProperties: def __init__(self, series): self.series = series - @property + @property # type: ignore + @annotate("SERIES_DT_YEAR", color="green", domain="cudf_python") def year(self): """ The year of the datetime. 
@@ -3599,7 +3679,8 @@ def year(self): """ return self._get_dt_field("year") - @property + @property # type: ignore + @annotate("SERIES_DT_MONTH", color="green", domain="cudf_python") def month(self): """ The month as January=1, December=12. @@ -3623,7 +3704,8 @@ def month(self): """ return self._get_dt_field("month") - @property + @property # type: ignore + @annotate("SERIES_DT_DAY", color="green", domain="cudf_python") def day(self): """ The day of the datetime. @@ -3647,7 +3729,8 @@ def day(self): """ return self._get_dt_field("day") - @property + @property # type: ignore + @annotate("SERIES_DT_HOUR", color="green", domain="cudf_python") def hour(self): """ The hours of the datetime. @@ -3671,7 +3754,8 @@ def hour(self): """ return self._get_dt_field("hour") - @property + @property # type: ignore + @annotate("SERIES_DT_MINUTE", color="green", domain="cudf_python") def minute(self): """ The minutes of the datetime. @@ -3695,7 +3779,8 @@ def minute(self): """ return self._get_dt_field("minute") - @property + @property # type: ignore + @annotate("SERIES_DT_SECOND", color="green", domain="cudf_python") def second(self): """ The seconds of the datetime. @@ -3719,7 +3804,8 @@ def second(self): """ return self._get_dt_field("second") - @property + @property # type: ignore + @annotate("SERIES_DT_WEEKDAY", color="green", domain="cudf_python") def weekday(self): """ The day of the week with Monday=0, Sunday=6. @@ -3755,7 +3841,8 @@ def weekday(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @annotate("SERIES_DT_DAYOFWEEK", color="green", domain="cudf_python") def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. 
@@ -3791,7 +3878,8 @@ def dayofweek(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @annotate("SERIES_DT_DAYOFYEAR", color="green", domain="cudf_python") def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -3828,7 +3916,8 @@ def dayofyear(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @annotate("SERIES_DT_DAY_OF_YEAR", color="green", domain="cudf_python") def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -3865,7 +3954,8 @@ def day_of_year(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @annotate("SERIES_DT_IS_LEAP_YEAR", color="green", domain="cudf_python") def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -3923,7 +4013,8 @@ def is_leap_year(self): name=self.series.name, ) - @property + @property # type: ignore + @annotate("SERIES_DT_QUARTER", color="green", domain="cudf_python") def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -3954,6 +4045,7 @@ def quarter(self): {None: res}, index=self.series._index, name=self.series.name, ) + @annotate("SERIES_DT_ISOCALENDAR", color="green", domain="cudf_python") def isocalendar(self): """ Returns a DataFrame with the year, week, and day @@ -3997,14 +4089,16 @@ def isocalendar(self): """ return cudf.core.tools.datetimes._to_iso_calendar(self) - @property + @property # type: ignore + @annotate("SERIES_DT_IS_MONTH_START", color="green", domain="cudf_python") def is_month_start(self): """ Booleans indicating if dates are the first day of the month. """ return (self.day == 1).fillna(False) - @property + @property # type: ignore + @annotate("SERIES_DT_DAYS_IN_MONTH", color="green", domain="cudf_python") def days_in_month(self): """ Get the total number of days in the month that the date falls on. 
@@ -4055,7 +4149,8 @@ def days_in_month(self): name=self.series.name, ) - @property + @property # type: ignore + @annotate("SERIES_DT_IS_MONTH_END", color="green", domain="cudf_python") def is_month_end(self): """ Boolean indicator if the date is the last day of the month. @@ -4101,7 +4196,10 @@ def is_month_end(self): ) return (self.day == last_day.dt.day).fillna(False) - @property + @property # type: ignore + @annotate( + "SERIES_DT_IS_EQUARTER_START", color="green", domain="cudf_python" + ) def is_quarter_start(self): """ Boolean indicator if the date is the first day of a quarter. @@ -4147,7 +4245,8 @@ def is_quarter_start(self): {None: result}, index=self.series._index, name=self.series.name, ) - @property + @property # type: ignore + @annotate("SERIES_DT_IS_QUARTER_END", color="green", domain="cudf_python") def is_quarter_end(self): """ Boolean indicator if the date is the last day of a quarter. @@ -4195,7 +4294,8 @@ def is_quarter_end(self): {None: result}, index=self.series._index, name=self.series.name, ) - @property + @property # type: ignore + @annotate("SERIES_DT_IS_YEAR_START", color="green", domain="cudf_python") def is_year_start(self): """ Boolean indicator if the date is the first day of the year. @@ -4229,7 +4329,8 @@ def is_year_start(self): name=self.series.name, ) - @property + @property # type: ignore + @annotate("SERIES_DT_IS_YEAR_END", color="green", domain="cudf_python") def is_year_end(self): """ Boolean indicator if the date is the last day of the year. 
@@ -4265,12 +4366,14 @@ def is_year_end(self): {None: result}, index=self.series._index, name=self.series.name, ) + @annotate("SERIES_DT_GET_DT_FIELD", color="green", domain="cudf_python") def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( data=out_column, index=self.series._index, name=self.series.name ) + @annotate("SERIES_DT_CEIL", color="green", domain="cudf_python") def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -4307,6 +4410,7 @@ def ceil(self, freq): data={self.series.name: out_column}, index=self.series._index ) + @annotate("SERIES_DT_FLOOR", color="green", domain="cudf_python") def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -4343,6 +4447,7 @@ def floor(self, freq): data={self.series.name: out_column}, index=self.series._index ) + @annotate("SERIES_DT_ROUND", color="green", domain="cudf_python") def round(self, freq): """ Perform round operation on the data to the specified freq. @@ -4382,6 +4487,7 @@ def round(self, freq): data={self.series.name: out_column}, index=self.series._index ) + @annotate("SERIES_DT_STRFTIME", color="green", domain="cudf_python") def strftime(self, date_format, *args, **kwargs): """ Convert to Series using specified ``date_format``. @@ -4535,7 +4641,8 @@ class TimedeltaProperties: def __init__(self, series): self.series = series - @property + @property # type: ignore + @annotate("SERIES_TD_DAYS", color="green", domain="cudf_python") def days(self): """ Number of days. @@ -4566,7 +4673,8 @@ def days(self): """ return self._get_td_field("days") - @property + @property # type: ignore + @annotate("SERIES_TD_SECONDS", color="green", domain="cudf_python") def seconds(self): """ Number of seconds (>= 0 and less than 1 day). 
@@ -4604,7 +4712,8 @@ def seconds(self): """ return self._get_td_field("seconds") - @property + @property # type: ignore + @annotate("SERIES_TD_MICROSECONDS", color="green", domain="cudf_python") def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second). @@ -4635,7 +4744,8 @@ def microseconds(self): """ return self._get_td_field("microseconds") - @property + @property # type: ignore + @annotate("SERIES_TD_NANOSECONDS", color="green", domain="cudf_python") def nanoseconds(self): """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -4666,7 +4776,8 @@ def nanoseconds(self): """ return self._get_td_field("nanoseconds") - @property + @property # type: ignore + @annotate("SERIES_TD_COMPONENTS", color="green", domain="cudf_python") def components(self): """ Return a Dataframe of the components of the Timedeltas. @@ -4695,6 +4806,7 @@ def components(self): """ # noqa: E501 return self.series._column.components(index=self.series._index) + @annotate("SERIES_TD_GET_TD_FIELD", color="green", domain="cudf_python") def _get_td_field(self, field): out_column = getattr(self.series._column, field) return Series( @@ -4702,6 +4814,7 @@ def _get_td_field(self, field): ) +@annotate("SERIES__ALIGN_INDICES", color="green", domain="cudf_python") def _align_indices(series_list, how="outer", allow_non_unique=False): """ Internal util to align the indices of a list of Series objects @@ -4769,6 +4882,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): return result +@annotate("CUDF_ISCLOSE", color="green", domain="cudf_python") def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): """Returns a boolean array where two arrays are equal within a tolerance. 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 508bbfb3a9a..666b743f7ef 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -18,6 +18,7 @@ import cupy import numpy as np import pandas as pd +from nvtx import annotate import cudf from cudf._typing import Dtype @@ -42,6 +43,7 @@ class SingleColumnFrame(Frame, NotIterable): "index": 0, } + @annotate("SINGLECOLUMNFRAME_REDUCE", color="green", domain="cudf_python") def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -60,6 +62,7 @@ def _reduce( except AttributeError: raise TypeError(f"cannot perform {op} with type {self.dtype}") + @annotate("SINGLECOLUMNFRAME_SCAN", color="green", domain="cudf_python") def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -67,6 +70,9 @@ def _scan(self, op, axis=None, *args, **kwargs): return super()._scan(op, axis=axis, *args, **kwargs) @classmethod + @annotate( + "SINGLECOLUMNFRAME_FROM_DATA", color="green", domain="cudf_python" + ) def _from_data( cls, data: MutableMapping, @@ -79,21 +85,27 @@ def _from_data( out.name = name return out - @property + @property # type: ignore + @annotate("SINGLECOLUMNFRAME_NAME", color="green", domain="cudf_python") def name(self): """Get the name of this object.""" return next(iter(self._data.names)) - @name.setter + @name.setter # type: ignore + @annotate( + "SINGLECOLUMNFRAME_NAME_SETTER", color="green", domain="cudf_python" + ) def name(self, value): self._data[value] = self._data.pop(self.name) - @property + @property # type: ignore + @annotate("SINGLECOLUMNFRAME_NDIM", color="green", domain="cudf_python") def ndim(self): """Get the dimensionality (always 1 for single-columned frames).""" return 1 - @property + @property # type: ignore + @annotate("SINGLECOLUMNFRAME_SHAPE", color="green", domain="cudf_python") 
def shape(self): """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -104,26 +116,38 @@ def __bool__(self): "a.empty, a.bool(), a.item(), a.any() or a.all()." ) - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_NUM_COLUMNS", color="green", domain="cudf_python" + ) def _num_columns(self): return 1 - @property + @property # type: ignore + @annotate("SINGLECOLUMNFRAME_COLUMN", color="green", domain="cudf_python") def _column(self): return self._data[self.name] - @_column.setter + @_column.setter # type: ignore + @annotate( + "SINGLECOLUMNFRAME_COLUMN_SETTER", color="green", domain="cudf_python" + ) def _column(self, value): self._data[self.name] = value - @property + @property # type: ignore + @annotate("SINGLECOLUMNFRAME_VALUES", color="green", domain="cudf_python") def values(self): # noqa: D102 return self._column.values - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_VALUES_HOST", color="green", domain="cudf_python" + ) def values_host(self): # noqa: D102 return self._column.values_host + @annotate("SINGLECOLUMNFRAME_TO_CUPY", color="green", domain="cudf_python") def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -132,6 +156,9 @@ def to_cupy( ) -> cupy.ndarray: # noqa: D102 return super().to_cupy(dtype, copy, na_value).flatten() + @annotate( + "SINGLECOLUMNFRAME_TO_NUMPY", color="green", domain="cudf_python" + ) def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -151,6 +178,9 @@ def tolist(self): # noqa: D102 to_list = tolist @classmethod + @annotate( + "SINGLECOLUMNFRAME_FROM_ARROW", color="green", domain="cudf_python" + ) def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. @@ -181,6 +211,9 @@ def from_arrow(cls, array): """ return cls(ColumnBase.from_arrow(array)) + @annotate( + "SINGLECOLUMNFRAME_TO_ARROW", color="green", domain="cudf_python" + ) def to_arrow(self): """ Convert to a PyArrow Array. 
@@ -211,7 +244,10 @@ def to_arrow(self): """ return self._column.to_arrow() - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_IS_UNIQUE", color="green", domain="cudf_python" + ) def is_unique(self): """Return boolean if values in the object are unique. @@ -221,7 +257,10 @@ def is_unique(self): """ return self._column.is_unique - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_IS_MONOTONIC", color="green", domain="cudf_python" + ) def is_monotonic(self): """Return boolean if values in the object are monotonically increasing. @@ -233,7 +272,12 @@ def is_monotonic(self): """ return self.is_monotonic_increasing - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_IS_MONOTONIC_INCREASING", + color="green", + domain="cudf_python", + ) def is_monotonic_increasing(self): """Return boolean if values in the object are monotonically increasing. @@ -243,7 +287,12 @@ def is_monotonic_increasing(self): """ return self._column.is_monotonic_increasing - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_IS_MONOTONIC_DECREASING", + color="green", + domain="cudf_python", + ) def is_monotonic_decreasing(self): """Return boolean if values in the object are monotonically decreasing. @@ -253,10 +302,18 @@ def is_monotonic_decreasing(self): """ return self._column.is_monotonic_decreasing - @property + @property # type: ignore + @annotate( + "SINGLECOLUMNFRAME_CUDA_ARRAY_INTERFACE", + color="green", + domain="cudf_python", + ) def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ + @annotate( + "SINGLECOLUMNFRAME_FACTORIZE", color="green", domain="cudf_python" + ) def factorize(self, na_sentinel=-1): """Encode the input values as integer labels. 
@@ -284,6 +341,11 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) + @annotate( + "SINGLECOLUMNFRAME_MAKE_OPERANDS_FOR_BINOP", + color="green", + domain="cudf_python", + ) def _make_operands_for_binop( self, other: Any, @@ -337,6 +399,7 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} + @annotate("SINGLECOLUMNFRAME_NUNIQUE", color="green", domain="cudf_python") def nunique(self, method: builtins.str = "sort", dropna: bool = True): """ Return count of unique values for the column. From 1e5b01f1f1e4e97b930de45074dc7109e22a9744 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 4 Mar 2022 10:27:13 -0800 Subject: [PATCH 9/9] Rewrites `sample` API (#10262) This PR rewrites the sample API. On the functional side, this API now accepts a cupy random state or a numpy random state. If a host (numpy) random state is accepted, the sampled rows should match the result from pandas given the same initial state and operation sequence. On the other hand, if given a device random state, we should expect higher performance when the sample count is large. Syntactically, this PR refactors the existing code into two sub-methods that deal with the different axes to sample from. The sub-methods are type annotated. Sampling from `cudf.Index/cudf.MultiIndex` is deprecated. This PR is breaking because: 1. Users who previously called the `sample` API now get different rows. 2. To align with the pandas API, `keep_index` is renamed to `ignore_index` and its semantics are negated. The current implementation does not depend on `libcudf.copying.sample`, thus the cython bindings are removed. Performance: at 10K rows, this PR is 39% slower than current. This amounts to 0.3 ms. At 100M rows, this PR is 7% slower using a cupy random state.
Benchmark Axis=0 ``` -------------------------------------------------------------------------------------- benchmark 'axis=0': 6 tests --------------------------------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers Rounds ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- sample_df[size100M-AxisIndex-CupyRandomState] (afte) 296.7751 (455.90) 299.2855 (401.57) 297.9519 (448.88) 1.1162 (94.15) 297.7824 (451.66) 2.0472 (192.32) 2;0 5 sample_df[size100M-AxisIndex-NumpyRandomState] (afte) 4,435.3055 (>1000.0) 4,717.0815 (>1000.0) 4,507.1635 (>1000.0) 119.8772 (>1000.0) 4,452.5009 (>1000.0) 115.2876 (>1000.0) 1;0 5 sample_df[size100M-AxisIndex-NumpyRandomState] (befo) 276.1754 (424.26) 276.4792 (370.97) 276.2995 (416.26) 0.1258 (10.61) 276.3024 (419.08) 0.2010 (18.88) 1;0 5 sample_df[size10K-AxisIndex-CupyRandomState] (afte) 1.0789 (1.66) 1.2420 (1.67) 1.1238 (1.69) 0.0683 (5.76) 1.0962 (1.66) 0.0721 (6.77) 1;0 5 sample_df[size10K-AxisIndex-NumpyRandomState] (afte) 0.9018 (1.39) 1.1441 (1.54) 0.9140 (1.38) 0.0182 (1.54) 0.9094 (1.38) 0.0106 (1.0) 11;11 346 sample_df[size10K-AxisIndex-NumpyRandomState] (befo) 0.6510 (1.0) 0.7453 (1.0) 0.6638 (1.0) 0.0119 (1.0) 0.6593 (1.0) 0.0108 (1.01) 76;44 638 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ```
On `axis=1` sampling, this PR is faster than the current implementation if a numpy random state is provided for the `random_state` parameter, but slower if a seed is provided instead.
Benchmark axis=1 ``` --------------------------------------------------------------------------------- benchmark 'axis=1': 6 tests ---------------------------------------------------------------------------------- Name (time in us) Min Max Mean StdDev Median IQR Outliers Rounds ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ sample_df[size100M-AxisColumn-NumpyRandomState] (afte) 173.2660 (1.0) 290.5080 (1.14) 178.2199 (1.0) 8.0913 (1.58) 175.7130 (1.0) 2.0767 (1.73) 227;419 2707 sample_df[size100M-AxisColumn-Seed] (afte) 441.9110 (2.55) 617.1150 (2.42) 452.4197 (2.54) 14.1272 (2.76) 447.1345 (2.54) 7.9060 (6.59) 151;162 1484 sample_df[size100M-AxisColumn-Seed] (befo) 297.1560 (1.72) 477.1500 (1.87) 307.8915 (1.73) 17.2036 (3.36) 300.5620 (1.71) 9.4080 (7.85) 159;168 1695 sample_df[size10K-AxisColumn-NumpyRandomState] (afte) 176.6440 (1.02) 254.9110 (1.0) 180.0217 (1.01) 5.1152 (1.0) 178.8940 (1.02) 1.1990 (1.0) 226;405 3542 sample_df[size10K-AxisColumn-Seed] (afte) 451.6370 (2.61) 689.8120 (2.71) 465.9937 (2.61) 14.3921 (2.81) 463.0710 (2.64) 6.7365 (5.62) 62;91 1183 sample_df[size10K-AxisColumn-Seed] (befo) 309.4000 (1.79) 413.9080 (1.62) 316.5210 (1.78) 7.6379 (1.49) 315.2130 (1.79) 5.4100 (4.51) 66;42 826 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ ```
Part of #10153 Authors: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10262 --- python/cudf/cudf/_lib/copying.pyx | 26 --- python/cudf/cudf/_lib/cpp/copying.pxd | 9 +- python/cudf/cudf/core/_base_index.py | 22 +++ python/cudf/cudf/core/dataframe.py | 27 +++ python/cudf/cudf/core/frame.py | 195 +------------------ python/cudf/cudf/core/indexed_frame.py | 196 +++++++++++++++++++ python/cudf/cudf/tests/conftest.py | 102 ++++++++++ python/cudf/cudf/tests/test_dataframe.py | 236 ++++++++++++++--------- python/cudf/cudf/tests/test_index.py | 3 + python/cudf/cudf/tests/test_series.py | 6 + 10 files changed, 499 insertions(+), 323 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index d8836738adb..2f18c904c05 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -656,32 +656,6 @@ def get_element(Column input_column, size_type index): ) -def sample(input, size_type n, - bool replace, int64_t seed, bool keep_index=True): - cdef table_view tbl_view = table_view_from_table(input, not keep_index) - cdef cpp_copying.sample_with_replacement replacement - - if replace: - replacement = cpp_copying.sample_with_replacement.TRUE - else: - replacement = cpp_copying.sample_with_replacement.FALSE - - cdef unique_ptr[table] c_output - with nogil: - c_output = move( - cpp_copying.sample(tbl_view, n, replacement, seed) - ) - - return data_from_unique_ptr( - move(c_output), - column_names=input._column_names, - index_names=( - None if keep_index is False - else input._index_names - ) - ) - - def segmented_gather(Column source_column, Column gather_map): cdef shared_ptr[lists_column_view] source_LCV = ( make_shared[lists_column_view](source_column.view()) diff --git 
a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index be1b6d8069c..a1c433774b5 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool @@ -175,10 +175,3 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: ctypedef enum sample_with_replacement: FALSE 'cudf::sample_with_replacement::FALSE', TRUE 'cudf::sample_with_replacement::TRUE', - - cdef unique_ptr[table] sample ( - table_view input, - size_type n, - sample_with_replacement replacement, - int64_t seed - ) except + diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b7b61e4d332..c3607e56b4f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +import warnings from functools import cached_property from typing import Any, Set @@ -1528,6 +1529,27 @@ def _split_columns_by_levels(self, levels): [], ) + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ignore_index=False, + ): + warnings.warn( + "Index.sample is deprecated and will be removed.", FutureWarning, + ) + return cudf.core.index._index_from_data( + self.to_frame() + .sample( + n, frac, replace, weights, random_state, axis, ignore_index + ) + ._data + ) + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a820b527d3f..46e97ae8ee4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -36,6 +36,7 @@ import cudf import cudf.core.common from cudf import _lib as libcudf +from cudf._typing import ColumnLike from cudf.api.types import ( 
_is_scalar_or_zero_d_array, is_bool_dtype, @@ -6322,6 +6323,32 @@ def nunique(self, axis=0, dropna=True): return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def _sample_axis_1( + self, + n: int, + weights: Optional[ColumnLike], + replace: bool, + random_state: np.random.RandomState, + ignore_index: bool, + ): + if replace: + # Since cuDF does not support multiple columns with same name, + # sample with replace=True at axis 1 is unsupported. + raise NotImplementedError( + "Sample is not supported for axis 1/`columns` when" + "`replace=True`." + ) + + sampled_column_labels = random_state.choice( + self._column_names, size=n, replace=False, p=weights + ) + + result = self._get_columns_by_label(sampled_column_labels) + if ignore_index: + result.reset_index(drop=True) + + return result + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 87f4ed0bbc4..e99892749d7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -50,7 +50,7 @@ from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import find_common_type, is_column_like +from cudf.utils.dtypes import find_common_type T = TypeVar("T", bound="Frame") @@ -1659,199 +1659,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") - def sample( - self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None, - keep_index=True, - ): - """Return a random sample of items from an axis of object. - - You can use random_state for reproducibility. - - Parameters - ---------- - n : int, optional - Number of items from axis to return. Cannot be used with frac. - Default = 1 if frac = None. 
- frac : float, optional - Fraction of axis items to return. Cannot be used with n. - replace : bool, default False - Allow or disallow sampling of the same row more than once. - replace == True is not yet supported for axis = 1/"columns" - weights : str or ndarray-like, optional - Only supported for axis=1/"columns" - random_state : int, numpy RandomState or None, default None - Seed for the random number generator (if int), or None. - If None, a random seed will be chosen. - if RandomState, seed will be extracted from current state. - axis : {0 or ‘index’, 1 or ‘columns’, None}, default None - Axis to sample. Accepts axis number or name. - Default is stat axis for given data type - (0 for Series and DataFrames). Series and Index doesn't - support axis=1. - - Returns - ------- - Series or DataFrame or Index - A new object of same type as caller containing n items - randomly sampled from the caller object. - - Examples - -------- - >>> import cudf as cudf - >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) - >>> df.sample(3) - a - 1 2 - 3 4 - 0 1 - - >>> sr = cudf.Series([1, 2, 3, 4, 5]) - >>> sr.sample(10, replace=True) - 1 4 - 3 1 - 2 4 - 0 5 - 0 1 - 4 5 - 4 1 - 0 2 - 0 3 - 3 2 - dtype: int64 - - >>> df = cudf.DataFrame( - ... {"a":[1, 2], "b":[2, 3], "c":[3, 4], "d":[4, 5]}) - >>> df.sample(2, axis=1) - a c - 0 1 3 - 1 2 4 - """ - - if frac is not None and frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` " - "when upsampling the population `frac` > 1." 
- ) - elif frac is not None and n is not None: - raise ValueError( - "Please enter a value for `frac` OR `n`, not both" - ) - - if frac is None and n is None: - n = 1 - elif frac is not None: - if axis is None or axis == 0 or axis == "index": - n = int(round(self.shape[0] * frac)) - else: - n = int(round(self.shape[1] * frac)) - - if axis is None or axis == 0 or axis == "index": - if n > 0 and self.shape[0] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - - if not replace and n > self.shape[0]: - raise ValueError( - "Cannot take a larger sample than population " - "when 'replace=False'" - ) - - if weights is not None: - raise NotImplementedError( - "weights is not yet supported for axis=0/index" - ) - - if random_state is None: - seed = np.random.randint( - np.iinfo(np.int64).max, dtype=np.int64 - ) - elif isinstance(random_state, np.random.mtrand.RandomState): - _, keys, pos, _, _ = random_state.get_state() - seed = 0 if pos >= len(keys) else pos - else: - seed = np.int64(random_state) - - result = self.__class__._from_data( - *libcudf.copying.sample( - self, - n=n, - replace=replace, - seed=seed, - keep_index=keep_index, - ) - ) - result._copy_type_metadata(self) - - return result - else: - if len(self.shape) != 2: - raise ValueError( - f"No axis named {axis} for " - f"object type {self.__class__}" - ) - - if replace: - raise NotImplementedError( - "Sample is not supported for " - f"axis {axis} when 'replace=True'" - ) - - if n > 0 and self.shape[1] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - - columns = np.asarray(self._data.names) - if not replace and n > columns.size: - raise ValueError( - "Cannot take a larger sample " - "than population when 'replace=False'" - ) - - if weights is not None: - if is_column_like(weights): - weights = np.asarray(weights) - else: - raise ValueError( - "Strings can only be passed to weights " - "when sampling from rows on a DataFrame" - ) - - if 
columns.size != len(weights): - raise ValueError( - "Weights and axis to be sampled must be of same length" - ) - - total_weight = weights.sum() - if total_weight != 1: - if not isinstance(weights.dtype, float): - weights = weights.astype("float64") - weights = weights / total_weight - - np.random.seed(random_state) - gather_map = np.random.choice( - columns, size=n, replace=replace, p=weights - ) - - if isinstance(self, cudf.MultiIndex): - # TODO: Need to update this once MultiIndex is refactored, - # should be able to treat it similar to other Frame object - result = cudf.Index(self.to_frame(index=False)[gather_map]) - else: - result = self[gather_map] - if not keep_index: - result.index = None - - return result - @classmethod @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 331457d17ae..3752ab5c843 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1694,6 +1694,202 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + @annotate("SAMPLE", color="orange", domain="cudf_python") + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ignore_index=False, + ): + """Return a random sample of items from an axis of object. + + If reproducible results are required, a random number generator may be + provided via the `random_state` parameter. This function will always + produce the same sample given an identical `random_state`. + + Notes + ----- + When sampling from ``axis=0/'index'``, ``random_state`` can be either + a numpy random state (``numpy.random.RandomState``) or a cupy random + state (``cupy.random.RandomState``). When a numpy random state is + used, the output is guaranteed to match the output of the corresponding + pandas method call, but generating the sample may be slow. 
If exact + pandas equivalence is not required, using a cupy random state will + achieve better performance, especially when sampling large number of + items. It's advised to use the matching `ndarray` type to the random + state for the `weights` array. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if frac = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with n. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + `replace == True` is not supported for axis = 1/"columns". + `replace == False` is not supported for axis = 0/"index" given + `random_state` is `None` or a cupy random state, and `weights` is + specified. + weights : ndarray-like, optional + Default `None` for uniform probability distribution over rows to + sample from. If `ndarray` is passed, the length of `weights` should + equal to the number of rows to sample from, and will be normalized + to have a sum of 1. Unlike pandas, index alignment is not currently + not performed. + random_state : int, numpy/cupy RandomState, or None, default None + If None, default cupy random state is chosen. + If int, the seed for the default cupy random state. + If RandomState, rows-to-sample are generated from the RandomState. + axis : {0 or `index`, 1 or `columns`, None}, default None + Axis to sample. Accepts axis number or name. + Default is stat axis for given data type + (0 for Series and DataFrames). Series doesn't support axis=1. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing n items + randomly sampled from the caller object. 
+ + Examples + -------- + >>> import cudf as cudf + >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) + >>> df.sample(3) + a + 1 2 + 3 4 + 0 1 + + >>> sr = cudf.Series([1, 2, 3, 4, 5]) + >>> sr.sample(10, replace=True) + 1 4 + 3 1 + 2 4 + 0 5 + 0 1 + 4 5 + 4 1 + 0 2 + 0 3 + 3 2 + dtype: int64 + + >>> df = cudf.DataFrame( + ... {"a": [1, 2], "b": [2, 3], "c": [3, 4], "d": [4, 5]} + ... ) + >>> df.sample(2, axis=1) + a c + 0 1 3 + 1 2 4 + """ + axis = self._get_axis_from_axis_arg(axis) + size = self.shape[axis] + + # Compute `n` from parameter `frac`. + if frac is None: + n = 1 if n is None else n + else: + if frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when upsampling the " + "population `frac` > 1." + ) + if n is not None: + raise ValueError( + "Please enter a value for `frac` OR `n`, not both." + ) + n = int(round(size * frac)) + + if n > 0 and size == 0: + raise ValueError( + "Cannot take a sample larger than 0 when axis is empty." + ) + + if isinstance(random_state, cp.random.RandomState): + lib = cp + elif isinstance(random_state, np.random.RandomState): + lib = np + else: + # Construct random state if `random_state` parameter is None or a + # seed. By default, cupy random state is used to sample rows + # and numpy is used to sample columns. This is because row data + # is stored on device, and the column objects are stored on host. + lib = cp if axis == 0 else np + random_state = lib.random.RandomState(seed=random_state) + + # Normalize `weights` array. + if weights is not None: + if isinstance(weights, str): + raise NotImplementedError( + "Weights specified by string is unsupported yet." + ) + + if size != len(weights): + raise ValueError( + "Weights and axis to be sampled must be of same length." 
+ ) + + weights = lib.asarray(weights) + weights = weights / weights.sum() + + if axis == 0: + return self._sample_axis_0( + n, weights, replace, random_state, ignore_index + ) + else: + if isinstance(random_state, cp.random.RandomState): + raise ValueError( + "Sampling from `axis=1`/`columns` with cupy random state" + "isn't supported." + ) + return self._sample_axis_1( + n, weights, replace, random_state, ignore_index + ) + + def _sample_axis_0( + self, + n: int, + weights: Optional[ColumnLike], + replace: bool, + random_state: Union[np.random.RandomState, cp.random.RandomState], + ignore_index: bool, + ): + try: + gather_map_array = random_state.choice( + len(self), size=n, replace=replace, p=weights + ) + except NotImplementedError as e: + raise NotImplementedError( + "Random sampling with cupy does not support these inputs." + ) from e + + return self._gather( + cudf.core.column.as_column(gather_map_array), + keep_index=not ignore_index, + check_bounds=False, + ) + + def _sample_axis_1( + self, + n: int, + weights: Optional[ColumnLike], + replace: bool, + random_state: np.random.RandomState, + ignore_index: bool, + ): + raise NotImplementedError( + f"Sampling from axis 1 is not implemented for {self.__class__}." + ) + def _binaryop( self, other: Any, diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 4d5b5926d6e..4a42d811c80 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,12 +1,17 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+import itertools import os import pathlib +import cupy as cp +import numpy as np import pytest import rmm # noqa: F401 +from cudf.testing._utils import assert_eq + _CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) @@ -15,6 +20,103 @@ def datadir(): return pathlib.Path(__file__).parent / "data" +@pytest.fixture( + params=itertools.product([0, 2, None], [0.3, None]), + ids=lambda arg: f"n={arg[0]}-frac={arg[1]}", +) +def sample_n_frac(request): + """ + Specific to `test_sample*` tests. + """ + n, frac = request.param + if n is not None and frac is not None: + pytest.skip("Cannot specify both n and frac.") + return n, frac + + +def shape_checker(expected, got): + assert expected.shape == got.shape + + +def exact_checker(expected, got): + assert_eq(expected, got) + + +@pytest.fixture( + params=[ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + ], + ids=["None", "IntSeed", "NumpyRandomState"], +) +def random_state_tuple_axis_1(request): + """ + Specific to `test_sample*_axis_1` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState + cudf: None, seed(int), np.random.RandomState + ------ + check: shape, shape, exact result + + Each column above stands for one valid parameter combination and check. + """ + + return request.param + + +@pytest.fixture( + params=[ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + (np.random.RandomState(42), cp.random.RandomState(42), shape_checker), + ], + ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"], +) +def random_state_tuple_axis_0(request): + """ + Specific to `test_sample*_axis_0` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. 
Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState, np.random.RandomState + cudf: None, seed(int), np.random.RandomState, cp.random.RandomState + ------ + check: shape, shape, exact result, shape + + Each column above stands for one valid parameter combination and check. + """ + + return request.param + + +@pytest.fixture(params=[None, "builtin_list", "ndarray"]) +def make_weights_axis_0(request): + """Specific to `test_sample*_axis_0` tests. + Only testing weights array that matches type with random state. + """ + + if request.param is None: + return lambda *_: (None, None) + elif request.param == "builtin-list": + return lambda size, _: ([1] * size, [1] * size) + else: + + def wrapped(size, numpy_weights_for_cudf): + # Uniform distribution, non-normalized + if numpy_weights_for_cudf: + return np.ones(size), np.ones(size) + else: + return np.ones(size), cp.ones(size) + + return wrapped + + # To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use # the sessionstart and sessionfinish hooks rather than a simple autouse, # session-scope fixture because we need to set these variable before collection diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d7a5d07a5fc..5bde75c2e21 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11,6 +11,7 @@ from copy import copy import cupy +import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -7143,120 +7144,165 @@ def test_cudf_arrow_array_error(): sr.__arrow_array__() -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dataframe_sample_basic(n, frac, replace, axis): - # as we currently don't support column with same name - if axis == 1 and 
replace: - return +@pytest.mark.parametrize( + "make_weights_axis_1", + [lambda _: None, lambda s: [1] * s, lambda s: np.ones(s)], +) +def test_sample_axis_1( + sample_n_frac, random_state_tuple_axis_1, make_weights_axis_1 +): + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_1 + pdf = pd.DataFrame( { "a": [1, 2, 3, 4, 5], "float": [0.05, 0.2, 0.3, 0.2, 0.25], "int": [1, 3, 5, 4, 2], }, - index=[1, 2, 3, 4, 5], ) df = cudf.DataFrame.from_pandas(pdf) - random_state = 0 - - try: - pout = pdf.sample( - n=n, - frac=frac, - replace=replace, - random_state=random_state, - axis=axis, - ) - except BaseException: - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - "axis": axis, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - "axis": axis, - }, - ), - ) - else: - gout = df.sample( - n=n, - frac=frac, - replace=replace, - random_state=random_state, - axis=axis, + + weights = make_weights_axis_1(len(pdf.columns)) + + expected = pdf.sample( + n=n, + frac=frac, + replace=False, + random_state=pd_random_state, + weights=weights, + axis=1, + ) + got = df.sample( + n=n, + frac=frac, + replace=False, + random_state=gd_random_state, + weights=weights, + axis=1, + ) + checker(expected, got) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ), + pd.Series([1, 2, 3, 4, 5]), + ], +) +@pytest.mark.parametrize("replace", [True, False]) +def test_sample_axis_0( + pdf, sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 +): + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 + + df = cudf.from_pandas(pdf) + + pd_weights, gd_weights = make_weights_axis_0( + len(pdf), 
isinstance(gd_random_state, np.random.RandomState) + ) + if ( + not replace + and not isinstance(gd_random_state, np.random.RandomState) + and gd_weights is not None + ): + pytest.skip( + "`cupy.random.RandomState` doesn't support weighted sampling " + "without replacement." ) - assert pout.shape == gout.shape + + expected = pdf.sample( + n=n, + frac=frac, + replace=replace, + random_state=pd_random_state, + weights=pd_weights, + axis=0, + ) + + got = df.sample( + n=n, + frac=frac, + replace=replace, + random_state=gd_random_state, + weights=gd_weights, + axis=0, + ) + checker(expected, got) @pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("random_state", [1, np.random.mtrand.RandomState(10)]) -def test_dataframe_reproducibility(replace, random_state): +@pytest.mark.parametrize( + "random_state_lib", [cp.random.RandomState, np.random.RandomState] +) +def test_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - expected = df.sample(1024, replace=replace, random_state=random_state) - out = df.sample(1024, replace=replace, random_state=random_state) + n = 1024 + expected = df.sample(n, replace=replace, random_state=random_state_lib(10)) + out = df.sample(n, replace=replace, random_state=random_state_lib(10)) assert_eq(expected, out) -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -def test_series_sample_basic(n, frac, replace): - psr = pd.Series([1, 2, 3, 4, 5]) - sr = cudf.Series.from_pandas(psr) - random_state = 0 - - try: - pout = psr.sample( - n=n, frac=frac, replace=replace, random_state=random_state - ) - except BaseException: - assert_exceptions_equal( - lfunc=psr.sample, - rfunc=sr.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": 
frac, - "replace": replace, - "random_state": random_state, - }, - ), - ) - else: - gout = sr.sample( - n=n, frac=frac, replace=replace, random_state=random_state +@pytest.mark.parametrize("axis", [0, 1]) +def test_sample_invalid_n_frac_combo(axis): + n, frac = 2, 0.5 + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + rfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + ) + + +@pytest.mark.parametrize("n, frac", [(100, None), (None, 3)]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_oversample_without_replace(n, frac, axis): + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + rfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + ) + + +@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) +def test_sample_unsupported_arguments(random_state): + df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) + with pytest.raises( + NotImplementedError, + match="Random sampling with cupy does not support these inputs.", + ): + df.sample( + n=2, replace=False, random_state=random_state, weights=[1] * 5 ) - assert pout.shape == gout.shape @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index faaa42ac7f8..ffeb40d41f1 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1648,6 +1648,7 @@ def test_index_sample_basic(n, frac, replace): "random_state": random_state, }, ), + compare_error_message=False, ) else: gout = gindex.sample( @@ -1716,6 
+1717,8 @@ def test_multiindex_sample_basic(n, frac, replace, axis): random_state=random_state, axis=axis, ) + if axis == 1 and n is None and frac is None: + pout = pout.iloc[:, 0] assert pout.shape == gout.shape diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3e3c5d1b053..55fcd15f0d5 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1584,6 +1584,12 @@ def test_isin_numeric(data, values): assert_eq(got, expected) +@pytest.mark.xfail(raises=ValueError) +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + gs[0:1] = "d" + + @pytest.mark.parametrize( "data", [