From 09995a5d3f5b1dd83584529e44ccf774d7f6efe2 Mon Sep 17 00:00:00 2001
From: ChrisJar
Date: Wed, 27 Apr 2022 13:13:29 -0500
Subject: [PATCH 01/28] Add bindings for index_of with column search key (#10696)

This adds bindings for `index_of` to enable using `list.index` with a Series of search keys.

Closes #10692

cc: @randerzander

Authors:
  - https://github.com/ChrisJar

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10696
---
 python/cudf/cudf/_lib/cpp/lists/contains.pxd |  5 ++
 python/cudf/cudf/_lib/lists.pyx              | 20 ++++++-
 python/cudf/cudf/core/column/lists.py        | 61 ++++++++++++++++++--
 python/cudf/cudf/tests/test_list.py          | 55 ++++++++++++++++--
 4 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/lists/contains.pxd b/python/cudf/cudf/_lib/cpp/lists/contains.pxd
index 46aea37643f..e3cb01721a0 100644
--- a/python/cudf/cudf/_lib/cpp/lists/contains.pxd
+++ b/python/cudf/cudf/_lib/cpp/lists/contains.pxd
@@ -18,3 +18,8 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
         lists_column_view lists,
         scalar search_key,
     ) except +
+
+    cdef unique_ptr[column] index_of(
+        lists_column_view lists,
+        column_view search_keys,
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index e5a705ab603..025fb0665d3 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -176,7 +176,7 @@ def contains_scalar(Column col, object py_search_key):
     return result


-def index_of(Column col, object py_search_key):
+def index_of_scalar(Column col, object py_search_key):

     cdef DeviceScalar search_key = py_search_key.device_value

@@ -195,6 +195,24 @@ def index_of(Column col, object py_search_key):
     return Column.from_unique_ptr(move(c_result))


+def index_of_column(Column col, Column search_keys):
+
+    cdef column_view keys_view = search_keys.view()
+
+    cdef shared_ptr[lists_column_view] list_view = (
+        make_shared[lists_column_view](col.view())
+    )
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_index_of(
+            list_view.get()[0],
+            keys_view,
+        ))
+    return Column.from_unique_ptr(move(c_result))
+
+
 def concatenate_rows(list source_columns):

     cdef unique_ptr[column] c_result
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index df6aaa91a2b..2964378d114 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -17,7 +17,8 @@
     drop_list_duplicates,
     extract_element_column,
     extract_element_scalar,
-    index_of,
+    index_of_column,
+    index_of_scalar,
     sort_lists,
 )
 from cudf._lib.strings.convert.convert_lists import format_list_column
@@ -463,10 +464,61 @@ def contains(self, search_key: ScalarLike) -> ParentType:
                 raise
         return res

-    def index(self, search_key: ScalarLike) -> ParentType:
-        search_key = cudf.Scalar(search_key)
+    def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType:
+        """
+        Returns integers representing the index of the search key for each row.
+
+        If ``search_key`` is a sequence, it must be the same length as the
+        Series and ``search_key[i]`` represents the search key for the
+        ``i``-th row of the Series.
+
+        If the search key is not contained in a row, -1 is returned. If either
+        the row or the search key is null, <NA> is returned. If the search key
+        is contained multiple times, the smallest matching index is returned.
+
+        Parameters
+        ----------
+        search_key : scalar or sequence of scalars
+            Element or elements being searched for in each row of the list
+            column
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
+        >>> s.list.index(4)
+        0   -1
+        1    1
+        2    0
+        dtype: int32
+
+        >>> s = cudf.Series([["a", "b", "c"], ["x", "y", "z"]])
+        >>> s.list.index(["b", "z"])
+        0    1
+        1    2
+        dtype: int32
+
+        >>> s = cudf.Series([[4, 5, 6], None, [-3, -2, -1]])
+        >>> s.list.index([None, 3, -2])
+        0    <NA>
+        1    <NA>
+        2       1
+        dtype: int32
+        """
+
         try:
-            res = self._return_or_inplace(index_of(self._column, search_key))
+            if is_scalar(search_key):
+                return self._return_or_inplace(
+                    index_of_scalar(self._column, cudf.Scalar(search_key))
+                )
+            else:
+                return self._return_or_inplace(
+                    index_of_column(self._column, as_column(search_key))
+                )
+
         except RuntimeError as e:
             if (
                 "Type/Scale of search key does not "
@@ -474,7 +526,6 @@ def index(self, search_key: ScalarLike) -> ParentType:
             ):
                 raise TypeError(str(e)) from e
             raise
-        return res

     @property
     def leaves(self) -> ParentType:
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index c21e1a0f61f..09eee3520e5 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -11,6 +11,7 @@
 import cudf
 from cudf import NA
 from cudf._lib.copying import get_element
+from cudf.api.types import is_scalar
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -425,7 +426,7 @@ def test_contains_invalid(data, scalar):


 @pytest.mark.parametrize(
-    "data, scalar, expect",
+    "data, search_key, expect",
     [
         (
             [[1, 2, 3], [], [3, 4, 5]],
@@ -448,6 +449,16 @@ def test_contains_invalid(data, scalar):
             "y",
             [3, -1],
         ),
+        (
+            [["h", "a", None], ["t", "g"]],
+            ["a", "b"],
+            [1, -1],
+        ),
+        (
+            [None, ["h", "i"], ["p", "k", "z"]],
+            ["x", None, "z"],
+            [None, None, 2],
+        ),
         (
             [["d", None, "e"], [None, "f"], []],
             cudf.Scalar(cudf.NA, "O"),
@@ -460,15 +471,21 @@ def test_contains_invalid(data, scalar):
         ),
     ],
 )
-def test_index(data, scalar, expect):
+def test_index(data, search_key, expect):
     sr = cudf.Series(data)
     expect = cudf.Series(expect, dtype="int32")
-    got = sr.list.index(cudf.Scalar(scalar, sr.dtype.element_type))
+    if is_scalar(search_key):
+        got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type))
+    else:
+        got = sr.list.index(
+            cudf.Series(search_key, dtype=sr.dtype.element_type)
+        )
+
     assert_eq(expect, got)


 @pytest.mark.parametrize(
-    "data, scalar",
+    "data, search_key",
     [
         (
             [[9, None, 8], [], [7, 6, 5]],
             "c",
         ),
         (
             [["a", "b", "c"], None, [None, "d"]],
             2,
         ),
+        (
+            [["e", "s"], ["t", "w"]],
+            [5, 6],
+        ),
     ],
 )
-def test_index_invalid(data, scalar):
+def test_index_invalid_type(data, search_key):
     sr = cudf.Series(data)
     with pytest.raises(
         TypeError,
         match="Type/Scale of search key does not "
         "match list column element type.",
     ):
-        sr.list.index(scalar)
+        sr.list.index(search_key)
+
+
+@pytest.mark.parametrize(
+    "data, search_key",
+    [
+        (
+            [[5, 8], [2, 6]],
+            [8, 2, 4],
+        ),
+        (
+            [["h", "j"], ["p", None], ["t", "z"]],
+            ["j", "a"],
+        ),
+    ],
+)
+def test_index_invalid_length(data, search_key):
+    sr = cudf.Series(data)
+    with pytest.raises(
+        RuntimeError,
+        match="Number of search keys must match list column size.",
+    ):
+        sr.list.index(search_key)


 @pytest.mark.parametrize(

From 1f8a03e69704562dfac38de40b7172650280c6ea Mon Sep 17 00:00:00 2001
From: Karthikeyan
<6488848+karthikeyann@users.noreply.github.com> Date: Wed, 27 Apr 2022 23:57:20 +0530 Subject: [PATCH 02/28] Replace std::make_pair with std::pair (C++17 CTAD) (#10727) Addresses part of https://github.com/rapidsai/cudf/issues/10350 Take advantage of C++17 feature CTAD. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10727 --- cpp/benchmarks/reduction/segment_reduce.cu | 2 +- cpp/docs/DEVELOPER_GUIDE.md | 4 ++-- cpp/include/cudf/detail/null_mask.cuh | 2 +- cpp/include/cudf/detail/valid_if.cuh | 2 +- cpp/include/cudf/strings/detail/utilities.cuh | 2 +- .../cudf/table/experimental/row_operators.cuh | 13 ++++++------- cpp/include/cudf_test/column_wrapper.hpp | 16 ++++++++-------- cpp/src/bitmask/null_mask.cu | 8 ++++---- cpp/src/copying/contiguous_split.cu | 4 ++-- cpp/src/groupby/groupby.cu | 17 ++++++++--------- cpp/src/groupby/hash/groupby.cu | 2 +- cpp/src/groupby/sort/aggregate.cpp | 2 +- cpp/src/groupby/sort/group_collect.cu | 7 +++---- cpp/src/groupby/sort/scan.cpp | 2 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 6 +++--- cpp/src/io/orc/reader_impl.cu | 18 +++++++++--------- cpp/src/io/parquet/chunk_dict.cu | 2 +- cpp/src/io/parquet/writer_impl.cu | 8 ++++---- cpp/src/join/conditional_join.cu | 16 ++++++++-------- cpp/src/join/hash_join.cu | 12 ++++++------ cpp/src/join/join.cu | 2 +- cpp/src/join/join_utils.cu | 4 ++-- cpp/src/join/mixed_join.cu | 16 ++++++++-------- .../lists/combine/concatenate_list_elements.cu | 2 +- cpp/src/lists/copying/scatter_helper.cu | 6 +++--- cpp/src/partitioning/partitioning.cu | 11 +++++------ cpp/src/partitioning/round_robin.cu | 10 +++++----- cpp/src/replace/clamp.cu | 2 +- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/json/json_path.cu | 4 ++-- cpp/src/strings/repeat_strings.cu | 6 +++--- cpp/src/structs/utilities.cpp | 6 +++--- cpp/src/text/subword/data_normalizer.cu | 8 ++++---- cpp/src/transform/bools_to_mask.cu | 8 ++++---- cpp/src/transform/encode.cu | 2 +- cpp/src/transform/nans_to_nulls.cu | 8 +++----- cpp/src/transform/one_hot_encode.cu | 10 ++++------ cpp/src/transpose/transpose.cu | 4 ++-- cpp/tests/groupby/m2_tests.cpp | 5 ++--- cpp/tests/groupby/merge_lists_tests.cpp | 5 ++--- cpp/tests/groupby/merge_m2_tests.cpp | 12 +++++------- cpp/tests/groupby/merge_sets_tests.cpp | 5 ++--- cpp/tests/interop/to_arrow_test.cpp | 2 +- cpp/tests/join/conditional_join_tests.cu | 6 +++--- cpp/tests/join/join_tests.cpp | 2 +- cpp/tests/join/mixed_join_tests.cu | 6 +++--- cpp/tests/merge/merge_test.cpp | 4 ++-- cpp/tests/search/search_struct_test.cpp | 2 +- .../stream_compaction/distinct_count_tests.cpp | 2 +- .../stream_compaction/unique_count_tests.cpp | 2 +- cpp/tests/strings/translate_tests.cpp | 2 +- 51 files changed, 149 insertions(+), 162 deletions(-) diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segment_reduce.cu index 3723147d95c..08fc4622b43 100644 --- a/cpp/benchmarks/reduction/segment_reduce.cu +++ b/cpp/benchmarks/reduction/segment_reduce.cu @@ -82,7 +82,7 @@ std::pair, thrust::device_vector> make_test_d thrust::device_vector d_offsets(offset_it, offset_it + num_segments + 1); - return std::make_pair(std::move((input->release())[0]), d_offsets); + return std::pair(std::move((input->release())[0]), d_offsets); } template diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 
165edd443f6..84f69f559a8 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -572,7 +572,7 @@ The preferred style for how inputs are passed in and outputs are returned is the Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be done in C++ (including creating a `struct` for the output). One convenient way to do this is -using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke +using `std::tie` and `std::pair`. Note that objects passed to `std::pair` will invoke either the copy constructor or the move constructor of the object, and it may be preferable to move non-trivially copyable objects (and required for types with deleted copy constructors, like `std::unique_ptr`). @@ -585,7 +585,7 @@ std::pair return_two_tables(void){ // Do stuff with out0, out1 // Return a std::pair of the two outputs - return std::make_pair(std::move(out0), std::move(out1)); + return std::pair(std::move(out0), std::move(out1)); } cudf::table out0; diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 7aec56fdc51..6a6cdd43004 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -133,7 +133,7 @@ std::pair bitmask_binop( stream, mr); - return std::make_pair(std::move(dest_mask), null_count); + return std::pair(std::move(dest_mask), null_count); } /** diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index aa4421bb4ed..f91f51b2161 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -110,7 +110,7 @@ std::pair valid_if( null_count = size - valid_count.value(stream); } - return std::make_pair(std::move(null_mask), null_count); + return std::pair(std::move(null_mask), null_count); } /** diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index bb7f29a4172..e6dba5147b5 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -156,7 +156,7 @@ auto make_strings_children( for_each_fn(size_and_exec_fn); } - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } /** diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 88e31744fdf..32b71e660ac 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -165,14 +165,13 @@ class device_row_comparator { bool const rhs_is_null{_rhs.is_null(rhs_element_index)}; if (lhs_is_null or rhs_is_null) { // at least one is null - return cuda::std::make_pair(null_compare(lhs_is_null, rhs_is_null, _null_precedence), - _depth); + return cuda::std::pair(null_compare(lhs_is_null, rhs_is_null, _null_precedence), _depth); } } - return cuda::std::make_pair(relational_compare(_lhs.element(lhs_element_index), - _rhs.element(rhs_element_index)), - std::numeric_limits::max()); + return cuda::std::pair(relational_compare(_lhs.element(lhs_element_index), + _rhs.element(rhs_element_index)), + std::numeric_limits::max()); } template {{}, -1}, - [](auto acc, lists_column_wrapper const& lcw) { - return lcw.depth > acc.second ? 
std::make_pair(lcw.get_view(), lcw.depth) : acc; - }); + auto const hierarchy_and_depth = + std::accumulate(elements.begin(), + elements.end(), + std::pair{{}, -1}, + [](auto acc, lists_column_wrapper const& lcw) { + return lcw.depth > acc.second ? std::pair(lcw.get_view(), lcw.depth) : acc; + }); column_view expected_hierarchy = hierarchy_and_depth.first; int32_t const expected_depth = hierarchy_and_depth.second; diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 756cf3421c9..ec14f8e6ded 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -445,7 +445,7 @@ std::pair bitmask_and(table_view const& view, CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } std::vector masks; @@ -467,7 +467,7 @@ std::pair bitmask_and(table_view const& view, mr); } - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } // Returns the bitwise OR of the null masks of all columns in the table view @@ -478,7 +478,7 @@ std::pair bitmask_or(table_view const& view, CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } std::vector masks; @@ -500,7 +500,7 @@ std::pair bitmask_or(table_view const& view, mr); } - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } } // namespace detail diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 514374d450d..35e7eba974f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -688,9 +688,9 @@ BufInfo build_output_columns(InputIter begin, ? 
0 : (current_info->num_rows - current_info->valid_count); ++current_info; - return std::make_pair(ptr, null_count); + return std::pair(ptr, null_count); } - return std::make_pair(static_cast(nullptr), 0); + return std::pair(static_cast(nullptr), 0); }(); // size/data pointer for the column diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 57bb222aaa0..79882239b38 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -83,8 +83,7 @@ std::pair, std::vector> groupby::disp "Unsupported groupby key type does not support equality comparison"); auto [grouped_keys, results] = detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr); - return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys), - std::move(results)); + return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results)); } else { return sort_aggregate(requests, stream, mr); } @@ -193,7 +192,7 @@ std::pair, std::vector> groupby::aggr verify_valid_requests(requests); - if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } + if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } return dispatch_aggregation(requests, rmm::cuda_stream_default, mr); } @@ -211,7 +210,7 @@ std::pair, std::vector> groupby::scan verify_valid_requests(requests); - if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } + if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } return sort_scan(requests, rmm::cuda_stream_default, mr); } @@ -250,7 +249,7 @@ std::pair, std::unique_ptr> groupby::replace_nulls CUDF_EXPECTS(static_cast(replace_policies.size()) == values.num_columns(), "Size mismatch between num_columns and replace_policies."); - if (values.is_empty()) { return std::make_pair(empty_like(_keys), empty_like(values)); } + if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); } auto const stream = rmm::cuda_stream_default; auto const& group_labels = helper().group_labels(stream); @@ -269,8 +268,8 @@ std::pair, std::unique_ptr
> groupby::replace_nulls
                     : std::move(grouped_values);
   });

-  return std::make_pair(std::move(helper().sorted_keys(stream, mr)),
-                        std::make_unique<table>(std::move(results)));
+  return std::pair(std::move(helper().sorted_keys(stream, mr)),
+                   std::make_unique<table>(std::move(results)));
 }

 // Get the sort helper object
@@ -310,8 +309,8 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
     grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr);
   });

-  return std::make_pair(helper().sorted_keys(stream, mr),
-                        std::make_unique<table>(std::move(results)));
+  return std::pair(helper().sorted_keys(stream, mr),
+                   std::make_unique<table>(std::move(results)));
 }

 }  // namespace groupby
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index f225afaec71..e22b3a4f3a4 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -672,7 +672,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   std::unique_ptr<table>
unique_keys = groupby(keys, requests, &cache, has_nulls(keys), include_null_keys, stream, mr); - return std::make_pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); + return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } } // namespace hash } // namespace detail diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 4904aa42723..02036ff0bbf 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -778,7 +778,7 @@ std::pair, std::vector> groupby::sort auto results = detail::extract_results(requests, cache, stream, mr); - return std::make_pair(helper().unique_keys(stream, mr), std::move(results)); + return std::pair(helper().unique_keys(stream, mr), std::move(results)); } } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index 8b8a03f35a5..000a595ea2f 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -82,8 +82,7 @@ std::pair, std::unique_ptr> purge_null_entries( auto null_purged_offsets = strings::detail::make_offsets_child_column( null_purged_sizes.cbegin(), null_purged_sizes.cend(), stream, mr); - return std::make_pair, std::unique_ptr>( - std::move(null_purged_values), std::move(null_purged_offsets)); + return std::pair(std::move(null_purged_values), std::move(null_purged_offsets)); } std::unique_ptr group_collect(column_view const& values, @@ -109,8 +108,8 @@ std::unique_ptr group_collect(column_view const& values, return cudf::groupby::detail::purge_null_entries( values, offsets_column->view(), num_groups, stream, mr); } else { - return std::make_pair(std::make_unique(values, stream, mr), - std::move(offsets_column)); + return std::pair(std::make_unique(values, stream, mr), + std::move(offsets_column)); } }(); diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 8c4959da35b..20edc1b3f50 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -185,7 +185,7 @@ std::pair, std::vector> groupby::sort auto results = detail::extract_results(requests, cache, stream, mr); - return std::make_pair(helper().sorted_keys(stream, mr), std::move(results)); + return std::pair(helper().sorted_keys(stream, mr), std::move(results)); } } // namespace groupby } // namespace cudf diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index a4ae9999a19..47244279599 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -177,7 +177,7 @@ std::vector aggregate_orc_metadata::select_stri per_file_metadata[src_file_idx].ff.stripes.size()), "Invalid stripe index"); stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; } selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); @@ -206,7 +206,7 @@ std::vector aggregate_orc_metadata::select_stri count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; if (count > row_start || count == 0) { stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); } else { stripe_skip_rows = count; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 83c23774362..a768d568178 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -108,20 +108,20 @@ constexpr std::pair get_index_type_and_pos( case orc::DATA: skip_count += 1; skip_count |= (skip_count & 0xff) << 8; - return std::make_pair(gpu::CI_DATA, skip_count); + return std::pair(gpu::CI_DATA, skip_count); case orc::LENGTH: case orc::SECONDARY: skip_count += 1; skip_count |= (skip_count & 0xff) << 16; - return std::make_pair(gpu::CI_DATA2, skip_count); - case orc::DICTIONARY_DATA: return std::make_pair(gpu::CI_DICTIONARY, skip_count); + return std::pair(gpu::CI_DATA2, skip_count); + case orc::DICTIONARY_DATA: return std::pair(gpu::CI_DICTIONARY, skip_count); case orc::PRESENT: skip_count += (non_child ? 1 : 0); - return std::make_pair(gpu::CI_PRESENT, skip_count); - case orc::ROW_INDEX: return std::make_pair(gpu::CI_INDEX, skip_count); + return std::pair(gpu::CI_PRESENT, skip_count); + case orc::ROW_INDEX: return std::pair(gpu::CI_INDEX, skip_count); default: // Skip this stream as it's not strictly required - return std::make_pair(gpu::CI_NUM_STREAMS, 0); + return std::pair(gpu::CI_NUM_STREAMS, 0); } } @@ -1120,9 +1120,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] .source->is_device_read_preferred(len)) { read_tasks.push_back( - std::make_pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, stream), - len)); + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, stream), + len)); } else { const auto buffer = diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9075a319ab3..93e76a6ac23 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -75,7 +75,7 @@ struct map_insert_fn { if constexpr (column_device_view::has_element_accessor()) { auto hash_fn = hash_functor{col}; auto equality_fn = equality_functor{col}; - return map.insert(std::make_pair(i, i), hash_fn, equality_fn); + return map.insert(std::pair(i, i), hash_fn, equality_fn); } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 92d436e4566..75a50714407 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -876,7 +876,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, std::vector> dict_data; std::vector> 
dict_index; - if (h_chunks.size() == 0) { return std::make_pair(std::move(dict_data), std::move(dict_index)); } + if (h_chunks.size() == 0) { return std::pair(std::move(dict_data), std::move(dict_index)); } // Allocate slots for each chunk std::vector> hash_maps_storage; @@ -912,7 +912,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking // bitsize we efficiently support - if (nbits > 16) { return std::make_pair(false, 0); } + if (nbits > 16) { return std::pair(false, 0); } // Only these bit sizes are allowed for RLE encoding because it's compute optimized constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16}; @@ -925,7 +925,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, bool use_dict = (ck.plain_data_size > dict_enc_size); if (not use_dict) { rle_bits = 0; } - return std::make_pair(use_dict, rle_bits); + return std::pair(use_dict, rle_bits); }(); } @@ -946,7 +946,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, gpu::collect_map_entries(chunks.device_view().flat_view(), stream); gpu::get_dictionary_indices(frags, stream); - return std::make_pair(std::move(dict_data), std::move(dict_index)); + return std::pair(std::move(dict_data), std::move(dict_index)); } void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 9bf7e6a7a43..ae1561b422b 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -59,8 +59,8 @@ conditional_join(table_view const& left, // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { @@ -70,12 +70,12 @@ conditional_join(table_view const& left, case join_kind::LEFT_ANTI_JOIN: case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { auto ret_flipped = get_trivial_left_join_indices(right, stream); - return std::make_pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); + return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; } @@ -139,8 +139,8 @@ conditional_join(table_view const& left, // all other cases (inner, left semi, and left anti joins) if we reach this // point we can safely return an empty result. 
if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } rmm::device_scalar write_index(0, stream); @@ -176,7 +176,7 @@ conditional_join(table_view const& left, swap_tables); } - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to // by any row in the left table. diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 086e1e49986..8d2888fd761 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -44,7 +44,7 @@ std::pair, std::unique_ptr
> get_empty_joined_table
 {
   std::unique_ptr<table> empty_probe = empty_like(probe);
   std::unique_ptr<table>
empty_build = empty_like(build); - return std::make_pair(std::move(empty_probe), std::move(empty_build)); + return std::pair(std::move(empty_probe), std::move(empty_build)); } /** @@ -88,8 +88,8 @@ probe_join_hash_table(cudf::table_device_view build_table, // If output size is zero, return immediately if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } auto left_indices = std::make_unique>(join_size, stream, mr); @@ -125,7 +125,7 @@ probe_join_hash_table(cudf::table_device_view build_table, hash_table.pair_retrieve( iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); } - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return std::pair(std::move(left_indices), std::move(right_indices)); } /** @@ -390,8 +390,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, "Mismatch in number of columns to be joined on"); if (is_trivial_join(flattened_probe_table, _build, JoinKind)) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } CUDF_EXPECTS(std::equal(std::cbegin(_build), diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7a478ca2eb3..15aed83b641 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -52,7 +52,7 @@ inner_join(table_view const& left_input, if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); auto [right_result, left_result] = hj_obj.inner_join(right, std::nullopt, stream, mr); - return std::make_pair(std::move(left_result), std::move(right_result)); + return std::pair(std::move(left_result), std::move(right_result)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); return hj_obj.inner_join(left, std::nullopt, stream, mr); diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 151db830962..1eb2d4cf4a7 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -61,7 +61,7 @@ get_trivial_left_join_indices(table_view const& left, std::make_unique>(left.num_rows(), stream, mr); thrust::uninitialized_fill( rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return std::pair(std::move(left_indices), std::move(right_indices)); } VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) @@ -151,7 +151,7 @@ get_left_join_indices_complement(std::unique_ptr> left_invalid_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); + return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); } } // namespace detail diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index f9cbb2b5441..b540c013f47 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -81,8 +81,8 @@ mixed_join( case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream); // Inner joins return empty output because no matches can exist. 
case join_kind::INNER_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { @@ -90,12 +90,12 @@ mixed_join( // Left and inner joins all return empty sets. case join_kind::LEFT_JOIN: case join_kind::INNER_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream); - return std::make_pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); + return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; } @@ -208,8 +208,8 @@ mixed_join( // all other cases (inner, left semi, and left anti joins) if we reach this // point we can safely return an empty result. if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } // Given the number of matches per row, we need to compute the offsets for insertion. @@ -258,7 +258,7 @@ mixed_join( swap_tables); } - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to // by any row in the left table. diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fecdec0b1b2..f4d8e7678b1 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -81,7 +81,7 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto [null_mask, null_count] = [&] { if (!build_null_mask) - return std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + return std::pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); // The output row will be null only if all lists on the input row are null. auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index fecf6e1c1a1..7220e8b5980 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -175,7 +175,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(), num_child_rows, @@ -348,7 +348,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? 
construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); return cudf::make_lists_column(num_child_rows, std::move(child_offsets), @@ -444,7 +444,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); return cudf::make_structs_column(num_child_rows, std::move(child_columns), diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 09f07a1ca8c..0371065a2e5 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -595,8 +595,7 @@ std::pair, std::vector> hash_partition_table( } stream.synchronize(); // Async D2H copy must finish before returning host vec - return std::make_pair(std::make_unique
(std::move(output_cols)),
-                          std::move(partition_offsets));
+    return std::pair(std::make_unique<table>
(std::move(output_cols)), std::move(partition_offsets)); } else { // Compute a scatter map from input to output such that the output rows are // sorted by partition number @@ -613,7 +612,7 @@ std::pair, std::vector> hash_partition_table( input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); stream.synchronize(); // Async D2H copy must finish before returning host vec - return std::make_pair(std::move(output), std::move(partition_offsets)); + return std::pair(std::move(output), std::move(partition_offsets)); } } @@ -700,7 +699,7 @@ struct dispatch_map_type { auto scattered = cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr); - return std::make_pair(std::move(scattered), std::move(partition_offsets)); + return std::pair(std::move(scattered), std::move(partition_offsets)); } template @@ -728,7 +727,7 @@ std::pair, std::vector> hash_partition( // Return empty result if there are no partitions or nothing to hash if (num_partitions <= 0 || input.num_rows() == 0 || table_to_hash.num_columns() == 0) { - return std::make_pair(empty_like(input), std::vector{}); + return std::pair(empty_like(input), std::vector{}); } if (has_nulls(table_to_hash)) { @@ -753,7 +752,7 @@ std::pair, std::vector> partition( CUDF_EXPECTS(not partition_map.has_nulls(), "Unexpected null values in partition_map."); if (num_partitions == 0 or t.num_rows() == 0) { - return std::make_pair(empty_like(t), std::vector{}); + return std::pair(empty_like(t), std::vector{}); } return cudf::type_dispatcher( diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 193bb5a4353..9cfad602db0 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -104,8 +104,8 @@ std::pair, std::vector> degenerate stream, mr); - return std::make_pair(std::move(uniq_tbl), - cudf::detail::make_std_vector_sync(partition_offsets, stream)); + return std::pair(std::move(uniq_tbl), + cudf::detail::make_std_vector_sync(partition_offsets, stream)); } else { //( num_partitions > nrows ) rmm::device_uvector d_row_indices(nrows, stream); @@ -140,8 +140,8 @@ std::pair, std::vector> degenerate nedges_iter_begin + num_partitions, partition_offsets.begin()); - return std::make_pair(std::move(uniq_tbl), - cudf::detail::make_std_vector_sync(partition_offsets, stream)); + return std::pair(std::move(uniq_tbl), + cudf::detail::make_std_vector_sync(partition_offsets, stream)); } } } // namespace @@ -230,7 +230,7 @@ std::pair, std::vector> round_robin_part auto uniq_tbl = cudf::detail::gather( input, iter_begin, iter_begin + nrows, cudf::out_of_bounds_policy::DONT_CHECK, stream, mr); - auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); + auto ret_pair = std::pair(std::move(uniq_tbl), std::vector(num_partitions)); // this has the effect of rotating the set of partition sizes // right by start_partition positions: diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 8b696854c25..73b224b0c99 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -76,7 +76,7 @@ std::pair, std::unique_ptr> form_offsets_and_cha cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } template diff --git 
a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 70a6252e9b3..9473bed963e 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1086,7 +1086,7 @@ struct dispatch_from_timestamps_fn { thrust::make_counting_iterator(0), d_timestamps.size(), pfn); - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } template diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 30e8770c3c2..995b6223ddc 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -670,8 +670,8 @@ std::pair>, int> build_comma auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; return is_empty - ? std::make_pair(thrust::nullopt, 0) - : std::make_pair( + ? std::pair(thrust::nullopt, 0) + : std::pair( thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), max_stack_depth); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index d496b46bc36..7a3e0fb0243 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -283,7 +283,7 @@ auto make_strings_children(Func fn, for_each_fn(fn); } - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } } // namespace @@ -345,7 +345,7 @@ std::pair, int64_t> repeat_strings_output_sizes( auto const strings_count = input.size(); if (strings_count == 0) { - return std::make_pair(make_empty_column(type_to_id()), int64_t{0}); + return std::pair(make_empty_column(type_to_id()), int64_t{0}); } auto output_sizes = make_numeric_column( @@ -374,7 +374,7 @@ std::pair, int64_t> repeat_strings_output_sizes( int64_t{0}, thrust::plus{}); - return std::make_pair(std::move(output_sizes), total_bytes); + return std::pair(std::move(output_sizes), total_bytes); } } // namespace detail diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 852a32bed3d..a2c173cae5f 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -371,7 +371,7 @@ std::tuple> superimpose_paren auto [new_child_mask, null_count] = [&] { if (not child.nullable()) { // Adopt parent STRUCT's null mask. - return std::make_pair(structs_column.null_mask(), 0); + return std::pair(structs_column.null_mask(), 0); } // Both STRUCT and child are nullable. AND() for the child's new null mask. 
@@ -387,8 +387,8 @@ std::tuple> superimpose_paren stream, mr); ret_validity_buffers.push_back(std::move(new_mask)); - return std::make_pair( - reinterpret_cast(ret_validity_buffers.back().data()), null_count); + return std::pair(reinterpret_cast(ret_validity_buffers.back().data()), + null_count); }(); return cudf::column_view( diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 2ed59c3ae0c..71f9e3f7043 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -278,8 +278,8 @@ uvector_pair data_normalizer::normalize(char const* d_strings, rmm::cuda_stream_view stream) const { if (num_strings == 0) - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); // copy offsets to working memory size_t const num_offsets = num_strings + 1; @@ -294,8 +294,8 @@ uvector_pair data_normalizer::normalize(char const* d_strings, }); uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream); if (bytes_count == 0) // if no bytes, nothing to do - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); cudf::detail::grid_1d const grid{static_cast(bytes_count), THREADS_PER_BLOCK, 1}; size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks; diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index 2cf4771890b..a1f49a5685f 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ std::pair, cudf::size_type> bools_to_mask( { CUDF_EXPECTS(input.type().id() == type_id::BOOL8, "Input is not of type bool"); - if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } + if (input.is_empty()) { return std::pair(std::make_unique(), 0); } auto input_device_view_ptr = column_device_view::create(input, stream); auto input_device_view = *input_device_view_ptr; @@ -45,12 +45,12 @@ std::pair, cudf::size_type> bools_to_mask( auto mask = detail::valid_if(input_begin, input_begin + input.size(), pred, stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } else { auto mask = detail::valid_if( input_device_view.begin(), input_device_view.end(), pred, stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 04821b09eab..60769665fca 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -57,7 +57,7 @@ std::pair, std::unique_ptr> encode( auto indices_column = cudf::detail::lower_bound( sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); + return std::pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index ee63e6d366f..42d41b44779 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -53,8 +53,7 @@ struct dispatch_nan_to_null { stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), - mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } else { auto pred = [input_device_view] __device__(cudf::size_type idx) { return not(std::isnan(input_device_view.element(idx))); @@ -66,8 +65,7 @@ struct dispatch_nan_to_null { stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), - mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } } @@ -85,7 +83,7 @@ struct dispatch_nan_to_null { std::pair, cudf::size_type> nans_to_nulls( column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } + if (input.is_empty()) { return std::pair(std::make_unique(), 0); } return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, stream, mr); } diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 16aee349bb5..b1a8858f847 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -89,7 +89,7 @@ struct one_hot_encode_launcher { auto views = cudf::split(all_encodings->view(), split_indices); table_view encodings_view{views}; - return std::make_pair(std::move(all_encodings), encodings_view); + return std::pair(std::move(all_encodings), encodings_view); } template , table_view> one_hot_encode(column_view const& { CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories."); - if (categories.is_empty()) { - return std::make_pair(make_empty_column(type_id::BOOL8), table_view{}); - } + if (categories.is_empty()) { return std::pair(make_empty_column(type_id::BOOL8), table_view{}); } if (input.is_empty()) { auto empty_data = make_empty_column(type_id::BOOL8); std::vector views(categories.size(), empty_data->view()); - return std::make_pair(std::move(empty_data), table_view{views}); + return std::pair(std::move(empty_data), table_view{views}); } return type_dispatcher(input.type(), one_hot_encode_launcher{}, input, categories, stream, mr); diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index b5b00b11a0f..a87cf60a252 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -37,7 +37,7 @@ std::pair, table_view> transpose(table_view const& input { // If there are no rows in the input, return successfully if (input.num_columns() == 0 || input.num_rows() == 0) { - return std::make_pair(std::make_unique(), table_view{}); + return std::pair(std::make_unique(), table_view{}); } // Check datatype homogeneity @@ -54,7 +54,7 @@ std::pair, table_view> transpose(table_view const& input auto splits = std::vector(splits_iter, splits_iter + input.num_rows() - 1); auto output_column_views = split(output_column->view(), splits, stream); - return std::make_pair(std::move(output_column), table_view(output_column_views)); + return std::pair(std::move(output_column), table_view(output_column_views)); } } // namespace detail diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index be7d6c1ce05..6f5a04e3752 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,8 +48,7 @@ auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 7c24c6267ca..593bb7c50af 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,8 +47,7 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 60067e78022..79ffebf146c 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,10 +67,9 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co auto [out_keys, out_results] = gb_obj.aggregate(requests); auto const num_output_rows = out_keys->num_rows(); - return std::make_pair( - std::move(out_keys->release()[0]), - cudf::make_structs_column( - num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); + return std::pair(std::move(out_keys->release()[0]), + cudf::make_structs_column( + num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); } /** @@ -93,8 +92,7 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 1e2f0c9fa9e..57f67f6b81a 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,8 +47,7 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index d1dc60119b6..4b481ade83f 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -148,7 +148,7 @@ std::pair, std::shared_ptr> get_table auto schema = std::make_shared(schema_vector); - return std::make_pair( + return std::pair( std::make_unique(std::move(columns)), arrow::Table::Make( schema, {int64array, string_array, dict_array, boolarray, list_array, struct_array})); diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 73b355d496d..13852027bf0 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -93,7 +93,7 @@ std::pair, std::vector> gen_random_repeated_columns( std::mt19937 gen(rd()); std::shuffle(left.begin(), left.end(), gen); std::shuffle(right.begin(), right.end(), gen); - return std::make_pair(std::move(left), std::move(right)); + return std::pair(std::move(left), std::move(right)); } // Generate a single pair of left/right nullable columns of random data @@ -120,8 +120,8 @@ gen_random_nullable_repeated_columns(unsigned int N = 10000, unsigned int num_re return uniform_dist(gen) > 0.5; }); - return std::make_pair(std::make_pair(std::move(left), std::move(left_nulls)), - std::make_pair(std::move(right), std::move(right_nulls))); + return std::pair(std::pair(std::move(left), std::move(left_nulls)), + std::pair(std::move(right), std::move(right_nulls))); } } // namespace diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index f560ce7f20c..8ed50c8fb39 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -67,7 +67,7 @@ struct JoinTest : public cudf::test::BaseFixture { auto gold_sort_order = cudf::sorted_order(gold); auto sorted_gold = cudf::gather(gold, *gold_sort_order); - return std::make_pair(std::move(sorted_gold), std::move(sorted_result)); + return std::pair(std::move(sorted_gold), std::move(sorted_result)); } }; diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index df5b1f5c14a..edcf1d1be27 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -94,7 +94,7 @@ std::pair, std::vector> gen_random_repeated_columns( std::mt19937 gen(rd()); std::shuffle(left.begin(), left.end(), gen); std::shuffle(right.begin(), right.end(), gen); - return std::make_pair(std::move(left), std::move(right)); + return std::pair(std::move(left), std::move(right)); } // Generate a single pair of left/right nullable columns of random data @@ -121,8 +121,8 @@ gen_random_nullable_repeated_columns(unsigned int N = 10000, unsigned int num_re return uniform_dist(gen) > 0.5; }); - return std::make_pair(std::make_pair(std::move(left), std::move(left_nulls)), - std::make_pair(std::move(right), std::move(right_nulls))); + return std::pair(std::pair(std::move(left), std::move(left_nulls)), + std::pair(std::move(right), std::move(right_nulls))); } } // namespace diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 
ea26cad3b59..129d1ad66f3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -652,8 +652,8 @@ TYPED_TEST(MergeTest_, NMerge1KeyColumns) std::vector> facts{}; std::vector tables{}; for (int i = 0; i < num_tables; ++i) { - facts.emplace_back(std::make_pair(PairT0(sequence0, sequence0 + inputRows), - PairT1(sequence1, sequence1 + inputRows))); + facts.emplace_back(std::pair(PairT0(sequence0, sequence0 + inputRows), + PairT1(sequence1, sequence1 + inputRows))); tables.push_back(cudf::table_view{{facts.back().first, facts.back().second}}); } std::vector key_cols{0}; diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index a1f0b1d81cf..159b082890a 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -57,7 +57,7 @@ auto search_bounds(cudf::column_view const& t_col_view, auto const values = cudf::table_view{std::vector{values_col->view()}}; auto result_lower_bound = cudf::lower_bound(t, values, column_orders, null_precedence); auto result_upper_bound = cudf::upper_bound(t, values, column_orders, null_precedence); - return std::make_pair(std::move(result_lower_bound), std::move(result_upper_bound)); + return std::pair(std::move(result_lower_bound), std::move(result_upper_bound)); } auto search_bounds(std::unique_ptr const& t_col, diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 0529539c4b2..31bbd43c78d 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -71,7 +71,7 @@ TYPED_TEST(TypedDistinctCount, TableNoNull) std::vector> pair_input; std::transform( input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); + return std::pair(a, b); }); cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 3285cd1a711..591fe042592 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -71,7 +71,7 @@ TYPED_TEST(TypedUniqueCount, TableNoNull) std::vector> pair_input; std::transform( input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); + return std::pair(a, b); }); cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); diff --git a/cpp/tests/strings/translate_tests.cpp b/cpp/tests/strings/translate_tests.cpp index e928065dca4..53c6982b880 100644 --- a/cpp/tests/strings/translate_tests.cpp +++ b/cpp/tests/strings/translate_tests.cpp @@ -38,7 +38,7 @@ std::pair make_entry(const char* from, const c cudf::char_utf8 out = 0; cudf::strings::detail::to_char_utf8(from, in); if (to) cudf::strings::detail::to_char_utf8(to, out); - return std::make_pair(in, out); + return std::pair(in, out); } TEST_F(StringsTranslateTest, Translate) From 3d92bf257bcfb46fe5386821d7f81d4b9f4e6dd5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 27 Apr 2022 17:55:33 -0400 Subject: [PATCH 03/28] Fix scatter for all-empty-string column case (#10724) Closes #10717 Fixes bug introduced with changes in #10673 which uses the `cudf::make_strings_column` that accepts a span of `string_view` objects with a null-placeholder. 
The placeholder can be unintentionally created in `create_string_vector_from_column` when given a strings column where all the rows are empty. The utility is fixed to prevent creating the placeholder for empty strings. A gtest was added to scatter from/to an all-empty strings column to verify this behavior. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/10724 --- cpp/include/cudf/strings/detail/scatter.cuh | 7 ++++++- cpp/src/lists/copying/scatter_helper.cu | 15 +++++++++----- cpp/src/strings/utilities.cu | 22 +++++++++++++-------- cpp/tests/copying/scatter_tests.cpp | 13 +++++++++++- 4 files changed, 42 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index f167206f36b..cfede60c771 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -67,8 +67,13 @@ std::unique_ptr scatter( // create vector of string_view's to scatter into rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); + // this ensures empty strings are not mapped to nulls in the make_strings_column function + auto const size = thrust::distance(begin, end); + auto itr = thrust::make_transform_iterator( + begin, [] __device__(string_view const sv) { return sv.empty() ? string_view{} : sv; }); + // do the scatter - thrust::scatter(rmm::exec_policy(stream), begin, end, scatter_map, target_vector.begin()); + thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin()); // build the output column auto sv_span = cudf::device_span(target_vector); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index 7220e8b5980..38f738b4035 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -232,6 +232,8 @@ struct list_child_constructor { auto string_views = rmm::device_uvector(num_child_rows, stream); + auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function + thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -241,7 +243,8 @@ struct list_child_constructor { offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, - target_lists] __device__(auto index) { + target_lists, + null_string_view] __device__(auto index) { auto const list_index_iter = thrust::upper_bound(thrust::seq, offset_begin, offset_begin + offset_size, index); auto const list_index = @@ -254,14 +257,16 @@ struct list_child_constructor { auto child_strings_column = lists_column.child(); auto strings_offset = lists_offsets_ptr[row_index] + intra_index; - return child_strings_column.is_null(strings_offset) - ? string_view{nullptr, 0} - : child_strings_column.template element(strings_offset); + if (child_strings_column.is_null(strings_offset)) { return null_string_view; } + auto const d_str = child_strings_column.template element(strings_offset); + // ensure a string from an all-empty column is not mapped to the null placeholder + auto const empty_string_view = string_view{}; + return d_str.empty() ? empty_string_view : d_str; }); // string_views should now have been populated with source and target references. 
auto sv_span = cudf::device_span(string_views); - return cudf::make_strings_column(sv_span, string_view{nullptr, 0}, stream, mr); + return cudf::make_strings_column(sv_span, null_string_view, stream, mr); } /** diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index a7ef2afb47f..ac073f8efbc 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -46,14 +46,20 @@ rmm::device_uvector create_string_vector_from_column( auto strings_vector = rmm::device_uvector(input.size(), stream, mr); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - strings_vector.begin(), - [d_strings = *d_strings] __device__(size_type idx) { - return d_strings.is_null(idx) ? string_view{nullptr, 0} : d_strings.element(idx); - }); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + strings_vector.begin(), + [d_strings = *d_strings] __device__(size_type idx) { + // placeholder for factory function that takes a span of string_views + auto const null_string_view = string_view{nullptr, 0}; + if (d_strings.is_null(idx)) { return null_string_view; } + auto const d_str = d_strings.element(idx); + // special case when the entire column is filled with empty strings: + // here the empty d_str may have a d_str.data() == nullptr + auto const empty_string_view = string_view{}; + return d_str.empty() ? empty_string_view : d_str; + }); return strings_vector; } diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 28ebb6cbcb6..306ab8a3d5c 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -573,6 +573,17 @@ TEST_F(ScatterStringsTests, ScatterScalarNoNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected_table); } +TEST_F(ScatterStringsTests, EmptyStrings) +{ + cudf::test::strings_column_wrapper input{"", "", ""}; + cudf::table_view t({input}); + + // Test for issue 10717: all-empty-string column scatter + auto map = cudf::test::fixed_width_column_wrapper({0}); + auto result = cudf::scatter(t, map, t); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), t); +} + template class BooleanMaskScatter : public cudf::test::BaseFixture { }; From f0b91179b38ba7224a33a9b1390230f8575f886a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 27 Apr 2022 17:40:48 -0500 Subject: [PATCH 04/28] Revise CONTRIBUTING.md (#10644) I have revised the `CONTRIBUTING.md` file to address several pieces that are out of date. I also revised a good portion of the text and updated external references. Finally, I wrapped the lines at 100 characters to align with other Markdown files in the C++ docs. I would prefer to adopt a convention of one sentence per line if reviewers agree, but went with the 100 character wrapping for now to be consistent with other docs. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/brandon-b-miller - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/10644 --- CONTRIBUTING.md | 359 +++++++++++++++++++++++------------------------- 1 file changed, 171 insertions(+), 188 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6d1c0528832..db8a8d88b99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,69 +1,79 @@ # Contributing to cuDF -Contributions to cuDF fall into the following three categories. - -1. To report a bug, request a new feature, or report a problem with - documentation, please file an [issue](https://github.com/rapidsai/cudf/issues/new/choose) - describing in detail the problem or new feature. The RAPIDS team evaluates - and triages issues, and schedules them for a release. If you believe the - issue needs priority attention, please comment on the issue to notify the - team. -2. To propose and implement a new Feature, please file a new feature request - [issue](https://github.com/rapidsai/cudf/issues/new/choose). Describe the - intended feature and discuss the design and implementation with the team and - community. Once the team agrees that the plan looks good, go ahead and - implement it, using the [code contributions](#code-contributions) guide below. -3. To implement a feature or bug-fix for an existing outstanding issue, please - Follow the [code contributions](#code-contributions) guide below. If you - need more context on a particular issue, please ask in a comment. - -As contributors and maintainers to this project, -you are expected to abide by cuDF's code of conduct. -More information can be found at: [Contributor Code of Conduct](https://docs.rapids.ai/resources/conduct/). +Contributions to cuDF fall into the following categories: + +1. To report a bug, request a new feature, or report a problem with documentation, please file an + [issue](https://github.com/rapidsai/cudf/issues/new/choose) describing the problem or new feature + in detail. The RAPIDS team evaluates and triages issues, and schedules them for a release. If you + believe the issue needs priority attention, please comment on the issue to notify the team. +2. To propose and implement a new feature, please file a new feature request + [issue](https://github.com/rapidsai/cudf/issues/new/choose). Describe the intended feature and + discuss the design and implementation with the team and community. Once the team agrees that the + plan looks good, go ahead and implement it, using the [code contributions](#code-contributions) + guide below. +3. To implement a feature or bug fix for an existing issue, please follow the [code + contributions](#code-contributions) guide below. If you need more context on a particular issue, + please ask in a comment. + +As contributors and maintainers to this project, you are expected to abide by cuDF's code of +conduct. More information can be found at: +[Contributor Code of Conduct](https://docs.rapids.ai/resources/conduct/). ## Code contributions ### Your first issue -1. Follow the guide at the bottom of this page for [Setting Up Your Build Environment](#setting-up-your-build-environment). -2. Find an issue to work on. 
The best way is to look for the [good first issue](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) - or [help wanted](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels. +1. Follow the guide at the bottom of this page for + [Setting up your build environment](#setting-up-your-build-environment). +2. Find an issue to work on. The best way is to look for the + [good first issue](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) + or [help wanted](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) + labels. 3. Comment on the issue stating that you are going to work on it. -4. Code! Make sure to update unit tests! -5. When done, [create your pull request](https://github.com/rapidsai/cudf/compare). -6. Verify that CI passes all [status checks](https://help.github.com/articles/about-status-checks/). Fix if needed. -7. Wait for other developers to review your code and update code as needed. -8. Once reviewed and approved, a RAPIDS developer will merge your pull request. - -Remember, if you are unsure about anything, don't hesitate to comment on issues -and ask for clarifications! +4. Create a fork of the cudf repository and check out a branch with a name that + describes your planned work. For example, `fix-documentation`. +5. Write code to address the issue or implement the feature. +6. Add unit tests and unit benchmarks. +7. [Create your pull request](https://github.com/rapidsai/cudf/compare). +8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). + Fix if needed. +9. Wait for other developers to review your code and update code as needed. +10. Once reviewed and approved, a RAPIDS developer will merge your pull request. + +If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! ### Seasoned developers -Once you have gotten your feet wet and are more comfortable with the code, you -can look at the prioritized issues for our next release in our [project boards](https://github.com/rapidsai/cudf/projects). - -> **Pro Tip:** Always look at the release board with the highest number for -issues to work on. This is where RAPIDS developers also focus their efforts. +Once you have gotten your feet wet and are more comfortable with the code, you can look at the +prioritized issues for our next release in our +[project boards](https://github.com/rapidsai/cudf/projects). -Look at the unassigned issues, and find an issue to which you are comfortable -contributing. Start with _Step 3_ above, commenting on the issue to let -others know you are working on it. If you have any questions related to the -implementation of the issue, ask them in the issue instead of the PR. +**Note:** Always look at the release board that is +[currently under development](https://docs.rapids.ai/maintainers) for issues to work on. This is +where RAPIDS developers also focus their efforts. -## Setting Up Your Build Environment +Look at the unassigned issues, and find an issue to which you are comfortable contributing. Start +with _Step 3_ above, commenting on the issue to let others know you are working on it. 
If you have +any questions related to the implementation of the issue, ask them in the issue instead of the PR. -The following instructions are for developers and contributors to cuDF OSS development. These instructions are tested on Linux Ubuntu 16.04 & 18.04. Use these instructions to build cuDF from source and contribute to its development. Other operating systems may be compatible, but are not currently tested. +## Setting up your build environment +The following instructions are for developers and contributors to cuDF development. These +instructions are tested on Ubuntu Linux LTS releases. Use these instructions to build cuDF from +source and contribute to its development. Other operating systems may be compatible, but are not +currently tested. +Building cudf with the provided conda environment is recommended for users who wish to enable all +library features. The following instructions are for building with a conda environment. Dependencies +for a minimal build of libcudf without using conda are also listed below. ### General requirements Compilers: -* `gcc` version 9.3+ -* `nvcc` version 11.5+ -* `cmake` version 3.20.1+ +* `gcc` version 9.3+ +* `nvcc` version 11.5+ +* `cmake` version 3.20.1+ CUDA/GPU: @@ -71,127 +81,166 @@ CUDA/GPU: * NVIDIA driver 450.80.02+ * Pascal architecture or better -You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). +You can obtain CUDA from +[https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). -### Create the build Environment +### Create the build environment + +- Clone the repository: -- Clone the repository and submodules ```bash CUDF_HOME=$(pwd)/cudf git clone https://github.com/rapidsai/cudf.git $CUDF_HOME cd $CUDF_HOME -git submodule update --init --remote --recursive ``` + +#### Building with a conda environment + +**Note:** Using a conda environment is the easiest way to satisfy the library's dependencies. +Instructions for a minimal build environment without conda are included below. + - Create the conda development environment `cudf_dev`: + ```bash # create the conda environment (assuming in base `cudf` directory) -# note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead +# note: RAPIDS currently doesn't support `channel_priority: strict`; +# use `channel_priority: flexible` instead conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml # activate the environment conda activate cudf_dev ``` -- For other CUDA versions, check the corresponding cudf_dev_cuda*.yml file in conda/environments + +- **Note**: the conda environment files are updated frequently, so the + development environment may also need to be updated if dependency versions or + pinnings are changed. + +- For other CUDA versions, check the corresponding `cudf_dev_cuda*.yml` file in + `conda/environments/`. + +#### Building without a conda environment + +- libcudf has the following minimal dependencies (in addition to those listed in the [General + requirements](#general-requirements)). The packages listed below use Ubuntu package names: + + - `build-essential` + - `libssl-dev` + - `libz-dev` + - `libpython3-dev` (required if building cudf) ### Build cuDF from source -- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments will install the `libcudf`, `cudf` and `dask_cudf` libraries. 
By default, the libraries are installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your path, or defined in `$CUDACXX`. +- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments + will install the `libcudf`, `cudf` and `dask_cudf` libraries. By default, the libraries are + installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location + in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your + path, or defined in `$CUDACXX`. + ```bash cd $CUDF_HOME # Choose one of the following commands, depending on whether -# you want to build and install the libcudf C++ library only, +# you want to build and install the libcudf C++ library only, # or include the cudf and/or dask_cudf Python libraries: ./build.sh # libcudf, cudf and dask_cudf ./build.sh libcudf # libcudf only -./build.sh libcudf cudf # libcudf and cudf only +./build.sh libcudf cudf # libcudf and cudf only ``` -- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the complete list of libraries as well as details about the script usage, run the `help` command: + +- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the + complete list of libraries as well as details about the script usage, run the `help` command: + ```bash -./build.sh --help +./build.sh --help ``` ### Build, install and test cuDF libraries for contributors -The general workflow is provided below. Please, also see the last section about [code formatting](###code-formatting). +The general workflow is provided below. Please also see the last section about +[code formatting](#code-formatting). 
#### `libcudf` (C++) -If you're only interested in building the library (and not the unit tests): - +- If you're only interested in building the library (and not the unit tests): + ```bash cd $CUDF_HOME ./build.sh libcudf ``` -If, in addition, you want to build tests: + +- If, in addition, you want to build tests: ```bash ./build.sh libcudf tests ``` -To run the tests: + +- To run the tests: ```bash -make test +make test ``` #### `cudf` (Python) - First, build the `libcudf` C++ library following the steps above -- To build and install in edit/develop `cudf` python package: +- To build and install in edit/develop `cudf` Python package: ```bash cd $CUDF_HOME/python/cudf python setup.py build_ext --inplace python setup.py develop ``` -- To run `cudf` tests : +- To run `cudf` tests: ```bash cd $CUDF_HOME/python -py.test -v cudf/cudf/tests +pytest -v cudf/cudf/tests ``` #### `dask-cudf` (Python) - First, build the `libcudf` C++ and `cudf` Python libraries following the steps above -- To install in edit/develop mode the `dask-cudf` python package: +- To install the `dask-cudf` Python package in editable/develop mode: ```bash cd $CUDF_HOME/python/dask_cudf python setup.py build_ext --inplace python setup.py develop ``` -- To run `dask_cudf` tests : +- To run `dask_cudf` tests: ```bash cd $CUDF_HOME/python -py.test -v dask_cudf +pytest -v dask_cudf ``` #### `libcudf_kafka` (C++) -If you're only interested in building the library (and not the unit tests): - +- If you're only interested in building the library (and not the unit tests): + ```bash cd $CUDF_HOME ./build.sh libcudf_kafka ``` -If, in addition, you want to build tests: + +- If, in addition, you want to build tests: ```bash ./build.sh libcudf_kafka tests ``` -To run the tests: + +- To run the tests: ```bash -make test +make test ``` #### `cudf-kafka` (Python) -- First, build the `libcudf` and `libcudf_kafka` following the steps above +- First, build the `libcudf` and `libcudf_kafka` libraries following the steps above + +- To install the `cudf-kafka` Python package in editable/develop mode: -- To install in edit/develop mode the `cudf-kafka` python package: ```bash cd $CUDF_HOME/python/cudf_kafka python setup.py build_ext --inplace @@ -202,7 +251,8 @@ python setup.py develop - First, build `libcudf`, `libcudf_kafka`, and `cudf_kafka` following the steps above -- To install in edit/develop mode the `custreamz` python package: +- To install the `custreamz` Python package in editable/develop mode: + ```bash cd $CUDF_HOME/python/custreamz python setup.py build_ext --inplace @@ -210,40 +260,45 @@ python setup.py develop ``` - To run `custreamz` tests : + ```bash cd $CUDF_HOME/python -py.test -v custreamz +pytest -v custreamz ``` #### `cudf` (Java): - First, build the `libcudf` C++ library following the steps above -- Then, refer to [Java README](https://github.com/rapidsai/cudf/blob/branch-21.10/java/README.md) - +- Then, refer to the [Java README](java/README.md) -Done! You are ready to develop for the cuDF OSS project. But please go to [code formatting](###code-formatting) to ensure that you contributing code follows the expected format. +Done! You are ready to develop for the cuDF project. Please review the project's +[code formatting guidelines](#code-formatting). ## Debugging cuDF -### Building Debug mode from source +### Building in debug mode from source -Follow the [above instructions](####build-cudf-from-source) to build from source and add `-g` to the `./build.sh` command. 
+Follow the instructions to [build from source](#build-cudf-from-source) and add `-g` to the +`./build.sh` command. For example: + ```bash ./build.sh libcudf -g ``` -This builds `libcudf` in Debug mode which enables some `assert` safety checks and includes symbols in the library for debugging. +This builds `libcudf` in debug mode which enables some `assert` safety checks and includes symbols +in the library for debugging. All other steps for installing `libcudf` into your environment are the same. ### Debugging with `cuda-gdb` and `cuda-memcheck` -When you have a debug build of `libcudf` installed, debugging with the `cuda-gdb` and `cuda-memcheck` is easy. +When you have a debug build of `libcudf` installed, debugging with the `cuda-gdb` and +`cuda-memcheck` is easy. -If you are debugging a Python script, simply run the following: +If you are debugging a Python script, run the following: ```bash cuda-gdb -ex r --args python .py @@ -255,143 +310,71 @@ cuda-memcheck python .py ### Device debug symbols -The device debug symbols are not automatically added with the cmake `Debug` -build type because it causes a runtime delay of several minutes when loading -the libcudf.so library. +The device debug symbols are not automatically added with the cmake `Debug` build type because it +causes a runtime delay of several minutes when loading the libcudf.so library. -Therefore, it is recommended to add device debug symbols only to specific files by -setting the `-G` compile option locally in your `cpp/CMakeLists.txt` for that file. -Here is an example of adding the `-G` option to the compile command for -`src/copying/copy.cu` source file: +Therefore, it is recommended to add device debug symbols only to specific files by setting the `-G` +compile option locally in your `cpp/CMakeLists.txt` for that file. Here is an example of adding the +`-G` option to the compile command for `src/copying/copy.cu` source file: -``` +```cmake set_source_files_properties(src/copying/copy.cu PROPERTIES COMPILE_OPTIONS "-G") ``` -This will add the device debug symbols for this object file in libcudf.so. -You can then use `cuda-dbg` to debug into the kernels in that source file. - -### Building and Testing on a gpuCI image locally - -Before submitting a pull request, you can do a local build and test on your machine that mimics our gpuCI environment using the `ci/local/build.sh` script. -For detailed information on usage of this script, see [here](ci/local/README.md). - +This will add the device debug symbols for this object file in `libcudf.so`. You can then use +`cuda-dbg` to debug into the kernels in that source file. -## Automated Build in Docker Container +## Code Formatting -A Dockerfile is provided with a preconfigured conda environment for building and installing cuDF from source based off of the main branch. +### C++/CUDA -### Prerequisites +cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html). -* Install [nvidia-docker2](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)) for Docker + GPU support -* Verify NVIDIA driver is `450.80.02` or higher -* Ensure CUDA 11.0+ is installed - -### Usage +In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run: -From cudf project root run the following, to build with defaults: -```bash -docker build --tag cudf . 
-``` -After the container is built run the container: ```bash -docker run --runtime=nvidia -it cudf bash -``` -Activate the conda environment `cudf` to use the newly built cuDF and libcudf libraries: -``` -root@3f689ba9c842:/# source activate cudf -(cudf) root@3f689ba9c842:/# python -c "import cudf" -(cudf) root@3f689ba9c842:/# +python3 ./cpp/scripts/run-clang-format.py -inplace ``` -### Customizing the Build - -Several build arguments are available to customize the build process of the -container. These are specified by using the Docker [build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg) -flag. Below is a list of the available arguments and their purpose: +Additionally, many editors have plugins or extensions that you can set up to automatically run +`clang-format` either manually or on file save. -| Build Argument | Default Value | Other Value(s) | Purpose | -| --- | --- | --- | --- | -| `CUDA_VERSION` | 11.0 | 11.2.2 | set CUDA version | -| `LINUX_VERSION` | ubuntu18.04 | ubuntu20.04 | set Ubuntu version | -| `CC` & `CXX` | 9 | 10 | set gcc/g++ version | -| `CUDF_REPO` | This repo | Forks of cuDF | set git URL to use for `git clone` | -| `CUDF_BRANCH` | main | Any branch name | set git branch to checkout of `CUDF_REPO` | -| `NUMBA_VERSION` | newest | >=0.40.0 | set numba version | -| `NUMPY_VERSION` | newest | >=1.14.3 | set numpy version | -| `PANDAS_VERSION` | newest | >=0.23.4 | set pandas version | -| `PYARROW_VERSION` | 1.0.1 | Not supported | set pyarrow version | -| `CMAKE_VERSION` | newest | >=3.18 | set cmake version | -| `CYTHON_VERSION` | 0.29 | Not supported | set Cython version | -| `PYTHON_VERSION` | 3.7 | 3.8 | set python version | +### Python / Pre-commit hooks +cuDF uses [pre-commit](https://pre-commit.com/) to execute code linters and formatters such as +[Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), and +[flake8](https://flake8.pycqa.org/en/latest/). These tools ensure a consistent code format +throughout the project. Using pre-commit ensures that linter versions and options are aligned for +all developers. Additionally, there is a CI check in place to enforce that committed code follows +our standards. -### Code Formatting - - -#### Python - -cuDF uses [Black](https://black.readthedocs.io/en/stable/), -[isort](https://readthedocs.org/projects/isort/), and -[flake8](http://flake8.pycqa.org/en/latest/) to ensure a consistent code format -throughout the project. They have been installed during the `cudf_dev` environment creation. - -These tools are used to auto-format the Python code, as well as check the Cython -code in the repository. Additionally, there is a CI check in place to enforce -that committed code follows our standards. You can use the tools to -automatically format your python code by running: +To use `pre-commit`, install via `conda` or `pip`: ```bash -isort --atomic python/**/*.py -black python +conda install -c conda-forge pre-commit ``` -and then check the syntax of your Python and Cython code by running: - ```bash -flake8 python -flake8 --config=python/.flake8.cython -``` - -Additionally, many editors have plugins that will apply `isort` and `Black` as -you edit files, as well as use `flake8` to report any style / syntax issues. 
-
-#### C++/CUDA
-
-cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html)
-
-In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run:
-```
-python3 ./cpp/scripts/run-clang-format.py -inplace
-```
-
-Additionally, many editors have plugins or extensions that you can set up to automatically run `clang-format` either manually or on file save.
-
-#### Pre-commit hooks
-
-Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com/)
-to automatically run `isort`, `Black`, `flake8` and `clang-format` when you make a git commit.
-This can be done by installing `pre-commit` via `conda` or `pip`:

```bash
-conda install -c conda-forge pre_commit
```

```bash
pip install pre-commit
```

-and then running:
+Then run pre-commit hooks before committing code:

```bash
-pre-commit install
+pre-commit run
```

+Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:

```bash
pre-commit install
```

-from the root of the cuDF repository. Now `isort`, `Black`, `flake8` and `clang-format` will be
-run each time you commit changes.
+Now code linters and formatters will be run each time you commit changes.

----
+You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`.

## Attribution
+
Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
Portions adopted from https://github.com/dask/dask/blob/master/docs/source/develop.rst

From 03d419d96753d29cf24226ab661377da23eef969 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" 
Date: Thu, 28 Apr 2022 08:27:45 -0500
Subject: [PATCH 05/28] Prepare dask_cudf test_parquet.py for upcoming API
 changes (#10709)

This is a relatively simple PR to clean up `dask_cudf`'s `to/read_parquet` tests. These changes are mostly meant to avoid **future** test failures that will arise after impending changes are implemented in upstream Dask. These changes include:

- The default value for `write_metadata_file` will become `False` for `to_parquet` (because writing the _metadata file scales very poorly)
- The default value for `split_row_groups` will become `False` (because this setting is typically optimal when the files are not too large). Users with larger-than-memory files will need to specify `split_row_groups=True/int` explicitly.
- The `gather_statistics` argument will be removed in favor of a more descriptive `calculate_divisions` argument.

This PR also removes the long-deprecated `row_groups_per_part` argument from `dask_cudf.read_parquet` (established replacement is `split_row_groups`).
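As a minimal, hedged sketch of what this migration means for users (the dataset path is illustrative; the `split_row_groups` semantics follow the `read_parquet` docstring added in the diff below):

```python
import dask_cudf

# Map each Parquet row-group to a distinct DataFrame partition explicitly,
# instead of relying on the old automatic-splitting default:
ddf = dask_cudf.read_parquet("/path/to/dataset/", split_row_groups=True)

# Or allow at most 10 row-groups in each output partition:
ddf = dask_cudf.read_parquet("/path/to/dataset/", split_row_groups=10)
```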
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ray Douglass (https://github.com/raydouglass) - gpuCI (https://github.com/GPUtester) - Mike Wendt (https://github.com/mike-wendt) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) - GALI PREM SAGAR (https://github.com/galipremsagar) - Randy Gelhausen (https://github.com/randerzander) URL: https://github.com/rapidsai/cudf/pull/10709 --- python/dask_cudf/dask_cudf/io/parquet.py | 212 +++++++++++------- .../dask_cudf/io/tests/test_parquet.py | 106 ++++++--- 2 files changed, 197 insertions(+), 121 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 042759f68cf..b201626becf 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -177,65 +177,98 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) read_kwargs = kwargs.get("read", {}) read_kwargs.update(open_file_options or {}) - - # Assume multi-piece read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else row_group - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + check_file_size = read_kwargs.pop("check_file_size", None) + + # Wrap reading logic in a `try` block so that we can + # inform the user that the `read_parquet` partition + # size is too large for the available memory + try: + + # Assume multi-piece read + paths = [] + rgs = [] + last_partition_keys = None + dfs = [] + + for i, piece in enumerate(pieces): + + (path, row_group, partition_keys) = piece + row_group = None if row_group == [None] else row_group + + # File-size check to help "protect" users from change + # to up-stream `split_row_groups` default. We only + # check the file size if this partition corresponds + # to a full file, and `check_file_size` is defined + if check_file_size and len(pieces) == 1 and row_group is None: + file_size = fs.size(path) + if file_size > check_file_size: + warnings.warn( + f"A large parquet file ({file_size}B) is being " + f"used to create a DataFrame partition in " + f"read_parquet. This may cause out of memory " + f"exceptions in operations downstream. See the " + f"notes on split_row_groups in the read_parquet " + f"documentation. Setting split_row_groups " + f"explicitly will silence this warning." 
+ ) + + if i > 0 and partition_keys != last_partition_keys: + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **read_kwargs, + ) ) + paths = rgs = [] + last_partition_keys = None + paths.append(path) + rgs.append( + [row_group] + if not isinstance(row_group, list) + and row_group is not None + else row_group ) - paths = rgs = [] - last_partition_keys = None - paths.append(path) - rgs.append( - [row_group] - if not isinstance(row_group, list) and row_group is not None - else row_group - ) - last_partition_keys = partition_keys + last_partition_keys = partition_keys - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **read_kwargs, + ) ) - ) - df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - - # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, schema) + df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - if index and (index[0] in df.columns): - df = df.set_index(index[0]) - elif index is False and df.index.names != (None,): - # If index=False, we shouldn't have a named index - df.reset_index(inplace=True) + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema(df, schema) + + if index and (index[0] in df.columns): + df = df.set_index(index[0]) + elif index is False and df.index.names != (None,): + # If index=False, we shouldn't have a named index + df.reset_index(inplace=True) + + except MemoryError as err: + raise MemoryError( + "Parquet data was larger than the available GPU memory!\n\n" + "See the notes on split_row_groups in the read_parquet " + "documentation.\n\n" + "Original Error: " + str(err) + ) + raise err return df @@ -349,25 +382,34 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet( - path, - columns=None, - split_row_groups=None, - row_groups_per_part=None, - **kwargs, -): +def read_parquet(path, columns=None, **kwargs): """Read parquet files into a Dask DataFrame - Calls ``dask.dataframe.read_parquet`` to cordinate the execution of - ``cudf.read_parquet``, and ultimately read multiple partitions into - a single Dask dataframe. The Dask version must supply an - ``ArrowDatasetEngine`` class to support full functionality. - See ``cudf.read_parquet`` and Dask documentation for further details. + Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` + to cordinate the execution of ``cudf.read_parquet``, and to + ultimately create a ``dask_cudf.DataFrame`` collection. + + See the ``dask.dataframe.read_parquet`` documentation for + all available options. 
Examples -------- - >>> import dask_cudf - >>> df = dask_cudf.read_parquet("/path/to/dataset/") # doctest: +SKIP + >>> from dask_cudf import read_parquet + >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP + + When dealing with one or more large parquet files having an + in-memory footprint >15% device memory, the ``split_row_groups`` + argument should be used to map Parquet **row-groups** to DataFrame + partitions (instead of **files** to partitions). For example, the + following code will map each row-group to a distinct partition: + + >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP + + To map **multiple** row-groups to each partition, an integer can be + passed to ``split_row_groups`` to specify the **maximum** number of + row-groups allowed in each output partition: + + >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP See Also -------- @@ -376,22 +418,24 @@ def read_parquet( if isinstance(columns, str): columns = [columns] - if row_groups_per_part: - warnings.warn( - "row_groups_per_part is deprecated. " - "Pass an integer value to split_row_groups instead.", - FutureWarning, - ) - if split_row_groups is None: - split_row_groups = row_groups_per_part - - return dd.read_parquet( - path, - columns=columns, - split_row_groups=split_row_groups, - engine=CudfEngine, - **kwargs, - ) + # Set "check_file_size" option to determine whether we + # should check the parquet-file size. This check is meant + # to "protect" users from `split_row_groups` default changes + check_file_size = kwargs.pop("check_file_size", 500_000_000) + if ( + check_file_size + and ("split_row_groups" not in kwargs) + and ("chunksize" not in kwargs) + ): + # User is not specifying `split_row_groups` or `chunksize`, + # so we should warn them if/when a file is ~>0.5GB on disk. + # They can set `split_row_groups` explicitly to silence/skip + # this check + if "read" not in kwargs: + kwargs["read"] = {} + kwargs["read"]["check_file_size"] = check_file_size + + return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) to_parquet = partial(dd.to_parquet, engine=CudfEngine) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d9b8ee4595a..ef5741b0539 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -36,42 +36,55 @@ ddf = dd.from_pandas(df, npartitions=npartitions) -@pytest.mark.parametrize("stats", [True, False]) -def test_roundtrip_from_dask(tmpdir, stats): +# Helper function to make it easier to handle the +# upcoming deprecation of `gather_statistics`. 
+# See: https://github.com/dask/dask/issues/8937 +# TODO: This function should be used to switch to +# the "new" `calculate_divisions` kwarg (for newer +# Dask versions) once it is introduced +def _divisions(setting): + return {"gather_statistics": setting} + + +@pytest.mark.parametrize("write_metadata_file", [True, False]) +@pytest.mark.parametrize("divisions", [True, False]) +def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): tmpdir = str(tmpdir) - ddf.to_parquet(tmpdir, engine="pyarrow") + ddf.to_parquet( + tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow" + ) files = sorted( (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)), key=natural_sort_key, ) # Read list of parquet files - ddf2 = dask_cudf.read_parquet(files, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] ddf2 = dask_cudf.read_parquet( - files, columns=["x"], gather_statistics=stats + files, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' - ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) # Now include metadata - ddf2 = dask_cudf.read_parquet(tmpdir, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] (with metadata) ddf2 = dask_cudf.read_parquet( - tmpdir, columns=["x"], gather_statistics=stats + tmpdir, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' (with metadata) - ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) def test_roundtrip_from_dask_index_false(tmpdir): @@ -99,8 +112,8 @@ def test_roundtrip_from_dask_cudf(tmpdir, write_meta): gddf = dask_cudf.from_dask_dataframe(ddf) gddf.to_parquet(tmpdir, write_metadata_file=write_meta) - gddf2 = dask_cudf.read_parquet(tmpdir) - dd.assert_eq(gddf, gddf2, check_divisions=write_meta) + gddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(True)) + dd.assert_eq(gddf, gddf2) def test_roundtrip_none_rangeindex(tmpdir): @@ -161,21 +174,21 @@ def test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_dask(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_dask(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, 
check_divisions=(divisions and index), check_index=index ) @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( @@ -183,9 +196,9 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): ) ddf2.name = ddf2.name.astype("object") ddf2.to_parquet(fn, write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -212,17 +225,23 @@ def test_filters(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow") - a = dask_cudf.read_parquet(tmp_path, filters=[("x", ">", 4)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", ">", 4)], split_row_groups=True + ) assert a.npartitions == 3 assert (a.x > 3).all().compute() - b = dask_cudf.read_parquet(tmp_path, filters=[("y", "==", "c")]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("y", "==", "c")], split_row_groups=True + ) assert b.npartitions == 1 b = b.compute().to_pandas() assert (b.y == "c").all() c = dask_cudf.read_parquet( - tmp_path, filters=[("y", "==", "c"), ("x", ">", 6)] + tmp_path, + filters=[("y", "==", "c"), ("x", ">", 6)], + split_row_groups=True, ) assert c.npartitions <= 1 assert not len(c) @@ -237,13 +256,17 @@ def test_filters_at_row_group_level(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=10 / 5) - a = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert a.npartitions == 1 assert (a.shape[0] == 2).compute() ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1) - b = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert b.npartitions == 1 assert (b.shape[0] == 1).compute() @@ -341,7 +364,7 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, + **_divisions(True), ) ddf2.compute(scheduler="synchronous") @@ -360,8 +383,8 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, aggregate_files=True, + **_divisions(True), ) dd.assert_eq(ddf1, ddf3, check_divisions=False) @@ -382,7 +405,7 @@ def test_chunksize(tmpdir, chunksize, metadata): @pytest.mark.parametrize("row_groups", [1, 3, 10, 12]) @pytest.mark.parametrize("index", [False, True]) -def test_row_groups_per_part(tmpdir, row_groups, index): +def test_split_row_groups(tmpdir, row_groups, index): nparts = 2 df_size = 100 row_group_size = 5 @@ -410,7 +433,7 @@ def test_row_groups_per_part(tmpdir, row_groups, index): ddf2 = dask_cudf.read_parquet( str(tmpdir), - row_groups_per_part=row_groups, + split_row_groups=row_groups, ) dd.assert_eq(ddf1, ddf2, check_divisions=False) @@ -448,9 +471,9 @@ def test_create_metadata_file(tmpdir, partition_on): # with the _metadata file present ddf2 = dask_cudf.read_parquet( tmpdir, - gather_statistics=True, split_row_groups=False, index="myindex", + **_divisions(True), ) if partition_on: ddf1 = df1.sort_values("b") @@ 
-481,7 +504,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # New pyarrow-dataset base can handle an inconsistent # schema (even without a _metadata file), but computing # and dtype validation may fail - ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf1 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Add global metadata file. # Dask-CuDF can do this without requiring schema @@ -490,7 +513,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # Check that we can still read the ddf # with the _metadata file present - ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf2 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Check that the result is the same with and # without the _metadata file. Note that we must @@ -538,3 +561,12 @@ def test_cudf_list_struct_write(tmpdir): ddf.to_parquet(temp_file) new_ddf = dask_cudf.read_parquet(temp_file) dd.assert_eq(df, new_ddf) + + +def test_check_file_size(tmpdir): + # Test simple file-size check to help warn users + # of upstream change to `split_row_groups` default + fn = str(tmpdir.join("test.parquet")) + cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) + with pytest.warns(match="large parquet file"): + dask_cudf.read_parquet(fn, check_file_size=1).compute() From a43fb9eafb15b50bf5de21ac0bdebd3b490f511e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 28 Apr 2022 11:04:12 -0400 Subject: [PATCH 06/28] Implement DataFrame.eval using libcudf ASTs (#8022) This PR exposes `libcudf`'s expression parsing functionality in `cudf` and uses it to implement `DataFrame.eval`. The implementation is mostly feature-complete, but there are a few limitations relative to the `pandas` API and a couple of gotchas around type casting. The implementation is reasonably performant, improving upon an equivalent `df.apply` even accounting for JIT-compilation overhead. This implementation provides a stepping stone to leveraging `libcudf`'s AST implementation for more complex tasks in `cudf` such as conditional joins. The most significant issue with the current implementation is the lack of casting between integral types, meaning that operations can only be performed between columns of the _exact_ same dtype. For example, operations between int8 and int16 would fail. This becomes particularly problematic for constants e.g. `df.eval('x+1')`. The best paths to improve this are at the C++ level of the expression evaluation, so I think we'll have to live with this limitation for now if we want to move forward. 
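As a minimal, hedged usage sketch (column names and values are illustrative, and both operands share a single dtype to stay within the casting limitation described above):

```python
import cudf

df = cudf.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})

# Both columns are float64, so no cross-dtype casting is required:
result = df.eval("x + y")
print(result)
```

Note that an expression mixing integral dtypes (for example, an int8 column with an int16 column, or an int column with the literal `1` as in `df.eval('x+1')`) falls under the limitation above and is expected to fail until casting support lands at the C++ level.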
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8022 --- cpp/include/cudf/ast/expressions.hpp | 6 +- python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/cpp/expressions.pxd | 88 +++++++ python/cudf/cudf/_lib/cpp/transform.pxd | 8 +- python/cudf/cudf/_lib/expressions.pxd | 38 +++ python/cudf/cudf/_lib/expressions.pyx | 130 ++++++++++ python/cudf/cudf/_lib/transform.pyx | 37 +++ .../cudf/cudf/core/_internals/expressions.py | 222 ++++++++++++++++++ python/cudf/cudf/core/dataframe.py | 160 +++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 87 +++++++ 10 files changed, 775 insertions(+), 4 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/expressions.pxd create mode 100644 python/cudf/cudf/_lib/expressions.pxd create mode 100644 python/cudf/cudf/_lib/expressions.pyx create mode 100644 python/cudf/cudf/core/_internals/expressions.py diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index eb98e0e0bee..96c99e054a5 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace ast { @@ -53,7 +55,7 @@ struct expression { /** * @brief Enum of supported operators. */ -enum class ast_operator { +enum class ast_operator : int32_t { // Binary operators ADD, ///< operator + SUB, ///< operator - diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index bd25aa53405..542262b7908 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np from . import ( @@ -8,6 +8,7 @@ copying, csv, datetime, + expressions, filling, gpuarrow, groupby, diff --git a/python/cudf/cudf/_lib/cpp/expressions.pxd b/python/cudf/cudf/_lib/cpp/expressions.pxd new file mode 100644 index 00000000000..1721f8aa734 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/expressions.pxd @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + timestamp_scalar, +) +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: + ctypedef enum ast_operator: + # Binary operators + ADD "cudf::ast::ast_operator::ADD" + SUB "cudf::ast::ast_operator::SUB" + MUL "cudf::ast::ast_operator::MUL" + DIV "cudf::ast::ast_operator::DIV" + TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" + FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" + MOD "cudf::ast::ast_operator::MOD" + PYMOD "cudf::ast::ast_operator::PYMOD" + POW "cudf::ast::ast_operator::POW" + EQUAL "cudf::ast::ast_operator::EQUAL" + NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" + NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" + LESS "cudf::ast::ast_operator::LESS" + GREATER "cudf::ast::ast_operator::GREATER" + LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" + GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" + BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" + BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" + BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" + NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" + LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" + NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" + LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + # Unary operators + IDENTITY "cudf::ast::ast_operator::IDENTITY" + SIN "cudf::ast::ast_operator::SIN" + COS "cudf::ast::ast_operator::COS" + TAN "cudf::ast::ast_operator::TAN" + ARCSIN "cudf::ast::ast_operator::ARCSIN" + ARCCOS "cudf::ast::ast_operator::ARCCOS" + ARCTAN "cudf::ast::ast_operator::ARCTAN" + SINH "cudf::ast::ast_operator::SINH" + COSH "cudf::ast::ast_operator::COSH" + TANH "cudf::ast::ast_operator::TANH" + ARCSINH "cudf::ast::ast_operator::ARCSINH" + ARCCOSH "cudf::ast::ast_operator::ARCCOSH" + ARCTANH "cudf::ast::ast_operator::ARCTANH" + EXP "cudf::ast::ast_operator::EXP" + LOG "cudf::ast::ast_operator::LOG" + SQRT "cudf::ast::ast_operator::SQRT" + CBRT "cudf::ast::ast_operator::CBRT" + CEIL "cudf::ast::ast_operator::CEIL" + FLOOR "cudf::ast::ast_operator::FLOOR" + ABS "cudf::ast::ast_operator::ABS" + RINT "cudf::ast::ast_operator::RINT" + BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" + NOT "cudf::ast::ast_operator::NOT" + + cdef cppclass expression: + pass + + ctypedef enum table_reference: + LEFT "cudf::ast::table_reference::LEFT" + RIGHT "cudf::ast::table_reference::RIGHT" + + cdef cppclass literal(expression): + # Due to https://github.com/cython/cython/issues/3198, we need to + # specify a return type for templated constructors. + literal literal[T](numeric_scalar[T] &) except + + literal literal[T](timestamp_scalar[T] &) except + + literal literal[T](duration_scalar[T] &) except + + + cdef cppclass column_reference(expression): + # Allow for default C++ parameters by declaring multiple constructors + # with the default parameters optionally omitted. 
+ column_reference(size_type) except + + column_reference(size_type, table_reference) except + + + cdef cppclass operation(expression): + operation(ast_operator, const expression &) + operation(ast_operator, const expression &, const expression&) diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 590a371ff52..d9de04b676e 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,6 +9,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type @@ -42,3 +43,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: column_view input_column, column_view categories ) + + cdef unique_ptr[column] compute_column( + const table_view table, + const expression& expr + ) except + diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/expressions.pxd new file mode 100644 index 00000000000..85665822174 --- /dev/null +++ b/python/cudf/cudf/_lib/expressions.pxd @@ -0,0 +1,38 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.expressions cimport ( + column_reference, + expression, + literal, + operation, +) +from cudf._lib.cpp.scalar.scalar cimport numeric_scalar + +ctypedef enum scalar_type_t: + INT + DOUBLE + + +ctypedef union int_or_double_scalar_ptr: + unique_ptr[numeric_scalar[int64_t]] int_ptr + unique_ptr[numeric_scalar[double]] double_ptr + + +cdef class Expression: + cdef unique_ptr[expression] c_obj + + +cdef class Literal(Expression): + cdef scalar_type_t c_scalar_type + cdef int_or_double_scalar_ptr c_scalar + + +cdef class ColumnReference(Expression): + pass + + +cdef class Operation(Expression): + pass diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx new file mode 100644 index 00000000000..f069bcdbe73 --- /dev/null +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -0,0 +1,130 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from enum import Enum + +from cython.operator cimport dereference +from libc.stdint cimport int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp cimport expressions as libcudf_exp +from cudf._lib.cpp.types cimport size_type + +# Necessary for proper casting, see below. 
+ctypedef int32_t underlying_type_ast_operator + + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + + +class ASTOperator(Enum): + ADD = libcudf_exp.ast_operator.ADD + SUB = libcudf_exp.ast_operator.SUB + MUL = libcudf_exp.ast_operator.MUL + DIV = libcudf_exp.ast_operator.DIV + TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV + FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV + MOD = libcudf_exp.ast_operator.MOD + PYMOD = libcudf_exp.ast_operator.PYMOD + POW = libcudf_exp.ast_operator.POW + EQUAL = libcudf_exp.ast_operator.EQUAL + NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL + NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL + LESS = libcudf_exp.ast_operator.LESS + GREATER = libcudf_exp.ast_operator.GREATER + LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL + GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL + BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND + BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR + BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR + LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND + NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND + LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR + NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR + # Unary operators + IDENTITY = libcudf_exp.ast_operator.IDENTITY + SIN = libcudf_exp.ast_operator.SIN + COS = libcudf_exp.ast_operator.COS + TAN = libcudf_exp.ast_operator.TAN + ARCSIN = libcudf_exp.ast_operator.ARCSIN + ARCCOS = libcudf_exp.ast_operator.ARCCOS + ARCTAN = libcudf_exp.ast_operator.ARCTAN + SINH = libcudf_exp.ast_operator.SINH + COSH = libcudf_exp.ast_operator.COSH + TANH = libcudf_exp.ast_operator.TANH + ARCSINH = libcudf_exp.ast_operator.ARCSINH + ARCCOSH = libcudf_exp.ast_operator.ARCCOSH + ARCTANH = libcudf_exp.ast_operator.ARCTANH + EXP = libcudf_exp.ast_operator.EXP + LOG = libcudf_exp.ast_operator.LOG + SQRT = libcudf_exp.ast_operator.SQRT + CBRT = libcudf_exp.ast_operator.CBRT + CEIL = libcudf_exp.ast_operator.CEIL + FLOOR = libcudf_exp.ast_operator.FLOOR + ABS = libcudf_exp.ast_operator.ABS + RINT = libcudf_exp.ast_operator.RINT + BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT + NOT = libcudf_exp.ast_operator.NOT + + +class TableReference(Enum): + LEFT = libcudf_exp.table_reference.LEFT + RIGHT = libcudf_exp.table_reference.RIGHT + + +# Note that this function only currently supports numeric literals. libcudf +# expressions don't really support other types yet though, so this isn't +# restrictive at the moment. +cdef class Literal(Expression): + def __cinit__(self, value): + # TODO: Would love to find a better solution than unions for literals. 
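The `scalar_type_t` tag plus the pointer union declared in the .pxd above form a classic tagged union. A pure-Python stand-in for the same idea (illustrative only; the real class owns libcudf scalar objects):

    from dataclasses import dataclass
    from typing import Union

    @dataclass
    class PyLiteral:
        kind: str                 # stands in for scalar_type_t: "INT" or "DOUBLE"
        value: Union[int, float]  # stands in for the int/double union payload

    def make_literal(value):
        # Dispatch on the Python type, as Literal.__cinit__ does below.
        if isinstance(value, int):
            return PyLiteral("INT", value)
        if isinstance(value, float):
            return PyLiteral("DOUBLE", value)
        raise TypeError(f"unsupported literal type: {type(value).__name__}")

    assert make_literal(3).kind == "INT"
    assert make_literal(3.0).kind == "DOUBLE"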
+ cdef int intval + cdef double doubleval + + if isinstance(value, int): + self.c_scalar_type = scalar_type_t.INT + intval = value + self.c_scalar.int_ptr = make_unique[numeric_scalar[int64_t]]( + intval, True + ) + self.c_obj = make_unique[libcudf_exp.literal]( + dereference(self.c_scalar.int_ptr) + ) + elif isinstance(value, float): + self.c_scalar_type = scalar_type_t.DOUBLE + doubleval = value + self.c_scalar.double_ptr = make_unique[numeric_scalar[double]]( + doubleval, True + ) + self.c_obj = make_unique[libcudf_exp.literal]( + dereference(self.c_scalar.double_ptr) + ) + + +cdef class ColumnReference(Expression): + def __cinit__(self, size_type index): + self.c_obj = make_unique[libcudf_exp.column_reference]( + index + ) + + +cdef class Operation(Expression): + def __cinit__(self, op, Expression left, Expression right=None): + # This awkward double casting is the only way to get Cython to generate + # valid C++. Cython doesn't support scoped enumerations, so it assumes + # that enums correspond to their underlying value types and will thus + # attempt operations that are invalid without first explicitly casting + # to the underlying before casting to the desired type. + cdef libcudf_exp.ast_operator op_value = ( + op.value + ) + + if right is None: + self.c_obj = make_unique[libcudf_exp.operation]( + op_value, dereference(left.c_obj) + ) + else: + self.c_obj = make_unique[libcudf_exp.operation]( + op_value, dereference(left.c_obj), dereference(right.c_obj) + ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 175150b6865..2d94ef2cedf 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -5,9 +5,11 @@ from numba.np import numpy_support import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES +from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import Buffer from cudf.utils import cudautils +from cython.operator cimport dereference from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -20,14 +22,18 @@ cimport cudf._lib.cpp.transform as libcudf_transform from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id +from cudf._lib.expressions cimport Expression from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( columns_from_unique_ptr, data_from_table_view, + data_from_unique_ptr, table_view_from_columns, + table_view_from_table, ) @@ -156,3 +162,34 @@ def one_hot_encode(Column input_column, Column categories): ) return encodings + + +def compute_column(list columns, tuple column_names, expr: str): + """Compute a new column by evaluating an expression on a set of columns. + + Parameters + ---------- + columns : list + The set of columns forming the table to evaluate the expression on. + column_names : tuple[str] + The names associated with each column. These names are necessary to map + column names in the expression to indices in the provided list of + columns, which are what will be used by libcudf to evaluate the + expression on the table. + expr : str + The expression to evaluate. 
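For orientation, this is roughly how the helper can be exercised from Python once the patch is applied; it goes through cudf internals, so treat it as a sketch rather than supported API:

    import cudf
    from cudf._lib.transform import compute_column

    df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    # _columns and _column_names are internal attributes, used here the
    # same way DataFrame.eval uses them later in this patch.
    col = compute_column([*df._columns], df._column_names, "a + b")
    print(cudf.Series._from_data({None: col}))  # 11, 22, 33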
+ """ + visitor = parse_expression(expr, column_names) + + # At the end, all the stack contains is the expression to evaluate. + cdef Expression cudf_expr = visitor.expression + cdef table_view tbl = table_view_from_columns(columns) + cdef unique_ptr[column] col + with nogil: + col = move( + libcudf_transform.compute_column( + tbl, + dereference(cudf_expr.c_obj.get()) + ) + ) + return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py new file mode 100644 index 00000000000..bc587d4e1e2 --- /dev/null +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -0,0 +1,222 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import ast +import functools +from typing import List, Tuple + +from cudf._lib.expressions import ( + ASTOperator, + ColumnReference, + Expression, + Literal, + Operation, +) + +# This dictionary encodes the mapping from Python AST operators to their cudf +# counterparts. +python_cudf_operator_map = { + # Binary operators + ast.Add: ASTOperator.ADD, + ast.Sub: ASTOperator.SUB, + ast.Mult: ASTOperator.MUL, + ast.Div: ASTOperator.DIV, + ast.FloorDiv: ASTOperator.FLOOR_DIV, + ast.Mod: ASTOperator.PYMOD, + ast.Pow: ASTOperator.POW, + ast.Eq: ASTOperator.EQUAL, + ast.NotEq: ASTOperator.NOT_EQUAL, + ast.Lt: ASTOperator.LESS, + ast.Gt: ASTOperator.GREATER, + ast.LtE: ASTOperator.LESS_EQUAL, + ast.GtE: ASTOperator.GREATER_EQUAL, + ast.BitXor: ASTOperator.BITWISE_XOR, + # TODO: The mapping of logical/bitwise operators here is inconsistent with + # pandas. In pandas, Both `BitAnd` and `And` map to + # `ASTOperator.LOGICAL_AND` for booleans, while they map to + # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to + # encode this at present because expressions can be arbitrarily nested so + # we won't know the dtype of the input without inserting a much more + # complex traversal of the expression tree to determine the output types at + # each node. For now, we'll rely on users to use the appropriate operator. + ast.BitAnd: ASTOperator.BITWISE_AND, + ast.BitOr: ASTOperator.BITWISE_OR, + ast.And: ASTOperator.LOGICAL_AND, + ast.Or: ASTOperator.LOGICAL_OR, + # Unary operators + ast.Invert: ASTOperator.BIT_INVERT, + ast.Not: ASTOperator.NOT, + # TODO: Missing USub, possibility other unary ops? +} + + +# Mapping between Python function names encode in an ast.Call node and the +# corresponding libcudf C++ AST operators. +python_cudf_function_map = { + # TODO: Operators listed on + # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 + # that we don't support yet: + # expm1, log1p, arctan2 and log10. + "sin": ASTOperator.SIN, + "cos": ASTOperator.COS, + "tan": ASTOperator.TAN, + "arcsin": ASTOperator.ARCSIN, + "arccos": ASTOperator.ARCCOS, + "arctan": ASTOperator.ARCTAN, + "sinh": ASTOperator.SINH, + "cosh": ASTOperator.COSH, + "tanh": ASTOperator.TANH, + "arcsinh": ASTOperator.ARCSINH, + "arccosh": ASTOperator.ARCCOSH, + "arctanh": ASTOperator.ARCTANH, + "exp": ASTOperator.EXP, + "log": ASTOperator.LOG, + "sqrt": ASTOperator.SQRT, + "abs": ASTOperator.ABS, + "ceil": ASTOperator.CEIL, + "floor": ASTOperator.FLOOR, + # TODO: Operators supported by libcudf with no Python function analog. + # ast.rint: ASTOperator.RINT, + # ast.cbrt: ASTOperator.CBRT, +} + + +class libcudfASTVisitor(ast.NodeVisitor): + """A NodeVisitor specialized for constructing a libcudf expression tree. 
+ + This visitor is designed to handle AST nodes that have libcudf equivalents. + It constructs column references from names and literals from constants, + then builds up operations. The final result can be accessed using the + `expression` property. The visitor must be kept in scope for as long as the + expression is needed because all of the underlying libcudf expressions will + be destroyed when the libcudfASTVisitor is. + + Parameters + ---------- + col_names : Tuple[str] + The column names used to map the names in an expression. + """ + + def __init__(self, col_names: Tuple[str]): + self.stack: List[Expression] = [] + self.nodes: List[Expression] = [] + self.col_names = col_names + + @property + def expression(self): + """Expression: The result of parsing an AST.""" + assert len(self.stack) == 1 + return self.stack[-1] + + def visit_Name(self, node): + try: + col_id = self.col_names.index(node.id) + except ValueError: + raise ValueError(f"Unknown column name {node.id}") + self.stack.append(ColumnReference(col_id)) + + def visit_Constant(self, node): + if not isinstance(node, ast.Num): + raise ValueError( + f"Unsupported literal {repr(node.value)} of type " + "{type(node.value).__name__}" + ) + self.stack.append(Literal(node.value)) + + def visit_UnaryOp(self, node): + self.visit(node.operand) + self.nodes.append(self.stack.pop()) + if isinstance(node.op, ast.USub): + # TODO: Except for leaf nodes, we won't know the type of the + # operand, so there's no way to know whether this should be a float + # or an int. We should maybe see what Spark does, and this will + # probably require casting. + self.nodes.append(Literal(-1)) + op = ASTOperator.MUL + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + elif isinstance(node.op, ast.UAdd): + self.stack.append(self.nodes[-1]) + else: + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1])) + + def visit_BinOp(self, node): + self.visit(node.left) + self.visit(node.right) + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): + # Helper function handling the common components of parsing BoolOp and + # Compare AST nodes. These two types of nodes both support chaining + # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this + # function helps standardize that. + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. + inner_ops = [] + for op, (left, right) in zip(operators, operands): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. We could potentially optimize by caching + # expressions by name so that we only construct them once. + self.visit(left) + self.visit(right) + + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(op)] + inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + self.nodes.extend(inner_ops) + + # If we have more than one comparator, we need to link them + # together with LOGICAL_AND operators. 
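Concretely, a chained comparison decomposes into pairwise comparisons that are then AND-reduced; the plain-Python sketch below mirrors the `functools.reduce` in the code that follows (strings stand in for the Operation nodes):

    import functools

    # "a < b < c" decomposes into pairwise comparisons, then AND-reduces.
    inner_ops = ["a<b", "b<c"]
    combined = functools.reduce(lambda l, r: f"({l} AND {r})", inner_ops)
    assert combined == "(a<b AND b<c)"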
+ if has_multiple_ops: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + self.nodes.append(Operation(op, left, right)) + return self.nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + self.stack.append(self.nodes[-1]) + + def visit_BoolOp(self, node): + operators = [node.op] * (len(node.values) - 1) + operands = zip(node.values[:-1], node.values[1:]) + self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) + + def visit_Compare(self, node): + operands = (node.left, *node.comparators) + has_multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) + self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) + + def visit_Call(self, node): + try: + op = python_cudf_function_map[node.func.id] + except KeyError: + raise ValueError(f"Unsupported function {node.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(node.args) != 1 or node.keywords: + raise ValueError( + f"Function {node.func} only accepts one positional " + "argument." + ) + self.visit(node.args[0]) + + self.nodes.append(self.stack.pop()) + self.stack.append(Operation(op, self.nodes[-1])) + + +@functools.lru_cache(256) +def parse_expression(expr: str, col_names: Tuple[str]): + visitor = libcudfASTVisitor(col_names) + visitor.visit(ast.parse(expr)) + return visitor diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7b4b81630bd..0d3b3ee0300 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,6 +7,7 @@ import itertools import numbers import pickle +import re import sys import warnings from collections import abc, defaultdict @@ -6253,6 +6254,165 @@ def interleave_columns(self): {None: libcudf.reshape.interleave_columns([*self._columns])} ) + @_cudf_nvtx_annotate + def eval(self, expr: str, inplace: bool = False, **kwargs): + """Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + Not supported. + + Returns + ------- + DataFrame, Series, or None + Series if a single column is returned (the typical use case), + DataFrame if any assignment statements are included in + ``expr``, or None if ``inplace=True``. + + Notes + ----- + Difference from pandas: + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. + + Examples + -------- + >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. 
+ + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + if kwargs: + raise ValueError( + "Keyword arguments other than `inplace` are not supported" + ) + + # Have to use a regex match to avoid capturing "==" + includes_assignment = re.search("[^=]=[^=]", expr) is not None + + # Check if there were multiple statements. Filter out empty lines. + statements = tuple(filter(None, expr.strip().split("\n"))) + if len(statements) > 1 and any( + re.search("[^=]=[^=]", st) is None for st in statements + ): + raise ValueError( + "Multi-line expressions are only valid if all expressions " + "contain an assignment." + ) + + if not includes_assignment: + if inplace: + raise ValueError( + "Cannot operate inplace if there is no assignment" + ) + return Series._from_data( + { + None: libcudf.transform.compute_column( + [*self._columns], self._column_names, statements[0] + ) + } + ) + + targets = [] + exprs = [] + for st in statements: + try: + t, e = re.split("[^=]=[^=]", st) + except ValueError as err: + if "too many values" in str(err): + raise ValueError( + f"Statement {st} contains too many assignments ('=')" + ) + raise + targets.append(t.strip()) + exprs.append(e.strip()) + + cols = ( + libcudf.transform.compute_column( + [*self._columns], self._column_names, e + ) + for e in exprs + ) + ret = self if inplace else self.copy(deep=False) + for name, col in zip(targets, cols): + ret._data[name] = col + if not inplace: + return ret + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index bf5c4ae319b..d95fe278469 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9266,3 +9266,90 @@ def test_empty_numeric_only(data): expected = pdf.prod(numeric_only=True) actual = gdf.prod(numeric_only=True) assert_eq(expected, actual) + + +@pytest.fixture +def df_eval(): + N = 10 + int_max = 10 + rng = cupy.random.default_rng(0) + return cudf.DataFrame( + { + "a": rng.integers(N, size=int_max), + "b": rng.integers(N, size=int_max), + "c": rng.integers(N, size=int_max), + "d": rng.integers(N, size=int_max), + } + ) + + +# Note that for now expressions do not automatically handle casting, so inputs +# need to be casted appropriately +@pytest.mark.parametrize( + "expr, dtype", + [ + ("a", int), + ("+a", int), + ("a + b", int), + ("a == b", int), + ("a / b", float), + ("a * b", int), + ("a > b", int), + ("a > b > c", int), + ("a > b < c", int), + ("a & b", int), + ("a & b | c", int), + ("sin(a)", float), + ("exp(sin(abs(a)))", float), + ("sqrt(floor(a))", float), + ("ceil(arctanh(a))", float), + ("(a + b) - (c * d)", int), + ("~a", int), + ("(a > b) and (c > d)", int), + ("(a > b) or (c > d)", int), + ("not (a > b)", int), + ("a + 1", int), + ("a + 1.0", float), + ("-a + 1", int), + ("+a + 1", int), + ("e = a + 1", int), + ( + """ + e = log(cos(a)) + 1.0 + f = abs(c) - exp(d) + """, + float, + ), + ("a_b_are_equal = (a == b)", int), + ], +) 
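A note on the assignment detection used by `eval` above: the pattern `[^=]=[^=]` matches a bare `=` only when neither neighbor is `=`, which is how `==` is excluded:

    import re

    ASSIGN = re.compile("[^=]=[^=]")
    assert ASSIGN.search("c = a + b") is not None  # bare '=' -> assignment
    assert ASSIGN.search("a == b") is None         # '==' is skipped
    # Caveat: '<=', '>=' and '!=' also contain a '=' whose neighbors are
    # not '=', so this check would treat them as assignments as well.
    assert ASSIGN.search("a <= b") is not None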
+def test_dataframe_eval(df_eval, expr, dtype): + df_eval = df_eval.astype(dtype) + expect = df_eval.to_pandas().eval(expr) + got = df_eval.eval(expr) + # In the specific case where the evaluated expression is a unary function + # of a single column with no nesting, pandas will retain the name. This + # level of compatibility is out of scope for now. + assert_eq(expect, got, check_names=False) + + # Test inplace + if re.search("[^=]=[^=]", expr) is not None: + pdf_eval = df_eval.to_pandas() + pdf_eval.eval(expr, inplace=True) + df_eval.eval(expr, inplace=True) + assert_eq(pdf_eval, df_eval) + + +@pytest.mark.parametrize( + "expr", + [ + """ + e = a + b + a == b + """, + "a_b_are_equal = (a == b) = c", + ], +) +def test_dataframe_eval_errors(df_eval, expr): + with pytest.raises(ValueError): + df_eval.eval(expr) From 20569f6cd9e03f1d7536ac49e6e93ffc99941e98 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 28 Apr 2022 11:38:06 -0400 Subject: [PATCH 07/28] Add `detail::hash_join` (#10695) Closes https://github.com/rapidsai/cudf/issues/10587 This PR adds a `detail::hash_join` class which is templated on the hash function. It also cleans up `join` internal functions by moving code around to proper files. The implementation of `detail::hash_join` is mainly taken from `cudf::hash_join::hash_join_impl`. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10695 --- cpp/include/cudf/detail/join.hpp | 185 +++++++++++ .../cudf/detail/utilities/hash_functions.cuh | 3 - cpp/include/cudf/hashing.hpp | 3 + cpp/include/cudf/join.hpp | 17 +- cpp/src/join/hash_join.cu | 267 +++++++++++----- cpp/src/join/hash_join.cuh | 289 ------------------ cpp/src/join/join.cu | 85 ++---- cpp/src/join/join_common_utils.cuh | 82 ++++- cpp/src/join/join_common_utils.hpp | 13 +- cpp/src/join/join_utils.cu | 2 +- cpp/src/join/mixed_join.cu | 9 +- cpp/src/join/mixed_join_kernel.cuh | 7 +- cpp/src/join/mixed_join_semi.cu | 11 +- cpp/src/join/mixed_join_size_kernel.cuh | 7 +- cpp/src/join/semi_join.cu | 4 +- 15 files changed, 516 insertions(+), 468 deletions(-) create mode 100644 cpp/include/cudf/detail/join.hpp delete mode 100644 cpp/src/join/hash_join.cuh diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp new file mode 100644 index 00000000000..12e4aaa03fd --- /dev/null +++ b/cpp/include/cudf/detail/join.hpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +// Forward declaration +template +class default_allocator; + +namespace cudf { +namespace detail { + +constexpr int DEFAULT_JOIN_CG_SIZE = 2; + +enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; + +/** + * @brief Hash join that builds hash table in creation and probes results in subsequent `*_join` + * member functions. + * + * User-defined hash function can be passed via the template parameter `Hasher` + * + * @tparam Hasher Unary callable type + */ +template +struct hash_join { + public: + using map_type = + cuco::static_multimap>, + cuco::double_hashing>; + + hash_join() = delete; + ~hash_join() = default; + hash_join(hash_join const&) = delete; + hash_join(hash_join&&) = delete; + hash_join& operator=(hash_join const&) = delete; + hash_join& operator=(hash_join&&) = delete; + + private: + bool const _is_empty; ///< true if `_hash_table` is empty + cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal + cudf::table_view _build; ///< input table to build the hash map + cudf::structs::detail::flattened_table + _flattened_build_table; ///< flattened data structures for `_build` + map_type _hash_table; ///< hash table built on `_build` + + public: + /** + * @brief Constructor that internally builds the hash table based on the given `build` table. + * + * @throw cudf::logic_error if the number of columns in `build` table is 0. + * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. + * + * @param build The build table, from which the hash table is built. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ + hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + + /** + * @copydoc cudf::hash_join::inner_join + */ + std::pair>, + std::unique_ptr>> + inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::hash_join::left_join + */ + std::pair>, + std::unique_ptr>> + left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::hash_join::full_join + */ + std::pair>, + std::unique_ptr>> + full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::hash_join::inner_join_size + */ + [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const; + + /** + * @copydoc cudf::hash_join::left_join_size + */ + [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const; + + /** + * @copydoc cudf::hash_join::full_join_size + */ + std::size_t full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + private: + /** + * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, + * and returns the output indices of `build_table` and `probe_table` as a combined table, + * i.e. 
if full join is specified as the join type then left join is called. Behavior + * is undefined if the provided `output_size` is smaller than the actual output size. + * + * @throw cudf::logic_error if build table is empty and `JoinKind == INNER_JOIN`. + * + * @tparam JoinKind The type of join to be performed. + * + * @param probe_table Table of probe side columns to join. + * @param output_size Optional value which allows users to specify the exact output size. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vectors. + * + * @return Join output indices vector pair. + */ + template + std::pair>, + std::unique_ptr>> + probe_join_indices(cudf::table_view const& probe_table, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::detail::hash_join::probe_join_indices + * + * @throw cudf::logic_error if probe table is empty. + * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`. + * @throw cudf::logic_error if the number of columns in build table and probe table do not match. + * @throw cudf::logic_error if the column data types in build table and probe table do not match. + */ + template + std::pair>, + std::unique_ptr>> + compute_hash_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; +}; +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 9c6f3e9cb13..2c5434b63d2 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -32,9 +32,6 @@ #include namespace cudf { - -using hash_value_type = uint32_t; - namespace detail { /** diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index e973c585410..bbff304e547 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -19,6 +19,9 @@ #include namespace cudf { + +using hash_value_type = uint32_t; + /** * @addtogroup column_hash * @{ diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index d56f8f0e904..f48f8a83e9a 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -29,6 +30,16 @@ #include namespace cudf { + +// forward declaration +namespace detail { +template +class MurmurHash3_32; + +template +class hash_join; +} // namespace detail + /** * @addtogroup column_join * @{ @@ -503,6 +514,9 @@ std::unique_ptr cross_join( */ class hash_join { public: + using impl_type = + typename cudf::detail::hash_join>; + hash_join() = delete; ~hash_join(); hash_join(hash_join const&) = delete; @@ -634,8 +648,7 @@ class hash_join { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: - struct hash_join_impl; - const std::unique_ptr impl; + const std::unique_ptr _impl; }; /** diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 8d2888fd761..3e0e76de708 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include +#include "join_common_utils.cuh" #include #include +#include +#include #include +#include #include #include @@ -38,13 +41,67 @@ namespace cudf { namespace detail { - -std::pair<std::unique_ptr<table>, std::unique_ptr<table>
> get_empty_joined_table( - table_view const& probe, table_view const& build) +namespace { +/** + * @brief Calculates the exact size of the join output produced when + * joining two tables together. + * + * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN + * + * @tparam JoinKind The type of join to be performed + * + * @param build_table The right hand table + * @param probe_table The left hand table + * @param hash_table A hash table built on the build table that maps the index + * of every row to the hash value of that row. + * @param nulls_equal Flag to denote nulls are equal or not. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The exact size of the output of the join operation + */ +template <join_kind JoinKind> +std::size_t compute_join_output_size(table_device_view build_table, + table_device_view probe_table, + cudf::detail::multimap_type const& hash_table, + bool const has_nulls, + cudf::null_equality const nulls_equal, + rmm::cuda_stream_view stream) { - std::unique_ptr<table>
empty_probe = empty_like(probe); - std::unique_ptr<table>
empty_build = empty_like(build); - return std::pair(std::move(empty_probe), std::move(empty_build)); + const size_type build_table_num_rows{build_table.num_rows()}; + const size_type probe_table_num_rows{probe_table.num_rows()}; + + // If the build table is empty, we know exactly how large the output + // will be for the different types of joins and can return immediately + if (0 == build_table_num_rows) { + switch (JoinKind) { + // Inner join with an empty table will have no output + case join_kind::INNER_JOIN: return 0; + + // Left join with an empty table will have an output of NULL rows + // equal to the number of rows in the probe table + case join_kind::LEFT_JOIN: return probe_table_num_rows; + + default: CUDF_FAIL("Unsupported join type"); + } + } + + auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; + pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; + + row_hash hash_probe{probe_nulls, probe_table}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + std::size_t size; + if constexpr (JoinKind == join_kind::LEFT_JOIN) { + size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); + } else { + size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); + } + + return size; } /** @@ -69,7 +126,7 @@ std::pair>, std::unique_ptr>> probe_join_hash_table(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const& hash_table, + cudf::detail::multimap_type const& hash_table, bool has_nulls, null_equality compare_nulls, std::optional output_size, @@ -145,7 +202,7 @@ probe_join_hash_table(cudf::table_device_view build_table, */ std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const& hash_table, + cudf::detail::multimap_type const& hash_table, bool const has_nulls, null_equality const compare_nulls, rmm::cuda_stream_view stream, @@ -157,8 +214,6 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, // If output size is zero, return immediately if (join_size == 0) { return join_size; } - rmm::device_scalar write_index(0, stream); - auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -221,25 +276,12 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, } return join_size + left_join_complement_size; } +} // namespace -std::unique_ptr combine_table_pair(std::unique_ptr&& left, - std::unique_ptr&& right) -{ - auto joined_cols = left->release(); - auto right_cols = right->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(right_cols.begin()), - std::make_move_iterator(right_cols.end())); - return std::make_unique(std::move(joined_cols)); -} - -} // namespace detail - -hash_join::hash_join_impl::~hash_join_impl() = default; - -hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +template +hash_join::hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), @@ -263,41 +305,45 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& 
build, cudf::detail::build_join_hash_table(_build, _hash_table, _nulls_equal, stream); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } -std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const +template +std::size_t hash_join::inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -320,8 +366,9 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p stream); } -std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const +template +std::size_t hash_join::left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -344,9 +391,10 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr stream); } -std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +template +std::size_t hash_join::full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); @@ -370,13 +418,51 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr mr); } +template +template +std::pair>, + std::unique_ptr>> +hash_join::probe_join_indices(cudf::table_view const& probe_table, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + // Trivial left join case - exit early + if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { + return get_trivial_left_join_indices(probe_table, stream, mr); + } + + CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); + + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); + + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table_ptr, + *probe_table_ptr, + _hash_table, + cudf::has_nulls(probe_table) | cudf::has_nulls(_build), + _nulls_equal, + output_size, + stream, + mr); + + if constexpr (JoinKind == 
cudf::detail::join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + return join_indices; +} + +template template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::compute_hash_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -403,41 +489,64 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, return probe_join_indices(flattened_probe_table, output_size, stream, mr); } +} // namespace detail + +hash_join::~hash_join() = default; + +hash_join::hash_join(cudf::table_view const& build, + null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _impl{std::make_unique(build, compare_nulls, stream)} +{ +} -template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe_table, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - // Trivial left join case - exit early - if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe_table, stream, mr); - } + return _impl->inner_join(probe, output_size, stream, mr); +} - CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); +std::pair>, + std::unique_ptr>> +hash_join::left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->left_join(probe, output_size, stream, mr); +} - auto build_table_ptr = cudf::table_device_view::create(_build, stream); - auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); +std::pair>, + std::unique_ptr>> +hash_join::full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->full_join(probe, output_size, stream, mr); +} - auto join_indices = cudf::detail::probe_join_hash_table( - *build_table_ptr, - *probe_table_ptr, - _hash_table, - cudf::has_nulls(probe_table) | cudf::has_nulls(_build), - _nulls_equal, - output_size, - stream, - mr); +std::size_t hash_join::inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const +{ + return _impl->inner_join_size(probe, stream); +} - if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); - } - return join_indices; +std::size_t hash_join::left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const +{ + return _impl->left_join_size(probe, stream); +} + +std::size_t 
hash_join::full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->full_join_size(probe, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh deleted file mode 100644 index e55de043372..00000000000 --- a/cpp/src/join/hash_join.cuh +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. - * - * @param hash The hash value to potentially remap - * @param sentinel The reserved value - */ -template -constexpr auto remap_sentinel_hash(H hash, S sentinel) -{ - // Arbitrarily choose hash - 1 - return (hash == sentinel) ? (hash - 1) : hash; -} - -/** - * @brief Device functor to create a pair of hash value and index for a given row. - */ -class make_pair_function { - public: - CUDF_HOST_DEVICE make_pair_function(row_hash const& hash, - hash_value_type const empty_key_sentinel) - : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} - { - } - - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // Compute the hash value of row `i` - auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(row_hash_value, i); - } - - private: - row_hash _hash; - hash_value_type const _empty_key_sentinel; -}; - -/** - * @brief Calculates the exact size of the join output produced when - * joining two tables together. - * - * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN - * - * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The type of the hash table - * - * @param build_table The right hand table - * @param probe_table The left hand table - * @param hash_table A hash table built on the build table that maps the index - * of every row to the hash value of that row. - * @param nulls_equal Flag to denote nulls are equal or not. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The exact size of the output of the join operation - */ -template -std::size_t compute_join_output_size(table_device_view build_table, - table_device_view probe_table, - multimap_type const& hash_table, - bool const has_nulls, - cudf::null_equality const nulls_equal, - rmm::cuda_stream_view stream) -{ - const size_type build_table_num_rows{build_table.num_rows()}; - const size_type probe_table_num_rows{probe_table.num_rows()}; - - // If the build table is empty, we know exactly how large the output - // will be for the different types of joins and can return immediately - if (0 == build_table_num_rows) { - switch (JoinKind) { - // Inner join with an empty table will have no output - case join_kind::INNER_JOIN: return 0; - - // Left join with an empty table will have an output of NULL rows - // equal to the number of rows in the probe table - case join_kind::LEFT_JOIN: return probe_table_num_rows; - - default: CUDF_FAIL("Unsupported join type"); - } - } - - auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; - pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; - - row_hash hash_probe{probe_nulls, probe_table}; - auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - - std::size_t size; - if constexpr (JoinKind == join_kind::LEFT_JOIN) { - size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); - } else { - size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); - } - - return size; -} - -std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const& probe, table_view const& build); - -std::unique_ptr combine_table_pair(std::unique_ptr&& left, - std::unique_ptr&& right); - -/** - * @brief Builds the hash table based on the given `build_table`. - * - * @tparam MultimapType The type of the hash table - * - * @param build Table of columns used to build join hash. - * @param hash_table Build hash table. - * @param nulls_equal Flag to denote nulls are equal or not. - * @param stream CUDA stream used for device memory operations and kernel launches. - * - */ -template -void build_join_hash_table(cudf::table_view const& build, - MultimapType& hash_table, - null_equality const nulls_equal, - rmm::cuda_stream_view stream) -{ - auto build_table_ptr = cudf::table_device_view::create(build, stream); - - CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); - CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); - - row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr}; - auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); - make_pair_function pair_func{hash_build, empty_key_sentinel}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - - size_type const build_table_num_rows{build_table_ptr->num_rows()}; - if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { - hash_table.insert(iter, iter + build_table_num_rows, stream.value()); - } else { - thrust::counting_iterator stencil(0); - auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows - hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); - } -} -} // namespace detail - -struct hash_join::hash_join_impl { - public: - hash_join_impl() = delete; - ~hash_join_impl(); - hash_join_impl(hash_join_impl const&) = delete; - hash_join_impl(hash_join_impl&&) = delete; - hash_join_impl& operator=(hash_join_impl const&) = delete; - hash_join_impl& operator=(hash_join_impl&&) = delete; - - private: - bool const _is_empty; - cudf::null_equality const _nulls_equal; - cudf::table_view _build; - std::vector> _created_null_columns; - cudf::structs::detail::flattened_table _flattened_build_table; - cudf::detail::multimap_type _hash_table; - - public: - /** - * @brief Constructor that internally builds the hash table based on the given `build` table - * - * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. - * - * @param build The build table, from which the hash table is built. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - hash_join_impl(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - std::pair>, - std::unique_ptr>> - inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair>, - std::unique_ptr>> - left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair>, - std::unique_ptr>> - full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const; - - [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const; - - std::size_t full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - private: - template - std::pair>, - std::unique_ptr>> - compute_hash_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - /** - * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, - * and returns the output indices of `build_table` and `probe_table` as a combined table, - * i.e. if full join is specified as the join type then left join is called. Behavior - * is undefined if the provided `output_size` is smaller than the actual output size. - * - * @throw cudf::logic_error if hash table is null. - * - * @tparam JoinKind The type of join to be performed. - * - * @param probe_table Table of probe side columns to join. - * @param output_size Optional value which allows users to specify the exact output size. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned vectors. - * - * @return Join output indices vector pair. - */ - template - std::pair>, - std::unique_ptr>> - probe_join_indices(cudf::table_view const& probe_table, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; -}; - -} // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 15aed83b641..5c529c88d9d 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "join/hash_join.cuh" -#include "join/join_common_utils.hpp" +#include "join_common_utils.hpp" #include #include @@ -26,6 +25,26 @@ namespace cudf { namespace detail { +namespace { +std::pair, std::unique_ptr
> get_empty_joined_table( + table_view const& probe, table_view const& build) +{ + std::unique_ptr<table> empty_probe = empty_like(probe); + std::unique_ptr<table> empty_build = empty_like(build); + return std::pair(std::move(empty_probe), std::move(empty_build)); +} + +std::unique_ptr<table> combine_table_pair(std::unique_ptr<table>&& left, + std::unique_ptr<table>&& right) +{ + auto joined_cols = left->release(); + auto right_cols = right->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique<table>(std::move(joined_cols)); +} +} // namespace std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<rmm::device_uvector<size_type>>> @@ -222,69 +241,8 @@ std::unique_ptr<table>
full_join(table_view const& left_input, mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } - } // namespace detail -hash_join::~hash_join() = default; - -hash_join::hash_join(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : impl{std::make_unique(build, compare_nulls, stream)} -{ -} - -std::pair>, - std::unique_ptr>> -hash_join::inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->inner_join(probe, output_size, stream, mr); -} - -std::pair>, - std::unique_ptr>> -hash_join::left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->left_join(probe, output_size, stream, mr); -} - -std::pair>, - std::unique_ptr>> -hash_join::full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join(probe, output_size, stream, mr); -} - -std::size_t hash_join::inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const -{ - return impl->inner_join_size(probe, stream); -} - -std::size_t hash_join::left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const -{ - return impl->left_join_size(probe, stream); -} - -std::size_t hash_join::full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join_size(probe, stream, mr); -} - -// external APIs - std::pair>, std::unique_ptr>> inner_join(table_view const& left, @@ -353,5 +311,4 @@ std::unique_ptr
full_join(table_view const& left, return detail::full_join( left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } - } // namespace cudf diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index b778f13b5e1..fdb63419c84 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once -#include +#include "join_common_utils.hpp" +#include +#include #include #include @@ -26,6 +28,41 @@ namespace cudf { namespace detail { +/** + * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. + * + * @param hash The hash value to potentially remap + * @param sentinel The reserved value + */ +template +constexpr auto remap_sentinel_hash(H hash, S sentinel) +{ + // Arbitrarily choose hash - 1 + return (hash == sentinel) ? (hash - 1) : hash; +} + +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +class make_pair_function { + public: + CUDF_HOST_DEVICE make_pair_function(row_hash const& hash, + hash_value_type const empty_key_sentinel) + : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} + { + } + + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // Compute the hash value of row `i` + auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); + return cuco::make_pair(row_hash_value, i); + } + + private: + row_hash _hash; + hash_value_type const _empty_key_sentinel; +}; /** * @brief Device functor to determine if a row is valid. @@ -98,6 +135,47 @@ get_trivial_left_join_indices( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Builds the hash table based on the given `build_table`. + * + * @tparam MultimapType The type of the hash table + * + * @param build Table of columns used to build join hash. + * @param hash_table Build hash table. + * @param nulls_equal Flag to denote nulls are equal or not. + * @param stream CUDA stream used for device memory operations and kernel launches. 
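+ * @note When `nulls_equal` is EQUAL or the build table has no nulls, every row
+ *       is inserted; otherwise a row-validity bitmask is computed and only the
+ *       valid rows are inserted via `insert_if` (see the body below).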
+ * + */ +template +void build_join_hash_table(cudf::table_view const& build, + MultimapType& hash_table, + null_equality const nulls_equal, + rmm::cuda_stream_view stream) +{ + auto build_table_ptr = cudf::table_device_view::create(build, stream); + + CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); + CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); + + row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_build, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + size_type const build_table_num_rows{build_table_ptr->num_rows()}; + if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { + hash_table.insert(iter, iter + build_table_num_rows, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); + } +} + // Convenient alias for a pair of unique pointers to device uvectors. using VectorPair = std::pair>, std::unique_ptr>>; diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 526c22d1d5c..060e8bff6f8 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -15,8 +15,10 @@ */ #pragma once +#include #include #include +#include #include #include @@ -34,7 +36,6 @@ namespace cudf { namespace detail { constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; -constexpr int DEFAULT_JOIN_CG_SIZE = 2; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = std::numeric_limits::min(); @@ -45,12 +46,7 @@ using hash_type = cuco::detail::MurmurHash3_32; using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using multimap_type = - cuco::static_multimap>; +using multimap_type = cudf::hash_join::impl_type::map_type; // Multimap type used for mixed joins. TODO: This is a temporary alias used // until the mixed joins are converted to using CGs properly. Right now it's @@ -68,9 +64,6 @@ using row_hash = cudf::row_hasher; using row_equality = cudf::row_equality_comparator; -enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; - bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 1eb2d4cf4a7..7fa6642b19f 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "join_common_utils.cuh" #include diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index b540c013f47..27ee77e3edd 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -14,6 +14,10 @@ * limitations under the License. 
*/ +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_kernels.cuh" + #include #include #include @@ -23,12 +27,9 @@ #include #include #include -#include -#include -#include -#include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index f7081cc4d63..38955ef4667 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -16,10 +16,9 @@ #pragma once -#include -#include -#include -#include +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 60cc74991ef..13a1f1a0ce2 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -14,8 +14,14 @@ * limitations under the License. */ +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_kernels_semi.cuh" + #include #include +#include +#include #include #include #include @@ -23,12 +29,9 @@ #include #include #include -#include -#include -#include -#include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 9eedc1a8015..ce70f7f18ee 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -14,10 +14,9 @@ * limitations under the License. */ -#include -#include -#include -#include +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9e1aa27a4e7..687e553fefd 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -45,7 +45,7 @@ namespace { /** * @brief Device functor to create a pair of hash value and index for a given row. */ -struct make_pair_function { +struct make_pair_fn { __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept { // The value is irrelevant since we only ever use the hash map to check for @@ -101,7 +101,7 @@ std::unique_ptr> left_semi_anti_join( auto const right_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(right_flattened_keys)}; row_hash const hash_build{right_nulls, *right_rows_d}; row_equality equality_build{right_nulls, *right_rows_d, *right_rows_d, compare_nulls}; - make_pair_function pair_func_build{}; + make_pair_fn pair_func_build{}; auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); From 9ac24773d186c22ffbacbe31d92dad60ed2cdb5f Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 28 Apr 2022 21:52:46 +0530 Subject: [PATCH 08/28] Implement all methods of groupby rank aggregation in libcudf, python (#9569) Addresses part of https://github.com/rapidsai/cudf/issues/3591 - [x] move RANK (min method), DENSE_RANK (dense method) into single RANK aggregation - [x] max method - [x] average method - [x] first method - [x] percentage - [x] order, null order RANK, DENSE_RANK was implemented for spark requirement. Pandas groupby has 3 more methods. `rank(column_view, rank_method)` already has all 5 methods implemented. Current implementation has 2 separate aggregations RANK and DENSE_RANK. This is merged to single RANK with parameters `rank_aggregation(rank_method method, null_policy null_handling, bool percentage)` Groupby.rank support for 3 more methods will be added. 
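As a rough sketch of the unified API (names and defaults are the ones declared
in `cpp/include/cudf/aggregation.hpp` in this PR; the surrounding groupby setup
is omitted):

    // One factory for all five methods, replacing the separate
    // RANK / DENSE_RANK / PERCENT_RANK factories:
    auto dense = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::DENSE);

    // The old PERCENT_RANK becomes MIN rank with ONE_NORMALIZED percentage,
    // i.e. (rank - 1) / (count - 1):
    auto percent = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN,
      cudf::order::ASCENDING,
      cudf::null_policy::EXCLUDE,
      cudf::null_order::AFTER,
      cudf::rank_percentage::ONE_NORMALIZED);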
This PR is also pre-requisite for spearman correlation. Additionally - [x] Cython, Python plumbing - [x] benchmark for groupby rank (all methods) - [x] PERCENT_RANK aggregation is replaced with MIN_0_INDEXED rank_method in RANK aggregation Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - MithunR (https://github.com/mythrocks) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9569 --- cpp/benchmarks/CMakeLists.txt | 12 +- .../groupby/group_rank_benchmark.cu | 109 +++++++ cpp/include/cudf/aggregation.hpp | 186 ++++-------- .../cudf/detail/aggregation/aggregation.hpp | 92 +++--- cpp/include/cudf/detail/scan.hpp | 9 +- cpp/include/cudf/sorting.hpp | 16 +- cpp/src/aggregation/aggregation.cpp | 79 ++--- cpp/src/groupby/groupby.cu | 13 +- cpp/src/groupby/sort/functors.hpp | 3 +- cpp/src/groupby/sort/group_rank_scan.cu | 287 ++++++++++++++---- cpp/src/groupby/sort/group_scan.hpp | 99 ++++-- cpp/src/groupby/sort/scan.cpp | 113 ++++--- cpp/src/reductions/scan/rank_scan.cu | 9 +- cpp/src/reductions/scan/scan.cpp | 22 +- cpp/tests/groupby/rank_scan_tests.cpp | 176 ++++++----- cpp/tests/reductions/list_rank_test.cpp | 63 ++-- cpp/tests/reductions/rank_tests.cpp | 21 +- cpp/tests/reductions/scan_tests.cpp | 1 + java/src/main/native/src/AggregationJni.cpp | 11 +- python/cudf/cudf/_lib/aggregation.pyx | 41 ++- python/cudf/cudf/_lib/cpp/aggregation.pxd | 24 ++ python/cudf/cudf/_lib/cpp/sorting.pxd | 9 +- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/_lib/sort.pxd | 3 - python/cudf/cudf/_lib/sort.pyx | 14 +- python/cudf/cudf/core/groupby/groupby.py | 24 ++ python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 44 +++ 28 files changed, 944 insertions(+), 540 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_rank_benchmark.cu delete mode 100644 python/cudf/cudf/_lib/sort.pxd diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 26bb10da69f..e93b2bf4f25 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -194,10 +194,18 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp) # ################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( - GROUPBY_BENCH groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu - groupby/group_struct.cu groupby/group_no_requests.cu groupby/group_scan.cu + GROUPBY_BENCH + groupby/group_sum.cu + groupby/group_nth.cu + groupby/group_shift.cu + groupby/group_struct.cu + groupby/group_no_requests.cu + groupby/group_scan.cu + groupby/group_rank_benchmark.cu ) +ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu) + # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp) diff --git a/cpp/benchmarks/groupby/group_rank_benchmark.cu b/cpp/benchmarks/groupby/group_rank_benchmark.cu new file mode 100644 index 00000000000..1eeb15debe9 --- /dev/null +++ b/cpp/benchmarks/groupby/group_rank_benchmark.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include +#include +#include + +#include + +template +static void nvbench_groupby_rank(nvbench::state& state, + nvbench::type_list>) +{ + using namespace cudf; + using type = int64_t; + constexpr auto dtype = type_to_id(); + cudf::rmm_pool_raii pool_raii; + + bool const is_sorted = state.get_int64("is_sorted"); + cudf::size_type const column_size = state.get_int64("data_size"); + constexpr int num_groups = 100; + + data_profile profile; + profile.set_null_frequency(std::nullopt); + profile.set_cardinality(0); + profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, num_groups); + + auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile); + + // values to be pre-sorted too for groupby rank + if (is_sorted) source_table = cudf::sort(*source_table); + + table_view keys{{source_table->view().column(0)}}; + column_view order_by{source_table->view().column(1)}; + + auto agg = cudf::make_rank_aggregation(method); + std::vector requests; + requests.emplace_back(groupby::scan_request()); + requests[0].values = order_by; + requests[0].aggregations.push_back(std::move(agg)); + + groupby::groupby gb_obj(keys, null_policy::EXCLUDE, is_sorted ? 
sorted::YES : sorted::NO); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + // groupby scan uses sort implementation + auto result = gb_obj.scan(requests); + }); +} + +enum class rank_method : int32_t {}; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + cudf::rank_method, + [](cudf::rank_method value) { + switch (value) { + case cudf::rank_method::FIRST: return "FIRST"; + case cudf::rank_method::AVERAGE: return "AVERAGE"; + case cudf::rank_method::MIN: return "MIN"; + case cudf::rank_method::MAX: return "MAX"; + case cudf::rank_method::DENSE: return "DENSE"; + default: return "unknown"; + } + }, + [](cudf::rank_method value) { + switch (value) { + case cudf::rank_method::FIRST: return "cudf::rank_method::FIRST"; + case cudf::rank_method::AVERAGE: return "cudf::rank_method::AVERAGE"; + case cudf::rank_method::MIN: return "cudf::rank_method::MIN"; + case cudf::rank_method::MAX: return "cudf::rank_method::MAX"; + case cudf::rank_method::DENSE: return "cudf::rank_method::DENSE"; + default: return "unknown"; + } + }) + +using methods = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods)) + .set_type_axes_names({"rank_method"}) + .set_name("groupby_rank") + .add_int64_axis("data_size", + { + 1000000, // 1M + 10000000, // 10M + 100000000, // 100M + }) + + .add_int64_axis("is_sorted", {0, 1}); diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 539a7c04106..5c7513a6c99 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -43,6 +43,32 @@ namespace detail { class simple_aggregations_collector; class aggregation_finalizer; } // namespace detail + +/** + * @brief Tie-breaker method to use for ranking the column. + * + * @see cudf::make_rank_aggregation for more details. + * @ingroup column_sort + */ +enum class rank_method : int32_t { + FIRST, ///< stable sort order ranking (no ties) + AVERAGE, ///< mean of first in the group + MIN, ///< min of first in the group + MAX, ///< max of first in the group + DENSE ///< rank always increases by 1 between groups +}; + +/** + * @brief Whether returned rank should be percentage or not and + * mention the type of percentage normalization. + * + */ +enum class rank_percentage : int32_t { + NONE, ///< rank + ZERO_NORMALIZED, ///< rank / count + ONE_NORMALIZED ///< (rank - 1) / (count - 1) +}; + /** * @brief Abstract base class for specifying the desired aggregation in an * `aggregation_request`. @@ -77,9 +103,7 @@ class aggregation { NUNIQUE, ///< count number of unique elements NTH_ELEMENT, ///< get the nth element ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - RANK, ///< get rank of current index - DENSE_RANK, ///< get dense rank of current index - PERCENT_RANK, ///< get percent (i.e. fractional) rank of current index + RANK, ///< get rank of current index COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row @@ -323,9 +347,11 @@ std::unique_ptr make_row_number_aggregation(); /** * @brief Factory to create a RANK aggregation * - * `RANK` returns a non-nullable column of size_type "ranks": the number of rows preceding or - * equal to the current row plus one. As a result, ranks are not unique and gaps will appear in - * the ranking sequence. 
+ * `RANK` returns a column of size_type or double "ranks" (see note 3 below for how the + * data type is determined) for a given rank method and column order. + * If nulls are excluded, the rank will be null for those rows, otherwise a non-nullable column is + * returned. Double precision column is returned only when percentage!=NONE and when rank method is + * average. * * This aggregation only works with "scan" algorithms. The input column into the group or * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. @@ -333,10 +359,12 @@ std::unique_ptr make_row_number_aggregation(); * column containing the ordering columns. * * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `RANK` aggregations will return a fully valid column regardless of null_handling policy - * specified in the scan. - * 3. `RANK` aggregations are not compatible with exclusive scans. + * 1. This method could work faster with the rows that are presorted by the group keys and order_by + * columns. Though groupby object does not require order_by column to be sorted, groupby rank + * scan aggregation does require the order_by column to be sorted if the keys are sorted. + * 2. `RANK` aggregations are not compatible with exclusive scans. + * 3. All rank methods except AVERAGE method and percentage!=NONE returns size_type column. + * For AVERAGE method and percentage!=NONE, the return type is double column. * * @code{.pseudo} * Example: Consider a motor-racing statistics dataset, containing the following columns: @@ -362,123 +390,37 @@ std::unique_ptr make_row_number_aggregation(); * A grouped rank aggregation scan with: * groupby column : venue * input orderby column: time - * Produces the following rank column: - * { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> - * @endcode - */ -template -std::unique_ptr make_rank_aggregation(); - -/** - * @brief Factory to create a DENSE_RANK aggregation - * - * `DENSE_RANK` returns a non-nullable column of size_type "dense ranks": the preceding unique - * value's rank plus one. As a result, ranks are not unique but there are no gaps in the ranking - * sequence (unlike RANK aggregations). - * - * This aggregation only works with "scan" algorithms. The input column into the group or - * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. - * If rows are ordered by more than one column, the orderby input column should be a struct - * column containing the ordering columns. - * - * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `DENSE_RANK` aggregations will return a fully valid column regardless of null_handling - * policy specified in the scan. - * 3. `DENSE_RANK` aggregations are not compatible with exclusive scans. - * - * @code{.pseudo} - * Example: Consider a motor-racing statistics dataset, containing the following columns: - * 1. venue: (STRING) Location of the race event - * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) - * 3. 
time: (INT32) Time taken to complete the circuit - * - * For the following presorted data: + * Produces the following rank column for each methods: + * first: { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5} + * average: { 1, 2, 3.5, 3.5, 5, 1, 2.5, 2.5, 4, 5} + * min: { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} + * max: { 1, 2, 4, 4, 5, 1, 3, 3, 4, 5} + * dense: { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} + * This corresponds to the following grouping and `driver` rows: + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> + * + * min rank for each percentage types: + * NONE: { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5 } + * ZERO_NORMALIZED : { 0.16, 0.33, 0.50, 0.50, 0.83, 0.16, 0.33, 0.33, 0.66, 0.83 } + * ONE_NORMALIZED: { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } + * where count corresponds to the number of rows in the group. @see cudf::rank_percentage * - * [ // venue, driver, time - * { "silverstone", "HAM" ("hamilton"), 15823}, - * { "silverstone", "LEC" ("leclerc"), 15827}, - * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "RIC" ("ricciardo"), 15905}, - * { "monza", "RIC" ("ricciardo"), 12154}, - * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. - * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. - * { "monza", "LEC" ("leclerc"), 12201}, - * { "monza", "PER" ("perez"), 12203} - * ] - * - * A grouped dense rank aggregation scan with: - * groupby column : venue - * input orderby column: time - * Produces the following dense rank column: - * { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> * @endcode - */ -template -std::unique_ptr make_dense_rank_aggregation(); - -/** - * @brief Factory to create a PERCENT_RANK aggregation * - * `PERCENT_RANK` returns a non-nullable column of double precision "fractional" ranks. - * For row index `i`, the percent rank of row `i` is defined as: - * percent_rank = (rank - 1) / (group_row_count - 1) - * where, - * 1. rank is the `RANK` of the row within the group - * 2. group_row_count is the number of rows in the group - * - * This aggregation only works with "scan" algorithms. The input to the grouped or - * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. - * If rows are ordered by more than one column, the orderby input column should be a struct - * column containing the ordering columns. - * - * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `PERCENT_RANK` aggregations will return a fully valid column regardless of null_handling - * policy specified in the scan. - * 3. `PERCENT_RANK` aggregations are not compatible with exclusive scans. - * - * @code{.pseudo} - * Example: Consider a motor-racing statistics dataset, containing the following columns: - * 1. venue: (STRING) Location of the race event - * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) - * 3. 
time: (INT32) Time taken to complete the circuit - * - * For the following presorted data: - * - * [ // venue, driver, time - * { "silverstone", "HAM" ("hamilton"), 15823}, - * { "silverstone", "LEC" ("leclerc"), 15827}, - * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "RIC" ("ricciardo"), 15905}, - * { "monza", "RIC" ("ricciardo"), 12154}, - * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. - * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. - * { "monza", "LEC" ("leclerc"), 12201}, - * { "monza", "PER" ("perez"), 12203} - * ] - * - * A grouped percent rank aggregation scan with: - * groupby column : venue - * input orderby column: time - * Produces the following percent rank column: - * { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } - * - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> - * @endcode + * @param method The ranking method used for tie breaking (same values). + * @param column_order The desired sort order for ranking + * @param null_handling flag to include nulls during ranking. If nulls are not included, + * the corresponding rank will be null. + * @param null_precedence The desired order of null compared to other elements for column + * @param percentage enum to denote the type of conversion of ranks to percentage in range (0,1] */ template -std::unique_ptr make_percent_rank_aggregation(); +std::unique_ptr make_rank_aggregation(rank_method method, + order column_order = order::ASCENDING, + null_policy null_handling = null_policy::EXCLUDE, + null_order null_precedence = null_order::AFTER, + rank_percentage percentage = rank_percentage::NONE); /** * @brief Factory to create a COLLECT_LIST aggregation diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 886151fb9d6..8ca49dd7d5f 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -75,10 +75,6 @@ class simple_aggregations_collector { // Declares the interface for the simple class row_number_aggregation const& agg); virtual std::vector> visit(data_type col_type, class rank_aggregation const& agg); - virtual std::vector> visit(data_type col_type, - class dense_rank_aggregation const& agg); - virtual std::vector> visit( - data_type col_type, class percent_rank_aggregation const& agg); virtual std::vector> visit( data_type col_type, class collect_list_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -127,8 +123,6 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class nth_element_aggregation const& agg); virtual void visit(class row_number_aggregation const& agg); virtual void visit(class rank_aggregation const& agg); - virtual void visit(class dense_rank_aggregation const& agg); - virtual void visit(class percent_rank_aggregation const& agg); virtual void visit(class collect_list_aggregation const& agg); virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); @@ -642,32 +636,42 @@ class rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation, public scan_aggregation { public: - 
rank_aggregation() : aggregation{RANK} {} - - [[nodiscard]] std::unique_ptr clone() const override + rank_aggregation(rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) + : aggregation{RANK}, + _method{method}, + _column_order{column_order}, + _null_handling{null_handling}, + _null_precedence{null_precedence}, + _percentage(percentage) { - return std::make_unique(*this); } - std::vector> get_simple_aggregations( - data_type col_type, simple_aggregations_collector& collector) const override + rank_method const _method; ///< rank method + order const _column_order; ///< order of the column to rank + null_policy const _null_handling; ///< include or exclude nulls in ranks + null_order const _null_precedence; ///< order of nulls in ranks + rank_percentage const _percentage; ///< whether to return percentage ranks + + [[nodiscard]] bool is_equal(aggregation const& _other) const override { - return collector.visit(col_type, *this); + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return _method == other._method and _null_handling == other._null_handling and + _column_order == other._column_order and _null_precedence == other._null_precedence and + _percentage == other._percentage; } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; -/** - * @brief Derived class for specifying a dense rank aggregation - */ -class dense_rank_aggregation final : public rolling_aggregation, - public groupby_scan_aggregation, - public scan_aggregation { - public: - dense_rank_aggregation() : aggregation{DENSE_RANK} {} + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } [[nodiscard]] std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); } std::vector> get_simple_aggregations( data_type col_type, simple_aggregations_collector& collector) const override @@ -675,24 +679,16 @@ class dense_rank_aggregation final : public rolling_aggregation, return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; - -class percent_rank_aggregation final : public rolling_aggregation, - public groupby_scan_aggregation, - public scan_aggregation { - public: - percent_rank_aggregation() : aggregation{PERCENT_RANK} {} - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, simple_aggregations_collector& collector) const override + private: + [[nodiscard]] size_t hash_impl() const { - return collector.visit(col_type, *this); + return std::hash{}(static_cast(_method)) ^ + std::hash{}(static_cast(_column_order)) ^ + std::hash{}(static_cast(_null_handling)) ^ + std::hash{}(static_cast(_null_precedence)) ^ + std::hash{}(static_cast(_percentage)); } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** @@ -1278,19 +1274,7 @@ struct target_type_impl { // Always use size_type accumulator for RANK template struct target_type_impl { - using type = size_type; -}; - -// Always use size_type accumulator for DENSE_RANK -template -struct target_type_impl { - using type = size_type; -}; - -// Always use double for PERCENT_RANK -template -struct target_type_impl { - using type = double; + using type = size_type; // double for percentage=true. 
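+  // (For rank_method::AVERAGE or percentage != NONE the result column is
+  // FLOAT64; that case is resolved at runtime, e.g. by groupby's
+  // empty_column_constructor, rather than through this compile-time trait.)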
}; // Always use list for COLLECT_LIST @@ -1453,10 +1437,6 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::RANK: return f.template operator()(std::forward(args)...); - case aggregation::DENSE_RANK: - return f.template operator()(std::forward(args)...); - case aggregation::PERCENT_RANK: - return f.template operator()(std::forward(args)...); case aggregation::COLLECT_LIST: return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index fc829617c2d..13dddd3b0c8 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -103,16 +103,17 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row percent ranks for a column. + * @brief Generate row ONE_NORMALIZED percent ranks for a column. + * Also, knowns as ANSI SQL PERCENT RANK. + * Calculated by (rank - 1) / (count - 1). * * @param order_by Input column to generate ranks for. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return rank values. */ -std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr inclusive_one_normalized_percent_rank_scan( + column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index ff334b9ee85..b7e915650dc 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -23,19 +24,6 @@ namespace cudf { -/** - * @brief Tie-breaker method to use for ranking the column. - * - * @ingroup column_sort - */ -enum class rank_method { - FIRST, ///< stable sort order ranking (no ties) - AVERAGE, ///< mean of first in the group - MIN, ///< min of first in the group - MAX, ///< max of first in the group - DENSE ///< rank always increases by 1 between groups -}; - /** * @addtogroup column_sort * @{ @@ -198,7 +186,7 @@ std::unique_ptr
stable_sort_by_key( * included, corresponding rank will be null. * @param null_precedence The desired order of null compared to other elements * for column - * @param percentage flag to convert ranks to percentage in range (0,1} + * @param percentage flag to convert ranks to percentage in range (0,1] * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr A column of containing the rank of the each * element of the column of `input`. The output column type will be `size_type` diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 8fedf641c8f..27732b25401 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -154,18 +154,6 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } -std::vector> simple_aggregations_collector::visit( - data_type col_type, dense_rank_aggregation const& agg) -{ - return visit(col_type, static_cast(agg)); -} - -std::vector> simple_aggregations_collector::visit( - data_type col_type, percent_rank_aggregation const& agg) -{ - return visit(col_type, static_cast(agg)); -} - std::vector> simple_aggregations_collector::visit( data_type col_type, collect_list_aggregation const& agg) { @@ -334,16 +322,6 @@ void aggregation_finalizer::visit(rank_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(dense_rank_aggregation const& agg) -{ - visit(static_cast(agg)); -} - -void aggregation_finalizer::visit(percent_rank_aggregation const& agg) -{ - visit(static_cast(agg)); -} - void aggregation_finalizer::visit(collect_list_aggregation const& agg) { visit(static_cast(agg)); @@ -644,36 +622,33 @@ template std::unique_ptr make_row_number_aggregation -std::unique_ptr make_rank_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_rank_aggregation(); -template std::unique_ptr -make_rank_aggregation(); -template std::unique_ptr make_rank_aggregation(); - -/// Factory to create a DENSE_RANK aggregation -template -std::unique_ptr make_dense_rank_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_dense_rank_aggregation(); -template std::unique_ptr -make_dense_rank_aggregation(); -template std::unique_ptr make_dense_rank_aggregation(); - -/// Factory to create a PERCENT_RANK aggregation -template -std::unique_ptr make_percent_rank_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_percent_rank_aggregation(); -template std::unique_ptr -make_percent_rank_aggregation(); -template std::unique_ptr make_percent_rank_aggregation(); +std::unique_ptr make_rank_aggregation(rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) +{ + return std::make_unique( + method, column_order, null_handling, null_precedence, percentage); +} +template std::unique_ptr make_rank_aggregation( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); +template std::unique_ptr make_rank_aggregation( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); +template std::unique_ptr make_rank_aggregation( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); /// Factory to create a COLLECT_LIST aggregation template diff --git 
a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 79882239b38..a002b0bb744 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -101,9 +101,12 @@ namespace { * Adds special handling for COLLECT_LIST/COLLECT_SET, because: * 1. `make_empty_column()` does not support construction of nested columns. * 2. Empty lists need empty child columns, to persist type information. + * Adds special handling for RANK, because it needs to return double type column when rank_method is + * AVERAGE or percentage is true. */ struct empty_column_constructor { column_view values; + aggregation const& agg; template std::unique_ptr operator()() const @@ -116,6 +119,14 @@ struct empty_column_constructor { 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } + if constexpr (k == aggregation::Kind::RANK) { + auto const& rank_agg = dynamic_cast(agg); + if (rank_agg._method == cudf::rank_method::AVERAGE or + rank_agg._percentage != rank_percentage::NONE) + return make_empty_column(type_to_id()); + return make_empty_column(target_type(values.type(), k)); + } + // If `values` is LIST typed, and the aggregation results match the type, // construct empty results based on `values`. // Most generally, this applies if input type matches output type. @@ -148,7 +159,7 @@ auto empty_results(host_span requests) std::back_inserter(results), [&request](auto const& agg) { return cudf::detail::dispatch_type_and_aggregation( - request.values.type(), agg->kind, empty_column_constructor{request.values}); + request.values.type(), agg->kind, empty_column_constructor{request.values, *agg}); }); return aggregation_result{std::move(results)}; diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index fa3d19bdcfd..748e34a583d 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,7 +87,6 @@ struct store_result_functor { */ column_view get_sorted_values() { - if (is_presorted()) { return values; } return sorted_values ? sorted_values->view() : (sorted_values = helper.sorted_values(values, stream))->view(); }; diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 77d68edaa3a..0b25ab9a33d 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include #include @@ -35,23 +37,59 @@ namespace cudf { namespace groupby { namespace detail { namespace { + +/** + * @brief Functor to compare two rows of a table in given permutation order + * This is useful to identify unique elements in a sorted order table, when the permutation order is + * the sorted order of the table. + * + */ +template +struct permuted_comparator { + /** + * @brief comparator object which compares two rows of the table in given permutation order + * + * @param device_table Device table to compare + * @param permutation The permutation order, integer type column. 
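+ *        (`permutation[i]` is the pre-sort row index that sits at sorted
+ *        position `i`, so comparing two positions compares those rows)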
+ * @param has_nulls whether the table has nulls + */ + permuted_comparator(table_device_view device_table, Iterator const permutation, bool has_nulls) + : comparator(nullate::DYNAMIC{has_nulls}, device_table, device_table, null_equality::EQUAL), + permutation(permutation) + { + } + __device__ bool operator()(size_type index1, size_type index2) const + { + return comparator(permutation[index1], permutation[index2]); + }; + + private: + row_equality_comparator comparator; + Iterator const permutation; +}; + /** * @brief generate grouped row ranks or dense ranks using a row comparison then scan the results * + * @tparam forward true if the rank scan computation should use forward iterator traversal (default) + * else reverse iterator traversal * @tparam value_resolver flag value resolver function with boolean first and row number arguments * @tparam scan_operator scan function ran on the flag values - * @param order_by input column to generate ranks for + * @param grouped_values input column to generate ranks for + * @param value_order column of type INT32 that contains the order of the values in the + * grouped_values column * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param resolver flag value resolver * @param scan_op scan operation ran on the flag results - * @param has_nulls true if nulls are included in the `order_by` column + * @param has_nulls true if nulls are included in the `grouped_values` column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr rank values */ -template -std::unique_ptr rank_generator(column_view const& order_by, +template +std::unique_ptr rank_generator(column_view const& grouped_values, + column_view const& value_order, device_span group_labels, device_span group_offsets, value_resolver resolver, @@ -61,10 +99,11 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::mr::device_memory_resource* mr) { auto const flattened = cudf::structs::detail::flatten_nested_columns( - table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + table_view{{grouped_values}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); auto const d_flat_order = table_device_view::create(flattened, stream); - row_equality_comparator comparator( - nullate::DYNAMIC{has_nulls}, *d_flat_order, *d_flat_order, null_equality::EQUAL); + auto sorted_index_order = value_order.begin(); + auto comparator = permuted_comparator(*d_flat_order, sorted_index_order, has_nulls); + auto ranks = make_fixed_width_column(data_type{type_to_id()}, flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, @@ -72,100 +111,218 @@ std::unique_ptr rank_generator(column_view const& order_by, mr); auto mutable_ranks = ranks->mutable_view(); - thrust::tabulate( - rmm::exec_policy(stream), - mutable_ranks.begin(), - mutable_ranks.end(), - [comparator, resolver, labels = group_labels.data(), offsets = group_offsets.data()] __device__( - size_type row_index) { - auto group_start = offsets[labels[row_index]]; + auto unique_identifier = [labels = group_labels.begin(), + offsets = group_offsets.begin(), + comparator, + resolver] __device__(size_type row_index) { + auto const group_start = offsets[labels[row_index]]; + if constexpr (forward) { + // First value of equal values is 1. 
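+      // For the MIN-rank resolver this yields row_index_in_group + 1 at the
+      // first element of each run of equal values and 0 inside the run; the
+      // inclusive scan below (DeviceMax for MIN rank) then carries each run's
+      // starting rank forward through the group.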
return resolver(row_index == group_start || !comparator(row_index, row_index - 1), row_index - group_start); - }); + } else { + auto const group_end = offsets[labels[row_index] + 1]; + // Last value of equal values is 1. + return resolver(row_index + 1 == group_end || !comparator(row_index, row_index + 1), + row_index - group_start); + } + }; + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + unique_identifier); + auto [group_labels_begin, mutable_rank_begin] = [&]() { + if constexpr (forward) { + return thrust::pair{group_labels.begin(), mutable_ranks.begin()}; + } else { + return thrust::pair{thrust::reverse_iterator(group_labels.end()), + thrust::reverse_iterator(mutable_ranks.end())}; + } + }(); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - mutable_ranks.begin(), - mutable_ranks.begin(), + group_labels_begin, + group_labels_begin + group_labels.size(), + mutable_rank_begin, + mutable_rank_begin, thrust::equal_to{}, scan_op); - return ranks; } } // namespace -std::unique_ptr rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr min_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return rank_generator( - order_by, + return rank_generator( + grouped_values, + value_order, group_labels, group_offsets, [] __device__(bool unequal, auto row_index_in_group) { return unequal ? row_index_in_group + 1 : 0; }, DeviceMax{}, - has_nested_nulls(table_view{{order_by}}), + has_nested_nulls(table_view{{grouped_values}}), stream, mr); } -std::unique_ptr dense_rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr max_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return rank_generator( - order_by, + return rank_generator( + grouped_values, + value_order, group_labels, group_offsets, - [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, - DeviceSum{}, - has_nested_nulls(table_view{{order_by}}), + [] __device__(bool unequal, auto row_index_in_group) { + return unequal ? 
row_index_in_group + 1 : std::numeric_limits::max(); + }, + DeviceMin{}, + has_nested_nulls(table_view{{grouped_values}}), stream, mr); } -std::unique_ptr percent_rank_scan(column_view const& order_by, +std::unique_ptr first_rank_scan(column_view const& grouped_values, + column_view const&, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + auto mutable_ranks = ranks->mutable_view(); + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [labels = group_labels.begin(), + offsets = group_offsets.begin()] __device__(size_type row_index) { + auto group_start = offsets[labels[row_index]]; + return row_index - group_start + 1; + }); + return ranks; +} + +std::unique_ptr average_rank_scan(column_view const& grouped_values, + column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const rank_column = rank_scan( - order_by, group_labels, group_offsets, stream, rmm::mr::get_current_device_resource()); - auto const rank_view = rank_column->view(); - auto const group_size_iter = cudf::detail::make_counting_transform_iterator( - 0, - [labels = group_labels.begin(), - offsets = group_offsets.begin()] __device__(size_type row_index) { - auto const group_label = labels[row_index]; - auto const group_start = offsets[group_label]; - auto const group_end = offsets[group_label + 1]; - return group_end - group_start; - }); - - // Result type for PERCENT_RANK is independent of input type. - using result_type = cudf::detail::target_type_t; - - auto percent_rank_result = cudf::make_fixed_width_column( - data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); - + auto max_rank = max_rank_scan(grouped_values, + value_order, + group_labels, + group_offsets, + stream, + rmm::mr::get_current_device_resource()); + auto min_rank = min_rank_scan(grouped_values, + value_order, + group_labels, + group_offsets, + stream, + rmm::mr::get_current_device_resource()); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + auto mutable_ranks = ranks->mutable_view(); thrust::transform(rmm::exec_policy(stream), - rank_view.begin(), - rank_view.end(), - group_size_iter, - percent_rank_result->mutable_view().begin(), - [] __device__(auto const rank, auto const group_size) { - return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + max_rank->view().begin(), + max_rank->view().end(), + min_rank->view().begin(), + mutable_ranks.begin(), + [] __device__(auto max_rank, auto min_rank) -> double { + return min_rank + (max_rank - min_rank) / 2.0; }); + return ranks; +} - return percent_rank_result; +std::unique_ptr dense_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return rank_generator( + grouped_values, + value_order, + group_labels, + group_offsets, + [] __device__(bool const unequal, size_type const) { return unequal ? 
1 : 0; }, + DeviceSum{}, + has_nested_nulls(table_view{{grouped_values}}), + stream, + mr); +} + +std::unique_ptr group_rank_to_percentage(rank_method const method, + rank_percentage const percentage, + column_view const& rank, + column_view const& count, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(percentage != rank_percentage::NONE, "Percentage cannot be NONE"); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + ranks->set_null_mask(copy_bitmask(rank, stream, mr)); + auto mutable_ranks = ranks->mutable_view(); + + auto one_normalized = [] __device__(auto const rank, auto const group_size) { + return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + }; + if (method == rank_method::DENSE) { + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [percentage, + one_normalized, + is_double = rank.type().id() == type_id::FLOAT64, + dcount = count.begin(), + labels = group_labels.begin(), + offsets = group_offsets.begin(), + d_rank = rank.begin(), + s_rank = rank.begin()] __device__(size_type row_index) -> double { + double const r = is_double ? d_rank[row_index] : s_rank[row_index]; + auto const count = dcount[labels[row_index]]; + size_type const last_rank_index = offsets[labels[row_index]] + count - 1; + auto const last_rank = s_rank[last_rank_index]; + return percentage == rank_percentage::ZERO_NORMALIZED + ? r / last_rank + : one_normalized(r, last_rank); + }); + } else { + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [percentage, + one_normalized, + is_double = rank.type().id() == type_id::FLOAT64, + dcount = count.begin(), + labels = group_labels.begin(), + d_rank = rank.begin(), + s_rank = rank.begin()] __device__(size_type row_index) -> double { + double const r = is_double ? d_rank[row_index] : s_rank[row_index]; + auto const count = dcount[labels[row_index]]; + return percentage == rank_percentage::ZERO_NORMALIZED + ? 
r / count + : one_normalized(r, count); + }); + } + return ranks; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index 76a7f3f73c7..dc0eb691748 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -85,52 +85,115 @@ std::unique_ptr count_scan(device_span group_labels, rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise rank value + * @brief Internal API to calculate groupwise min rank value * - * @param order_by column or struct column that rows within a group are sorted by + * @param grouped_values column or struct column that rows within a group are sorted by + * @param value_order column of type INT32 that contains the order of the values in the + * grouped_values column * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of type size_type of rank values */ -std::unique_ptr rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr min_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise max rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr max_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise first rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr first_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise average rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr average_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise dense rank value * - * @param order_by column or struct column that rows within a group are sorted by + * @param grouped_values column or struct column that rows within a group are sorted by * @param group_labels ID of group that 
the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of type size_type of dense rank values */ -std::unique_ptr dense_rank_scan(column_view const& order_by, +std::unique_ptr dense_rank_scan(column_view const& grouped_values, + column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise percent rank value + * @brief Convert groupwise rank to groupwise percentage rank * - * @param order_by column or struct column by which the rows within a group are sorted - * @param group_labels ID of group to which the row belongs + * @param method rank method + * @param percentage enum to denote the type of conversion ranks to percentage in range (0,1] + * @param rank Groupwise rank column + * @param count Groupwise count column + * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of type `double` of percent rank values + * @return Column of type double of rank values + */ -std::unique_ptr percent_rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr group_rank_to_percentage(rank_method const method, + rank_percentage const percentage, + column_view const& rank, + column_view const& count, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 20edc1b3f50..5d345273782 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -16,14 +16,20 @@ #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include #include +#include #include #include #include @@ -115,51 +121,70 @@ template <> void scan_result_functor::operator()(aggregation const& agg) { if (cache.has_result(values, agg)) return; - CUDF_EXPECTS(helper.is_presorted(), - "Rank aggregate in groupby scan requires the keys to be presorted"); - auto const order_by = get_grouped_values(); - CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), - "Unsupported list type in grouped rank scan."); - - cache.add_result( - values, - agg, - detail::rank_scan( - order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); -} - -template <> -void scan_result_functor::operator()(aggregation const& agg) -{ - if (cache.has_result(values, agg)) return; - CUDF_EXPECTS(helper.is_presorted(), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); - auto const order_by = get_grouped_values(); - CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), - "Unsupported list type in grouped dense_rank scan."); - cache.add_result( - values, - agg, - detail::dense_rank_scan( - order_by, helper.group_labels(stream), 
helper.group_offsets(stream), stream, mr)); -} - -template <> -void scan_result_functor::operator()(aggregation const& agg) -{ - if (cache.has_result(values, agg)) return; - CUDF_EXPECTS(helper.is_presorted(), - "Percent rank aggregate in groupby scan requires the keys to be presorted"); - auto const order_by = get_grouped_values(); - CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), - "Unsupported list type in grouped percent_rank scan."); - - cache.add_result( - values, - agg, - detail::percent_rank_scan( - order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(values), + "Unsupported list type in grouped rank scan."); + auto const& rank_agg = dynamic_cast(agg); + auto const& group_labels = helper.group_labels(stream); + auto const group_labels_view = column_view(cudf::device_span(group_labels)); + auto const gather_map = [&]() { + if (is_presorted()) { // assumes both keys and values are sorted, Spark does this. + return cudf::detail::sequence( + group_labels.size(), *cudf::make_fixed_width_scalar(size_type{0}, stream), stream); + } else { + auto sort_order = (rank_agg._method == rank_method::FIRST ? cudf::detail::stable_sorted_order + : cudf::detail::sorted_order); + return sort_order(table_view({group_labels_view, get_grouped_values()}), + {order::ASCENDING, rank_agg._column_order}, + {null_order::AFTER, rank_agg._null_precedence}, + stream, + rmm::mr::get_current_device_resource()); + } + }(); + + auto rank_scan = [&]() { + switch (rank_agg._method) { + case rank_method::FIRST: return detail::first_rank_scan; + case rank_method::AVERAGE: return detail::average_rank_scan; + case rank_method::DENSE: return detail::dense_rank_scan; + case rank_method::MIN: return detail::min_rank_scan; + case rank_method::MAX: return detail::max_rank_scan; + default: CUDF_FAIL("Unsupported rank method in groupby scan"); + } + }(); + auto result = rank_scan(get_grouped_values(), + *gather_map, + helper.group_labels(stream), + helper.group_offsets(stream), + stream, + rmm::mr::get_current_device_resource()); + if (rank_agg._percentage != rank_percentage::NONE) { + auto count = get_grouped_values().nullable() and rank_agg._null_handling == null_policy::EXCLUDE + ? 
detail::group_count_valid(get_grouped_values(), + helper.group_labels(stream), + helper.num_groups(stream), + stream, + rmm::mr::get_current_device_resource()) + : detail::group_count_all(helper.group_offsets(stream), + helper.num_groups(stream), + stream, + rmm::mr::get_current_device_resource()); + result = detail::group_rank_to_percentage(rank_agg._method, + rank_agg._percentage, + *result, + *count, + helper.group_labels(stream), + helper.group_offsets(stream), + stream, + mr); + } + result = std::move(cudf::detail::scatter( + table_view{{*result}}, *gather_map, table_view{{*result}}, false, stream, mr) + ->release()[0]); + if (rank_agg._null_handling == null_policy::EXCLUDE) { + result->set_null_mask(cudf::detail::copy_bitmask(get_grouped_values(), stream, mr)); + } + cache.add_result(values, agg, std::move(result)); } } // namespace detail diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 521f8e2d06f..0ababbf0a3d 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -102,16 +102,15 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, mr); } -std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr inclusive_one_normalized_percent_rank_scan( + column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const rank_column = inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource()); auto const rank_view = rank_column->view(); - // Result type for PERCENT_RANK is independent of input type. - using result_type = cudf::detail::target_type_t; + // Result type for min 0-index percent rank is independent of input type. 
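+  // (The rank itself is computed as size_type; converting it to a percentage
+  // always produces double, whatever the input column's type.)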
+ using result_type = double; auto percent_rank_result = cudf::make_fixed_width_column( data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 52aaad5ddcf..b678b9441a5 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -35,17 +35,17 @@ std::unique_ptr scan(column_view const& input, if (agg->kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, "Rank aggregation operator requires an inclusive scan"); - return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); - } - if (agg->kind == aggregation::DENSE_RANK) { - CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Dense rank aggregation operator requires an inclusive scan"); - return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); - } - if (agg->kind == aggregation::PERCENT_RANK) { - CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Percent rank aggregation operator requires an inclusive scan"); - return inclusive_percent_rank_scan(input, rmm::cuda_stream_default, mr); + auto const& rank_agg = dynamic_cast(*agg); + if (rank_agg._method == rank_method::MIN) { + if (rank_agg._percentage == rank_percentage::NONE) { + return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); + } else if (rank_agg._percentage == rank_percentage::ONE_NORMALIZED) { + return inclusive_one_normalized_percent_rank_scan(input, rmm::cuda_stream_default, mr); + } + } else if (rank_agg._method == rank_method::DENSE) { + return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); + } + CUDF_FAIL("Unsupported rank aggregation method for inclusive scan"); } return inclusive == scan_type::EXCLUSIVE diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 81369beb2ec..d4e8b4cbf0f 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -29,11 +29,9 @@ namespace test { using namespace iterators; template -using input = fixed_width_column_wrapper; -using rank_result_col = fixed_width_column_wrapper; -using percent_result_t = - cudf::detail::target_type_t; -using percent_result_col = fixed_width_column_wrapper; +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_col = fixed_width_column_wrapper; using null_iter_t = decltype(nulls_at({})); auto constexpr X = int32_t{0}; // Placeholder for NULL rows. 
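For readers updating their own tests, the migration from the removed per-kind factories to the unified factory looks roughly like this (a sketch; the `groupby_scan_aggregation` template argument matches the Cython binding later in this series):

    // Old, one factory per rank kind (removed by this change):
    //   make_dense_rank_aggregation();    // dense rank
    //   make_rank_aggregation();          // MIN-method rank
    //   make_percent_rank_aggregation();  // one-normalized percent rank
    // New, single factory: method, ordering, null handling, and percentage
    // conversion are all selected through arguments.
    auto dense   = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE);
    auto minimum = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE);
    auto percent = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, {},
      cudf::rank_percentage::ONE_NORMALIZED);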
@@ -45,27 +43,31 @@ inline void test_rank_scans(column_view const& keys, column_view const& expected_rank, column_view const& expected_percent_rank) { - test_single_scan(keys, - order, - keys, - expected_dense, - make_dense_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); - test_single_scan(keys, - order, - keys, - expected_rank, - make_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); - test_single_scan(keys, - order, - keys, - expected_percent_rank, - make_percent_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_dense, + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE), + null_policy::INCLUDE, + sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_rank, + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE), + null_policy::INCLUDE, + sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_percent_rank, + make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED), + null_policy::INCLUDE, + sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -148,7 +150,7 @@ TYPED_TEST(typed_groupby_rank_scan_test, basic) { using T = TypeParam; - auto const keys = input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const keys = /* */ input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; auto const make_order_by = [&] { return input{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; }; auto const order_by = make_order_by(); auto const order_by_struct = [&] { @@ -244,9 +246,12 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) std::vector requests; requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -288,13 +293,19 @@ TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs) requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); requests[0].values = *nested_structs; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); requests[1].values = *flat_struct; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); - requests[1].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE)); + requests[1].aggregations.push_back( + 
make_rank_aggregation(rank_method::MIN)); + requests[1].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -339,13 +350,19 @@ TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown) requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); requests[0].values = *possibly_null_structs; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); requests[1].values = *definitely_null_structs; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); - requests[1].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[1].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -405,11 +422,11 @@ TYPED_TEST(list_groupby_rank_scan_test, lists) requests.emplace_back(groupby::aggregation_request()); requests.emplace_back(groupby::aggregation_request()); requests[0].values = list_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_rank_aggregation(rank_method::DENSE)); + requests[0].aggregations.push_back(make_rank_aggregation(rank_method::MIN)); requests[1].values = struct_col; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_rank_aggregation(rank_method::DENSE)); + requests[1].aggregations.push_back(make_rank_aggregation(rank_method::MIN)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto result = gb_obj.scan(requests); @@ -484,7 +501,7 @@ TEST(groupby_rank_scan_test, strings) keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } -TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) +TEST_F(groupby_rank_scan_test_failures, DISABLED_test_exception_triggers) { using T = uint32_t; @@ -496,57 +513,60 @@ TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::DENSE), null_policy::INCLUDE, sorted::NO), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - 
col, - make_rank_aggregation(), - null_policy::INCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::MIN), + null_policy::INCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); + + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::YES), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::MIN), null_policy::EXCLUDE, sorted::YES), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::EXCLUDE, - sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::MIN), null_policy::EXCLUDE, sorted::NO), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::EXCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); } } // namespace test diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index d263677f23b..b3a8e7e0c28 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -42,10 +42,11 @@ TEST_F(ListRankScanTest, BasicList) auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9}; - this->test_ungrouped_rank_scan(col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } TEST_F(ListRankScanTest, DeepList) @@ -73,20 +74,22 @@ TEST_F(ListRankScanTest, DeepList) { // Non-sliced auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{ 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 7, 8, 9, 10, 11}; - this->test_ungrouped_rank_scan(col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } { // sliced auto sliced_col = cudf::slice(col, {3, 12})[0]; auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{1, 2, 3, 3, 3, 4, 4, 5, 5}; - this->test_ungrouped_rank_scan(sliced_col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + sliced_col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + 
cudf::null_policy::INCLUDE);
  }
}

@@ -138,10 +141,11 @@ TEST_F(ListRankScanTest, ListOfStruct)
   auto expect = cudf::test::fixed_width_column_wrapper{
     1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10};
-  this->test_ungrouped_rank_scan(list_column,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    list_column,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }
 { // Sliced
   auto sliced_col = cudf::slice(list_column, {3, 15})[0];
   auto expect = cudf::test::fixed_width_column_wrapper{1, 2, 3, 3, 3, 4, 5, 6, 7, 7, 8, 8};
-  this->test_ungrouped_rank_scan(sliced_col,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    sliced_col,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }
}

@@ -192,10 +197,11 @@ TEST_F(ListRankScanTest, ListOfEmptyStruct)
   auto expect = cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6};
-  this->test_ungrouped_rank_scan(*list_column,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    *list_column,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }

 TEST_F(ListRankScanTest, EmptyDeepList)
 {
@@ -221,8 +227,9 @@
   auto expect = cudf::test::fixed_width_column_wrapper{1, 1, 2, 2};
-  this->test_ungrouped_rank_scan(*list_column,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    *list_column,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }
diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp
index fb2cd17fe30..3bf2899ce2f 100644
--- a/cpp/tests/reductions/rank_tests.cpp
+++ b/cpp/tests/reductions/rank_tests.cpp
@@ -36,15 +36,14 @@ namespace cudf::test {
 using namespace iterators;

 template
-using input = fixed_width_column_wrapper;
-using rank_result_col = fixed_width_column_wrapper;
-using percent_result_t =
-  cudf::detail::target_type_t;
-using percent_result_col = fixed_width_column_wrapper;
+using input = fixed_width_column_wrapper;
+using rank_result_col = fixed_width_column_wrapper;
+using percent_result_col = fixed_width_column_wrapper;

-auto const rank = cudf::make_rank_aggregation();
-auto const dense_rank = cudf::make_dense_rank_aggregation();
-auto const percent_rank = cudf::make_percent_rank_aggregation();
+auto const rank = cudf::make_rank_aggregation(cudf::rank_method::MIN);
+auto const dense_rank = cudf::make_rank_aggregation(cudf::rank_method::DENSE);
+auto const percent_rank = cudf::make_rank_aggregation(
+  cudf::rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED);

 auto constexpr INCLUSIVE_SCAN = cudf::scan_type::INCLUSIVE;
 auto constexpr INCLUDE_NULLS = cudf::null_policy::INCLUDE;
@@ -318,11 +319,11 @@ TEST(RankScanTest, ExclusiveScan)
   auto const vals = input{3, 4, 5};
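+  // A rank is defined by the values already seen in the column, so only an
+  // inclusive scan is meaningful; every variant below is expected to reject
+  // scan_type::EXCLUSIVE with the unified error message.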
CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Dense rank aggregation operator requires an inclusive scan"); + "Rank aggregation operator requires an inclusive scan"); CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), "Rank aggregation operator requires an inclusive scan"); CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Percent rank aggregation operator requires an inclusive scan"); + "Rank aggregation operator requires an inclusive scan"); } } // namespace cudf::test diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index d533a91f4d0..68b4d85db2a 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -84,6 +84,7 @@ struct ScanTest : public BaseScanTest { case aggregation::PRODUCT: return std::is_invocable_v; case aggregation::MIN: return std::is_invocable_v; case aggregation::MAX: return std::is_invocable_v; + case aggregation::RANK: return std::is_invocable_v; // comparable default: return false; } return false; diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index f8c448566c8..6ac73282615 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -82,11 +82,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv case 27: // MERGE_M2 return cudf::make_merge_m2_aggregation(); case 28: // RANK - return cudf::make_rank_aggregation(); + return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, + cudf::null_policy::INCLUDE); case 29: // DENSE_RANK - return cudf::make_dense_rank_aggregation(); - case 30: // PERCENT_RANK - return cudf::make_percent_rank_aggregation(); + return cudf::make_rank_aggregation(cudf::rank_method::DENSE, {}, + cudf::null_policy::INCLUDE); + case 30: // ANSI SQL PERCENT_RANK + return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, + {}, cudf::rank_percentage::ONE_NORMALIZED); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4dc91268d57..84dd9c3a576 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -30,7 +30,10 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type +from cudf._lib.cpp.aggregation cimport ( + underlying_type_t_correlation_type, + underlying_type_t_rank_method, +) import cudf @@ -54,6 +57,7 @@ class AggregationKind(Enum): ARGMIN = libcudf_aggregation.aggregation.Kind.ARGMIN NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT + RANK = libcudf_aggregation.aggregation.Kind.RANK COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX @@ -77,6 +81,14 @@ class CorrelationType(IntEnum): ) +class RankMethod(IntEnum): + FIRST = libcudf_aggregation.rank_method.FIRST + AVERAGE = libcudf_aggregation.rank_method.AVERAGE + MIN = libcudf_aggregation.rank_method.MIN + MAX = libcudf_aggregation.rank_method.MAX + DENSE = libcudf_aggregation.rank_method.DENSE + + cdef class 
RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -564,6 +576,33 @@ cdef class GroupbyScanAggregation: cummin = min cummax = max + @classmethod + def rank(cls, method, ascending, na_option, pct): + cdef GroupbyScanAggregation agg = cls() + cdef libcudf_aggregation.rank_method c_method = ( + ( + ( + RankMethod[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation. + make_rank_aggregation[groupby_scan_aggregation]( + c_method, + (libcudf_types.order.ASCENDING if ascending else + libcudf_types.order.DESCENDING), + (libcudf_types.null_policy.EXCLUDE if na_option == "keep" else + libcudf_types.null_policy.INCLUDE), + (libcudf_types.null_order.BEFORE + if (na_option == "top") == ascending else + libcudf_types.null_order.AFTER), + (libcudf_aggregation.rank_percentage.ZERO_NORMALIZED + if pct else + libcudf_aggregation.rank_percentage.NONE) + )) + return agg + cdef class ReduceAggregation: """A Cython wrapper for reduce aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 399deb74c9c..a1d1485e1e8 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -7,11 +8,14 @@ from libcpp.vector cimport vector from cudf._lib.cpp.types cimport ( data_type, interpolation, + null_order, null_policy, + order, size_type, ) ctypedef int32_t underlying_type_t_correlation_type +ctypedef int32_t underlying_type_t_rank_method cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -35,6 +39,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ARGMIN 'cudf::aggregation::ARGMIN' NUNIQUE 'cudf::aggregation::NUNIQUE' NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' + RANK 'cudf::aggregation::RANK' COLLECT 'cudf::aggregation::COLLECT_LIST' COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' @@ -68,6 +73,18 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: KENDALL 'cudf::correlation_type::KENDALL' SPEARMAN 'cudf::correlation_type::SPEARMAN' + ctypedef enum rank_method: + FIRST "cudf::rank_method::FIRST" + AVERAGE "cudf::rank_method::AVERAGE" + MIN "cudf::rank_method::MIN" + MAX "cudf::rank_method::MAX" + DENSE "cudf::rank_method::DENSE" + + ctypedef enum rank_percentage: + NONE "cudf::rank_percentage::NONE" + ZERO_NORMALIZED "cudf::rank_percentage::ZERO_NORMALIZED" + ONE_NORMALIZED "cudf::rank_percentage::ONE_NORMALIZED" + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -127,3 +144,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_covariance_aggregation[T]( size_type min_periods, size_type ddof) except + + + cdef unique_ptr[T] make_rank_aggregation[T]( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) except + diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 243b841ce4b..c6c42c327ac 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -7,20 +7,13 @@ from libcpp.vector cimport vector from cudf._lib.types import cudf_to_np_types, np_to_cudf_types cimport cudf._lib.cpp.types as libcudf_types +from 
cudf._lib.cpp.aggregation cimport rank_method from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: - ctypedef enum rank_method: - FIRST "cudf::rank_method::FIRST" - AVERAGE "cudf::rank_method::AVERAGE" - MIN "cudf::rank_method::MIN" - MAX "cudf::rank_method::MAX" - DENSE "cudf::rank_method::DENSE" - cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: cdef unique_ptr[column] sorted_order( table_view source_table, diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 48f566b846d..be5bb2741b4 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -341,7 +341,7 @@ cdef class GroupBy: return columns_from_unique_ptr(move(c_result.second)) -_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} +_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax", "rank"} def _is_all_scan_aggregate(all_aggs): diff --git a/python/cudf/cudf/_lib/sort.pxd b/python/cudf/cudf/_lib/sort.pxd deleted file mode 100644 index d7488889555..00000000000 --- a/python/cudf/cudf/_lib/sort.pxd +++ /dev/null @@ -1,3 +0,0 @@ -from libc.stdint cimport int32_t - -ctypedef int32_t underlying_type_t_rank_method diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index faa4279c1ca..1d7204a0a39 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -8,19 +8,21 @@ from libcpp.vector cimport vector from enum import IntEnum from cudf._lib.column cimport Column +from cudf._lib.cpp.aggregation cimport ( + rank_method, + underlying_type_t_rank_method, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.search cimport lower_bound, upper_bound from cudf._lib.cpp.sorting cimport ( is_sorted as cpp_is_sorted, rank, - rank_method, sorted_order, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order -from cudf._lib.sort cimport underlying_type_t_rank_method from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @@ -190,14 +192,6 @@ def digitize(list source_columns, list bins, bool right=False): return Column.from_unique_ptr(move(c_result)) -class RankMethod(IntEnum): - FIRST = < underlying_type_t_rank_method > rank_method.FIRST - AVERAGE = < underlying_type_t_rank_method > rank_method.AVERAGE - MIN = < underlying_type_t_rank_method > rank_method.MIN - MAX = < underlying_type_t_rank_method > rank_method.MAX - DENSE = < underlying_type_t_rank_method > rank_method.DENSE - - def rank_columns(list source_columns, object method, str na_option, bool ascending, bool pct ): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1af84920057..013ae7ad033 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -207,6 +207,30 @@ def cumcount(self): .reset_index(drop=True) ) + def rank( + self, + method="average", + ascending=True, + na_option="keep", + pct=False, + axis=0, + ): + """ + Return the rank of values within each group. 
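+
+        Parameters
+        ----------
+        method : {"average", "min", "max", "first", "dense"}, default "average"
+            How to rank the group of records that have the same value (ties).
+        ascending : bool, default True
+            Rank from the lowest value (rank 1) upward when True.
+        na_option : {"keep", "top", "bottom"}, default "keep"
+            "keep" leaves the rank of null values as null, "top" gives them
+            the lowest ranks, and "bottom" the highest.
+        pct : bool, default False
+            Return ranks as a fraction of the group size instead of integers.
+        axis : int, default 0
+            Only ``axis=0`` is currently supported.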
+ """ + if not axis == 0: + raise NotImplementedError("Only axis=0 is supported.") + + def rank(x): + return getattr(x, "rank")( + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + + return self.agg(rank) + @cached_property def _groupby(self): return libgroupby.GroupBy( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a77fca098bc..1361fc56fa0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3632,7 +3632,7 @@ def rank( if method not in {"average", "min", "max", "first", "dense"}: raise KeyError(method) - method_enum = libcudf.sort.RankMethod[method.upper()] + method_enum = libcudf.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: raise ValueError( "na_option must be one of 'keep', 'top', or 'bottom'" diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 1411d7ba64c..9e87fdbd3be 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1811,6 +1811,50 @@ def test_groupby_2keys_scan(nelem, func): assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) +@pytest.mark.parametrize("nelem", [100, 1000]) +@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [False, True]) +def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + pdf.columns = ["x", "y", "z"] + gdf = cudf.from_pandas(pdf) + expect_df = pdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + got_df = gdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + + assert_groupby_results_equal(got_df, expect_df, check_dtype=False) + + +def test_groupby_rank_fails(): + gdf = cudf.DataFrame( + {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["x", "y"]).rank(method="min", axis=1) + gdf = cudf.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2], + "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], + } + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["a"]).rank(method="min", axis=1) + + def test_groupby_mix_agg_scan(): err_msg = "Cannot perform both aggregation and scan in one operation" func = ["cumsum", "sum"] From 280acdfd65b12b4ac953c193c7d7fd35809e41be Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 28 Apr 2022 14:25:04 -0700 Subject: [PATCH 09/28] Partial cuIO GPU decompression refactor (#10699) Required to expand future nvcomp integration. - [x] Moving nvcomp integration in ORC and Parquet readers to common code. Enables nvcomp use for multiple compression type without code duplication. - [x] `gpu_inflate_input_s` refactor to facilitate unified host/device decompressor interface. Enables further changes to unify CPU and GPU decompression API, which in turn enables ZSTD use in ORC. 
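At a call site, the refactored interface reduces to spans in, spans out (a minimal sketch based on the Avro reader changes below; buffer setup elided):

    // One device_span<uint8_t const> per compressed chunk, one device_span<uint8_t>
    // per output buffer, and one decompress_status result per chunk.
    gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream);
    inflate_stats.device_to_host(stream, true);
    // status == 0: success. status == 1: output buffer too small, with
    // bytes_written reporting the required uncompressed size.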
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Elias Stehle (https://github.com/elstehle) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10699 --- cpp/CMakeLists.txt | 2 + cpp/src/io/avro/reader_impl.cu | 82 +++---- cpp/src/io/comp/debrotli.cu | 75 +++---- cpp/src/io/comp/gpuinflate.cu | 113 +++++----- cpp/src/io/comp/gpuinflate.h | 97 ++++----- cpp/src/io/comp/nvcomp_adapter.cpp | 86 ++++++++ cpp/src/io/comp/nvcomp_adapter.cu | 73 +++++++ cpp/src/io/comp/nvcomp_adapter.cuh | 55 +++++ cpp/src/io/comp/nvcomp_adapter.hpp | 45 ++++ cpp/src/io/comp/snap.cu | 45 ++-- cpp/src/io/comp/unsnap.cu | 68 +++--- cpp/src/io/orc/orc_gpu.h | 32 +-- cpp/src/io/orc/reader_impl.cu | 118 +++------- cpp/src/io/orc/stripe_enc.cu | 101 +++++---- cpp/src/io/orc/stripe_init.cu | 83 ++++--- cpp/src/io/orc/writer_impl.cu | 12 +- cpp/src/io/orc/writer_impl.hpp | 2 +- cpp/src/io/parquet/page_enc.cu | 42 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 16 +- cpp/src/io/parquet/reader_impl.cu | 239 ++++++--------------- cpp/src/io/parquet/writer_impl.cu | 53 ++--- cpp/src/io/utilities/hostdevice_vector.hpp | 6 +- cpp/tests/io/comp/decomp_test.cpp | 100 ++++----- 23 files changed, 834 insertions(+), 711 deletions(-) create mode 100644 cpp/src/io/comp/nvcomp_adapter.cpp create mode 100644 cpp/src/io/comp/nvcomp_adapter.cu create mode 100644 cpp/src/io/comp/nvcomp_adapter.cuh create mode 100644 cpp/src/io/comp/nvcomp_adapter.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 91f67fd0420..15caaec9bec 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -302,6 +302,8 @@ add_library( src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu + src/io/comp/nvcomp_adapter.cpp + src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 5885b61b35b..556ca6b9d80 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -162,62 +162,66 @@ rmm::device_buffer decompress_data(datasource& source, rmm::cuda_stream_view stream) { if (meta.codec == "deflate") { - size_t uncompressed_data_size = 0; + auto inflate_in = hostdevice_vector>(meta.block_list.size(), stream); + auto inflate_out = hostdevice_vector>(meta.block_list.size(), stream); + auto inflate_stats = hostdevice_vector(meta.block_list.size(), stream); - auto inflate_in = hostdevice_vector(meta.block_list.size(), stream); - auto inflate_out = hostdevice_vector(meta.block_list.size(), stream); + // Guess an initial maximum uncompressed block size. We estimate the compression factor is two + // and round up to the next multiple of 4096 bytes. 
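+  // (If the guess turns out too small for some blocks, the second pass below
+  // resizes the output buffer and decompresses those blocks again.)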
+ uint32_t const initial_blk_len = meta.max_block_size * 2 + (meta.max_block_size * 2) % 4096; + size_t const uncomp_size = initial_blk_len * meta.block_list.size(); - // Guess an initial maximum uncompressed block size - uint32_t initial_blk_len = (meta.max_block_size * 2 + 0xfff) & ~0xfff; - uncompressed_data_size = initial_blk_len * meta.block_list.size(); - for (size_t i = 0; i < inflate_in.size(); ++i) { - inflate_in[i].dstSize = initial_blk_len; - } - - rmm::device_buffer decomp_block_data(uncompressed_data_size, stream); + rmm::device_buffer decomp_block_data(uncomp_size, stream); auto const base_offset = meta.block_list[0].offset; for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { auto const src_pos = meta.block_list[i].offset - base_offset; - inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; - inflate_in[i].srcSize = meta.block_list[i].size; - inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; + inflate_in[i] = {static_cast(comp_block_data.data()) + src_pos, + meta.block_list[i].size}; + inflate_out[i] = {static_cast(decomp_block_data.data()) + dst_pos, initial_blk_len}; // Update blocks offsets & sizes to refer to uncompressed data meta.block_list[i].offset = dst_pos; - meta.block_list[i].size = static_cast(inflate_in[i].dstSize); + meta.block_list[i].size = static_cast(inflate_out[i].size()); dst_pos += meta.block_list[i].size; } + inflate_in.host_to_device(stream); for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) { - inflate_in.host_to_device(stream); - CUDF_CUDA_TRY( - cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value())); - CUDF_CUDA_TRY(gpuinflate( - inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream)); - inflate_out.device_to_host(stream, true); + inflate_out.host_to_device(stream); + CUDF_CUDA_TRY(cudaMemsetAsync( + inflate_stats.device_ptr(), 0, inflate_stats.memory_size(), stream.value())); + gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream); + inflate_stats.device_to_host(stream, true); // Check if larger output is required, as it's not known ahead of time if (loop_cnt == 0) { - size_t actual_uncompressed_size = 0; - for (size_t i = 0; i < meta.block_list.size(); i++) { - // If error status is 1 (buffer too small), the `bytes_written` field - // is actually contains the uncompressed data size - if (inflate_out[i].status == 1 && inflate_out[i].bytes_written > inflate_in[i].dstSize) { - inflate_in[i].dstSize = inflate_out[i].bytes_written; - } - actual_uncompressed_size += inflate_in[i].dstSize; - } - if (actual_uncompressed_size > uncompressed_data_size) { - decomp_block_data.resize(actual_uncompressed_size, stream); - for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { - auto dst_base = static_cast(decomp_block_data.data()); - inflate_in[i].dstDevice = dst_base + dst_pos; - - meta.block_list[i].offset = dst_pos; - meta.block_list[i].size = static_cast(inflate_in[i].dstSize); - dst_pos += meta.block_list[i].size; + std::vector actual_uncomp_sizes; + actual_uncomp_sizes.reserve(inflate_out.size()); + std::transform(inflate_out.begin(), + inflate_out.end(), + inflate_stats.begin(), + std::back_inserter(actual_uncomp_sizes), + [](auto const& inf_out, auto const& inf_stats) { + // If error status is 1 (buffer too small), the `bytes_written` field + // actually contains the uncompressed data size + return inf_stats.status == 1 + ? 
std::max(inf_out.size(), inf_stats.bytes_written) + : inf_out.size(); + }); + auto const total_actual_uncomp_size = + std::accumulate(actual_uncomp_sizes.cbegin(), actual_uncomp_sizes.cend(), 0ul); + if (total_actual_uncomp_size > uncomp_size) { + decomp_block_data.resize(total_actual_uncomp_size, stream); + for (size_t i = 0; i < meta.block_list.size(); ++i) { + meta.block_list[i].offset = + i > 0 ? (meta.block_list[i - 1].size + meta.block_list[i - 1].offset) : 0; + meta.block_list[i].size = static_cast(actual_uncomp_sizes[i]); + + inflate_out[i] = { + static_cast(decomp_block_data.data()) + meta.block_list[i].offset, + meta.block_list[i].size}; } } else { break; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 631cf19b2aa..cf4d1b0e0f4 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1904,41 +1904,42 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction * * blockDim = {block_size,1,1} * - * @param[in] inputs Source/Destination buffer information per block - * @param[out] outputs Decompressor status per block + * @param[in] inputs Source buffer per block + * @param[out] outputs Destination buffer per block + * @param[out] statuses Decompressor status per block * @param scratch Intermediate device memory heap space (will be dynamically shared between blocks) * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between - *blocks) - * @param count Number of blocks to decompress + * blocks) */ -extern "C" __global__ void __launch_bounds__(block_size, 2) - gpu_debrotli_kernel(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, +__global__ void __launch_bounds__(block_size, 2) + gpu_debrotli_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses, uint8_t* scratch, - uint32_t scratch_size, - uint32_t count) + uint32_t scratch_size) { __shared__ __align__(16) debrotli_state_s state_g; int t = threadIdx.x; - int z = blockIdx.x; + auto const block_id = blockIdx.x; debrotli_state_s* const s = &state_g; - if (z >= count) { return; } + if (block_id >= inputs.size()) { return; } // Thread0: initializes shared state and decode stream header if (!t) { - auto const* src = static_cast(inputs[z].srcDevice); - size_t src_size = inputs[z].srcSize; + auto const src = inputs[block_id].data(); + auto const src_size = inputs[block_id].size(); if (src && src_size >= 8) { - s->error = 0; - s->out = s->outbase = static_cast(inputs[z].dstDevice); - s->bytes_left = inputs[z].dstSize; - s->mtf_upper_bound = 63; - s->dist_rb[0] = 16; - s->dist_rb[1] = 15; - s->dist_rb[2] = 11; - s->dist_rb[3] = 4; - s->dist_rb_idx = 0; + s->error = 0; + s->out = outputs[block_id].data(); + s->outbase = s->out; + s->bytes_left = outputs[block_id].size(); + s->mtf_upper_bound = 63; + s->dist_rb[0] = 16; + s->dist_rb[1] = 15; + s->dist_rb[2] = 11; + s->dist_rb[3] = 4; + s->dist_rb_idx = 0; s->p1 = s->p2 = 0; initbits(s, src, src_size); DecodeStreamHeader(s); @@ -2015,9 +2016,10 @@ extern "C" __global__ void __launch_bounds__(block_size, 2) __syncthreads(); // Output decompression status if (!t) { - outputs[z].bytes_written = s->out - s->outbase; - outputs[z].status = s->error; - outputs[z].reserved = s->fb_size; // Return ext heap used by last block (statistics) + statuses[block_id].bytes_written = s->out - s->outbase; + statuses[block_id].status = s->error; + // Return ext heap used by last block (statistics) + statuses[block_id].reserved = s->fb_size; } } @@ -2075,20 
+2077,21 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) #include #endif -cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - void* scratch, - size_t scratch_size, - int count, - rmm::cuda_stream_view stream) +void gpu_debrotli(device_span const> inputs, + device_span const> outputs, + device_span statuses, + void* scratch, + size_t scratch_size, + rmm::cuda_stream_view stream) { - uint32_t count32 = (count > 0) ? count : 0; + auto const count = inputs.size(); uint32_t fb_heap_size; auto* scratch_u8 = static_cast(scratch); dim3 dim_block(block_size, 1); - dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count + dim3 dim_grid(count, 1); // TODO: Check max grid dimensions vs max expected count - if (scratch_size < sizeof(brotli_dictionary_s)) { return cudaErrorLaunchOutOfResources; } + CUDF_EXPECTS(scratch_size >= sizeof(brotli_dictionary_s), + "Insufficient scratch space for debrotli"); scratch_size = min(scratch_size, (size_t)0xffffffffu); fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf); @@ -2101,7 +2104,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, cudaMemcpyHostToDevice, stream.value())); gpu_debrotli_kernel<<>>( - inputs, outputs, scratch_u8, fb_heap_size, count32); + inputs, outputs, statuses, scratch_u8, fb_heap_size); #if DUMP_FB_HEAP uint32_t dump[2]; uint32_t cur = 0; @@ -2114,8 +2117,6 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, cur = (dump[0] > cur) ? dump[0] : 0xffffffffu; } #endif - - return cudaSuccess; } } // namespace io diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 508e960430d..0d33158da2b 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,8 +124,8 @@ struct inflate_state_s { uint8_t* outbase; ///< start of output buffer uint8_t* outend; ///< end of output buffer // Input state - uint8_t* cur; ///< input buffer - uint8_t* end; ///< end of input buffer + uint8_t const* cur; ///< input buffer + uint8_t const* end; ///< end of input buffer uint2 bitbuf; ///< bit buffer (64-bit) uint32_t bitpos; ///< position in bit buffer @@ -180,10 +180,10 @@ inline __device__ void skipbits(inflate_state_s* s, uint32_t n) { uint32_t bitpos = s->bitpos + n; if (bitpos >= 32) { - uint8_t* cur = s->cur + 8; - s->bitbuf.x = s->bitbuf.y; - s->bitbuf.y = (cur < s->end) ? *reinterpret_cast(cur) : 0; - s->cur = cur - 4; + auto cur = s->cur + 8; + s->bitbuf.x = s->bitbuf.y; + s->bitbuf.y = (cur < s->end) ? 
*reinterpret_cast(cur) : 0; + s->cur = cur - 4; bitpos &= 0x1f; } s->bitpos = bitpos; @@ -510,8 +510,8 @@ __device__ void decode_symbols(inflate_state_s* s) { uint32_t bitpos = s->bitpos; uint2 bitbuf = s->bitbuf; - uint8_t* cur = s->cur; - uint8_t* end = s->end; + auto cur = s->cur; + auto end = s->end; int32_t batch = 0; int32_t sym, batch_len; @@ -871,13 +871,11 @@ __device__ int init_stored(inflate_state_s* s) /// Copy bytes from stored block to destination __device__ void copy_stored(inflate_state_s* s, int t) { - int len = s->stored_blk_len; - uint8_t* cur = s->cur + (s->bitpos >> 3); - uint8_t* out = s->out; - uint8_t* outend = s->outend; - uint8_t* cur4; - int slow_bytes = min(len, (int)((16 - (size_t)out) & 0xf)); - int fast_bytes, bitpos; + auto len = s->stored_blk_len; + auto cur = s->cur + s->bitpos / 8; + auto out = s->out; + auto outend = s->outend; + auto const slow_bytes = min(len, (int)((16 - reinterpret_cast(out)) % 16)); // Slow copy until output is 16B aligned if (slow_bytes) { @@ -890,11 +888,11 @@ __device__ void copy_stored(inflate_state_s* s, int t) out += slow_bytes; len -= slow_bytes; } - fast_bytes = len; + auto fast_bytes = len; if (out < outend) { fast_bytes = (int)min((size_t)fast_bytes, (outend - out)); } fast_bytes &= ~0xf; - bitpos = ((int)(3 & (size_t)cur)) << 3; - cur4 = cur - (bitpos >> 3); + auto bitpos = ((int)((size_t)cur % 4)) * 8; + auto cur4 = cur - (bitpos / 8); if (out < outend) { // Fast copy 16 bytes at a time for (int i = t * 16; i < fast_bytes; i += blockDim.x * 16) { @@ -926,13 +924,13 @@ __device__ void copy_stored(inflate_state_s* s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t* p = cur + len; + auto p = cur + len; auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; - s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; p += 4; - s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; s->bitpos = prefix_bytes * 8; s->out = out; } @@ -1021,12 +1019,16 @@ __device__ int parse_gzip_header(const uint8_t* src, size_t src_size) * * @tparam block_size Thread block dimension for this call * @param inputs Source and destination buffer information per block - * @param outputs Decompression status buffer per block + * @param outputs Destination buffer information per block + * @param statuses Decompression status buffer per block * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template __global__ void __launch_bounds__(block_size) - inflate_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int parse_hdr) + inflate_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; @@ -1035,12 +1037,11 @@ __global__ void __launch_bounds__(block_size) inflate_state_s* state = &state_g; if (!t) { - auto* p = const_cast(static_cast(inputs[z].srcDevice)); - size_t src_size = inputs[z].srcSize; - uint32_t prefix_bytes; + auto p = inputs[z].data(); + auto src_size = inputs[z].size(); // Parse header if needed state->err = 0; - if (parse_hdr) { + if (parse_hdr == gzip_header_included::YES) { int hdr_len = parse_gzip_header(p, src_size); src_size = (src_size >= 8) ? 
src_size - 8 : 0; // ignore footer if (hdr_len >= 0) { @@ -1051,16 +1052,16 @@ __global__ void __launch_bounds__(block_size) } } // Initialize shared state - state->out = const_cast(static_cast(inputs[z].dstDevice)); - state->outbase = state->out; - state->outend = state->out + inputs[z].dstSize; - state->end = p + src_size; - prefix_bytes = (uint32_t)(((size_t)p) & 3); + state->out = outputs[z].data(); + state->outbase = state->out; + state->outend = state->out + outputs[z].size(); + state->end = p + src_size; + auto const prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; state->cur = p; - state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; p += 4; - state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; state->bitpos = prefix_bytes * 8; } __syncthreads(); @@ -1132,9 +1133,9 @@ __global__ void __launch_bounds__(block_size) // Output buffer too small state->err = 1; } - outputs[z].bytes_written = state->out - state->outbase; - outputs[z].status = state->err; - outputs[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes + statuses[z].bytes_written = state->out - state->outbase; + statuses[z].status = state->err; + statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1145,7 +1146,9 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s* inputs) +__global__ void __launch_bounds__(1024) + copy_uncompressed_kernel(device_span const> inputs, + device_span const> outputs) { __shared__ const uint8_t* volatile src_g; __shared__ uint8_t* volatile dst_g; @@ -1158,9 +1161,9 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp uint32_t len, src_align_bytes, src_align_bits, dst_align_bytes; if (!t) { - src = static_cast(inputs[z].srcDevice); - dst = static_cast(inputs[z].dstDevice); - len = min((uint32_t)inputs[z].srcSize, (uint32_t)inputs[z].dstSize); + src = inputs[z].data(); + dst = outputs[z].data(); + len = static_cast(min(inputs[z].size(), outputs[z].size())); src_g = src; dst_g = dst; copy_len_g = len; @@ -1195,26 +1198,26 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp if (t < len) { dst[t] = src[t]; } } -cudaError_t __host__ gpuinflate(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - int parse_hdr, - rmm::cuda_stream_view stream) +void gpuinflate(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr, + rmm::cuda_stream_view stream) { constexpr int block_size = 128; // Threads per block - if (count > 0) { + if (inputs.size() > 0) { inflate_kernel - <<>>(inputs, outputs, parse_hdr); + <<>>(inputs, outputs, statuses, parse_hdr); } - return cudaSuccess; } -cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, - int count, - rmm::cuda_stream_view stream) +void gpu_copy_uncompressed_blocks(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream) { - if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } - return cudaSuccess; + if (inputs.size() > 0) { + copy_uncompressed_kernel<<>>(inputs, outputs); + } } } // namespace io diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 
29856bcd3f3..3870b2ac3b3 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -16,75 +16,70 @@ #pragma once -#include +#include #include +#include + namespace cudf { namespace io { -/** - * @brief Input parameters for the decompression interface - */ -struct gpu_inflate_input_s { - const void* srcDevice; - uint64_t srcSize; - void* dstDevice; - uint64_t dstSize; -}; /** * @brief Output parameters for the decompression interface */ -struct gpu_inflate_status_s { +struct decompress_status { uint64_t bytes_written; uint32_t status; uint32_t reserved; }; +enum class gzip_header_included { NO, YES }; + /** * @brief Interface for decompressing GZIP-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] parse_hdr Whether or not to parse GZIP header * @param[in] stream CUDA stream to use */ -cudaError_t gpuinflate(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - int parse_hdr, - rmm::cuda_stream_view stream); +void gpuinflate(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr, + rmm::cuda_stream_view stream); /** * @brief Interface for copying uncompressed byte blocks * - * @param[in] inputs List of input argument structures - * @param[in] count Number of input structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers * @param[in] stream CUDA stream to use */ -cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, - int count, - rmm::cuda_stream_view stream); +void gpu_copy_uncompressed_blocks(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream); /** * @brief Interface for decompressing Snappy-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] stream CUDA stream to use */ -cudaError_t gpu_unsnap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream); +void gpu_unsnap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream); /** * @brief Computes the size of temporary memory for Brotli decompression @@ -99,37 +94,37 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @brief Interface for decompressing Brotli-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status pairs for each chunk. 
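+ * Each chunk is decompressed independently, so a failure reported in one
+ * chunk's status entry does not affect the results of the other chunks.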
diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
new file mode 100644
index 00000000000..b2e6f07b80b
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nvcomp_adapter.hpp"
+#include "nvcomp_adapter.cuh"
+
+#include <cudf/utilities/error.hpp>
+
+#include <nvcomp/snappy.h>
+
+namespace cudf::io::nvcomp {
+
+template <typename... Args>
+auto batched_decompress_get_temp_size(compression_type type, Args&&... args)
+{
+  switch (type) {
+    case compression_type::SNAPPY:
+      return nvcompBatchedSnappyDecompressGetTempSize(std::forward<Args>(args)...);
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+};
+
+template <typename... Args>
+auto batched_decompress_async(compression_type type, Args&&... args)
+{
+  switch (type) {
+    case compression_type::SNAPPY:
+      return nvcompBatchedSnappyDecompressAsync(std::forward<Args>(args)...);
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+};
+
+size_t get_temp_size(compression_type type, size_t num_chunks, size_t max_uncomp_chunk_size)
+{
+  size_t temp_size             = 0;
+  nvcompStatus_t nvcomp_status =
+    batched_decompress_get_temp_size(type, num_chunks, max_uncomp_chunk_size, &temp_size);
+  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
+               "Unable to get scratch size for decompression");
+
+  return temp_size;
+}
+
+void batched_decompress(compression_type type,
+                        device_span<device_span<uint8_t const> const> inputs,
+                        device_span<device_span<uint8_t> const> outputs,
+                        device_span<decompress_status> statuses,
+                        size_t max_uncomp_chunk_size,
+                        rmm::cuda_stream_view stream)
+{
+  auto const num_chunks = inputs.size();
+
+  // cuDF inflate inputs converted to nvcomp inputs
+  auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream);
+  rmm::device_uvector<size_t> actual_uncompressed_data_sizes(num_chunks, stream);
+  rmm::device_uvector<nvcompStatus_t> nvcomp_statuses(num_chunks, stream);
+  // Temporary space required for decompression
+  rmm::device_buffer scratch(get_temp_size(type, num_chunks, max_uncomp_chunk_size), stream);
+  auto const nvcomp_status = batched_decompress_async(type,
+                                                      nvcomp_args.compressed_data_ptrs.data(),
+                                                      nvcomp_args.compressed_data_sizes.data(),
+                                                      nvcomp_args.uncompressed_data_sizes.data(),
+                                                      actual_uncompressed_data_sizes.data(),
+                                                      num_chunks,
+                                                      scratch.data(),
+                                                      scratch.size(),
+                                                      nvcomp_args.uncompressed_data_ptrs.data(),
+                                                      nvcomp_statuses.data(),
+                                                      stream.value());
+  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression");
+
+  convert_status(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream);
+}
+}  // namespace cudf::io::nvcomp
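This entry point is consumed by the ORC and Parquet reader changes later in this patch. A one-call sketch, assuming `d_inputs`, `d_outputs`, `d_statuses`, and `max_uncomp_block_size` were prepared by the caller as in the sketch after the gpuinflate.h header above:

    // Illustrative call, matching the reader-side dispatch added later in this patch.
    cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::SNAPPY,
                                         d_inputs,    // device_span<device_span<uint8_t const> const>
                                         d_outputs,   // device_span<device_span<uint8_t> const>
                                         d_statuses,  // device_span<decompress_status>
                                         max_uncomp_block_size,
                                         stream);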
diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
new file mode 100644
index 00000000000..ce294cc9b00
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nvcomp_adapter.cuh"
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/transform.h>
+
+namespace cudf::io::nvcomp {
+
+batched_args create_batched_nvcomp_args(device_span<device_span<uint8_t const> const> inputs,
+                                        device_span<device_span<uint8_t> const> outputs,
+                                        rmm::cuda_stream_view stream)
+{
+  size_t num_comp_pages = inputs.size();
+  rmm::device_uvector<void const*> compressed_data_ptrs(num_comp_pages, stream);
+  rmm::device_uvector<size_t> compressed_data_sizes(num_comp_pages, stream);
+  rmm::device_uvector<void*> uncompressed_data_ptrs(num_comp_pages, stream);
+  rmm::device_uvector<size_t> uncompressed_data_sizes(num_comp_pages, stream);
+
+  // Prepare the input vectors
+  auto ins_it =
+    thrust::make_zip_iterator(compressed_data_ptrs.begin(), compressed_data_sizes.begin());
+  thrust::transform(
+    rmm::exec_policy(stream), inputs.begin(), inputs.end(), ins_it, [] __device__(auto const& in) {
+      return thrust::make_tuple(in.data(), in.size());
+    });
+
+  // Prepare the output vectors
+  auto outs_it =
+    thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin());
+  thrust::transform(
+    rmm::exec_policy(stream),
+    outputs.begin(),
+    outputs.end(),
+    outs_it,
+    [] __device__(auto const& out) { return thrust::make_tuple(out.data(), out.size()); });
+
+  return {std::move(compressed_data_ptrs),
+          std::move(compressed_data_sizes),
+          std::move(uncompressed_data_ptrs),
+          std::move(uncompressed_data_sizes)};
+}
+
+void convert_status(device_span<nvcompStatus_t const> nvcomp_stats,
+                    device_span<size_t const> actual_uncompressed_sizes,
+                    device_span<decompress_status> cudf_stats,
+                    rmm::cuda_stream_view stream)
+{
+  thrust::transform(
+    rmm::exec_policy(stream),
+    nvcomp_stats.begin(),
+    nvcomp_stats.end(),
+    actual_uncompressed_sizes.begin(),
+    cudf_stats.begin(),
+    [] __device__(auto const& status, auto const& size) {
+      return decompress_status{size, status == nvcompStatus_t::nvcompSuccess ? 0u : 1u};
+    });
+}
+}  // namespace cudf::io::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh
new file mode 100644
index 00000000000..a76ddcf6813
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.cuh
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gpuinflate.h"
+
+#include <cudf/utilities/span.hpp>
+
+#include <nvcomp.h>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace cudf::io::nvcomp {
+
+struct batched_args {
+  rmm::device_uvector<void const*> compressed_data_ptrs;
+  rmm::device_uvector<size_t> compressed_data_sizes;
+  rmm::device_uvector<void*> uncompressed_data_ptrs;
+  rmm::device_uvector<size_t> uncompressed_data_sizes;
+};
+
+/**
+ * @brief Split lists of src/dst device spans into lists of pointers/sizes.
+ *
+ * @param[in] inputs List of input buffers
+ * @param[in] outputs List of output buffers
+ * @param[in] stream CUDA stream to use
+ */
+batched_args create_batched_nvcomp_args(device_span<device_span<uint8_t const> const> inputs,
+                                        device_span<device_span<uint8_t> const> outputs,
+                                        rmm::cuda_stream_view stream);
+
+/**
+ * @brief Convert nvcomp statuses into cuIO compression statuses.
+ */
+void convert_status(device_span<nvcompStatus_t const> nvcomp_stats,
+                    device_span<size_t const> actual_uncompressed_sizes,
+                    device_span<decompress_status> cudf_stats,
+                    rmm::cuda_stream_view stream);
+}  // namespace cudf::io::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
new file mode 100644
index 00000000000..a0eb6bc4fbf
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gpuinflate.h"
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::io::nvcomp {
+
+enum class compression_type { SNAPPY };
+
+/**
+ * @brief Device batch decompression of given type.
+ *
+ * @param[in] type Compression type
+ * @param[in] inputs List of input buffers
+ * @param[out] outputs List of output buffers
+ * @param[out] statuses List of output status structures
+ * @param[in] max_uncomp_page_size maximum size of uncompressed block
+ * @param[in] stream CUDA stream to use
+ */
+void batched_decompress(compression_type type,
+                        device_span<device_span<uint8_t const> const> inputs,
+                        device_span<device_span<uint8_t> const> outputs,
+                        device_span<decompress_status> statuses,
+                        size_t max_uncomp_page_size,
+                        rmm::cuda_stream_view stream);
+}  // namespace cudf::io::nvcomp
diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu
index 9f0a610f8f7..d64eea06631 100644
--- a/cpp/src/io/comp/snap.cu
+++ b/cpp/src/io/comp/snap.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
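The compression side mirrors the decompression interface. A sketch of driving the span-based `gpu_snap` (illustrative only; `h_src` and `h_dst` are hypothetical host vectors of device spans prepared as in the earlier sketch):

    // Illustrative only: batch Snappy compression through the new interface.
    auto d_in  = cudf::detail::make_device_uvector_async(h_src, stream);
    auto d_out = cudf::detail::make_device_uvector_async(h_dst, stream);
    rmm::device_uvector<cudf::io::decompress_status> d_stats(h_src.size(), stream);

    cudf::io::gpu_snap(d_in, d_out, d_stats, stream);
    // Per block: bytes_written holds the compressed size; status is set to 1
    // when the output buffer was too small (see snap_kernel below).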
@@ -258,7 +258,9 @@ static __device__ uint32_t Match60(const uint8_t* src1, * @param[in] count Number of blocks to compress */ __global__ void __launch_bounds__(128) - snap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int count) + snap_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses) { __shared__ __align__(16) snap_state_s state_g; @@ -268,15 +270,15 @@ __global__ void __launch_bounds__(128) const uint8_t* src; if (!t) { - const auto* src = static_cast(inputs[blockIdx.x].srcDevice); - auto src_len = static_cast(inputs[blockIdx.x].srcSize); - auto* dst = static_cast(inputs[blockIdx.x].dstDevice); - auto dst_len = static_cast(inputs[blockIdx.x].dstSize); - uint8_t* end = dst + dst_len; - s->src = src; - s->src_len = src_len; - s->dst_base = dst; - s->end = end; + auto const src = inputs[blockIdx.x].data(); + auto src_len = static_cast(inputs[blockIdx.x].size()); + auto dst = outputs[blockIdx.x].data(); + auto const dst_len = static_cast(outputs[blockIdx.x].size()); + auto const end = dst + dst_len; + s->src = src; + s->src_len = src_len; + s->dst_base = dst; + s->end = end; while (src_len > 0x7f) { if (dst < end) { dst[0] = src_len | 0x80; } dst++; @@ -335,23 +337,22 @@ __global__ void __launch_bounds__(128) } __syncthreads(); if (!t) { - outputs[blockIdx.x].bytes_written = s->dst - s->dst_base; - outputs[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; - outputs[blockIdx.x].reserved = 0; + statuses[blockIdx.x].bytes_written = s->dst - s->dst_base; + statuses[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; + statuses[blockIdx.x].reserved = 0; } } -cudaError_t __host__ gpu_snap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream) +void gpu_snap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block - dim3 dim_grid(count, 1); - if (count > 0) { - snap_kernel<<>>(inputs, outputs, count); + dim3 dim_grid(inputs.size(), 1); + if (inputs.size() > 0) { + snap_kernel<<>>(inputs, outputs, statuses); } - return cudaSuccess; } } // namespace io diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 791a16bc912..dc44b9fcd59 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
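Both kernels only record per-block results in `decompress_status`; callers are responsible for surfacing failures. A minimal host-side check, in the spirit of the `decompress_check` helper this patch adds to the Parquet reader (sketch; `check_all_succeeded` is a hypothetical name, and a `.cu` translation unit with extended device lambdas is assumed):

    // Illustrative sketch: surface per-block failures after a batch call.
    #include "gpuinflate.h"

    #include <cudf/utilities/error.hpp>
    #include <cudf/utilities/span.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/exec_policy.hpp>

    #include <thrust/logical.h>

    void check_all_succeeded(cudf::device_span<cudf::io::decompress_status const> stats,
                             rmm::cuda_stream_view stream)
    {
      // status == 0 means the block decompressed successfully.
      CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream),
                                  stats.begin(),
                                  stats.end(),
                                  [] __device__(auto const& s) { return s.status == 0; }),
                   "Error during decompression");
    }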
@@ -64,14 +64,15 @@ struct unsnap_queue_s { * @brief snappy decompression state */ struct unsnap_state_s { - const uint8_t* base; ///< base ptr of compressed stream - const uint8_t* end; ///< end of compressed stream - uint32_t uncompressed_size; ///< uncompressed stream size - uint32_t bytes_left; ///< bytes to uncompressed remaining - int32_t error; ///< current error status - uint32_t tstart; ///< start time for perf logging - volatile unsnap_queue_s q; ///< queue for cross-warp communication - gpu_inflate_input_s in; ///< input parameters for current block + const uint8_t* base; ///< base ptr of compressed stream + const uint8_t* end; ///< end of compressed stream + uint32_t uncompressed_size; ///< uncompressed stream size + uint32_t bytes_left; ///< remaining bytes to decompress + int32_t error; ///< current error status + uint32_t tstart; ///< start time for perf logging + volatile unsnap_queue_s q; ///< queue for cross-warp communication + device_span src; ///< input for current block + device_span dst; ///< output for current block }; inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos) @@ -497,9 +498,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t) template __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage) { - const uint8_t* literal_base = s->base; - auto* out = static_cast(s->in.dstDevice); - int batch = 0; + auto const literal_base = s->base; + auto out = s->dst.data(); + int batch = 0; do { volatile unsnap_batch_s* b = &s->q.batch[batch * batch_size]; @@ -624,7 +625,9 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s */ template __global__ void __launch_bounds__(block_size) - unsnap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs) + unsnap_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses) { __shared__ __align__(16) unsnap_state_s state_g; __shared__ cub::WarpReduce::TempStorage temp_storage; @@ -632,16 +635,14 @@ __global__ void __launch_bounds__(block_size) unsnap_state_s* s = &state_g; int strm_id = blockIdx.x; - if (t < sizeof(gpu_inflate_input_s) / sizeof(uint32_t)) { - reinterpret_cast(&s->in)[t] = reinterpret_cast(&inputs[strm_id])[t]; - __threadfence_block(); - } if (t < batch_count) { s->q.batch_len[t] = 0; } __syncthreads(); if (!t) { - const auto* cur = static_cast(s->in.srcDevice); - const uint8_t* end = cur + s->in.srcSize; - s->error = 0; + s->src = inputs[strm_id]; + s->dst = outputs[strm_id]; + auto cur = s->src.begin(); + auto const end = s->src.end(); + s->error = 0; if (log_cyclecount) { s->tstart = clock(); } if (cur < end) { // Read uncompressed size (varint), limited to 32-bit @@ -672,7 +673,7 @@ __global__ void __launch_bounds__(block_size) s->bytes_left = uncompressed_size; s->base = cur; s->end = end; - if ((cur >= end && uncompressed_size != 0) || (uncompressed_size > s->in.dstSize)) { + if ((cur >= end && uncompressed_size != 0) || (uncompressed_size > s->dst.size())) { s->error = -1; } } else { @@ -697,28 +698,25 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); } if (!t) { - outputs[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; - outputs[strm_id].status = s->error; + statuses[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; + statuses[strm_id].status = s->error; if (log_cyclecount) { - outputs[strm_id].reserved = clock() - s->tstart; + statuses[strm_id].reserved = clock() - s->tstart; } else { - outputs[strm_id].reserved = 0; 
+ statuses[strm_id].reserved = 0; } } } -cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream) +void gpu_unsnap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream) { - uint32_t count32 = (count > 0) ? count : 0; - dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block - dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count - - unsnap_kernel<128><<>>(inputs, outputs); + dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block + dim3 dim_grid(inputs.size(), 1); // TODO: Check max grid dimensions vs max expected count - return cudaSuccess; + unsnap_kernel<128><<>>(inputs, outputs, statuses); } } // namespace io diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index d94aa00c7b9..837fd03a112 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,10 @@ struct CompressedStreamInfo { : compressed_data(compressed_data_), uncompressed_data(nullptr), compressed_data_size(compressed_size_), - decctl(nullptr), - decstatus(nullptr), - copyctl(nullptr), + dec_in_ctl(nullptr), + dec_out_ctl(nullptr), + copy_in_ctl(nullptr), + copy_out_ctl(nullptr), num_compressed_blocks(0), num_uncompressed_blocks(0), max_uncompressed_size(0), @@ -54,14 +55,15 @@ struct CompressedStreamInfo { } const uint8_t* compressed_data; // [in] base ptr to compressed stream data uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - gpu_inflate_input_s* decctl; // [in] base ptr to decompression structure to be filled - gpu_inflate_status_s* decstatus; // [in] results of decompression - gpu_inflate_input_s* - copyctl; // [in] base ptr to copy structure to be filled for uncompressed blocks + size_t compressed_data_size; // [in] compressed data size for this stream + device_span* dec_in_ctl; // [in] input buffer to decompress + device_span* dec_out_ctl; // [in] output buffer to decompress into + device_span decstatus; // [in] results of decompression + device_span* copy_in_ctl; // [out] input buffer to copy + device_span* copy_out_ctl; // [out] output buffer to copy to uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed // blocks(out) - uint32_t num_uncompressed_blocks; // [in,out] number of entries in copyctl(in), number of + uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of // uncompressed blocks(out) uint64_t max_uncompressed_size; // [out] maximum uncompressed data size of stream uint32_t max_uncompressed_block_size; // [out] maximum uncompressed size of any block in stream @@ -345,8 +347,9 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] - * @param[out] comp_in Per-block compression input parameters - * @param[out] comp_out Per-block compression status + * @param[out] comp_in Per-block compression input buffers + * 
@param[out] comp_out Per-block compression output buffers + * @param[out] comp_stat Per-block compression status * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void CompressOrcDataStreams(uint8_t* compressed_data, @@ -356,8 +359,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stat, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index a768d568178..139eb28d1a1 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -25,6 +25,7 @@ #include "timezone.cuh" #include +#include #include #include @@ -40,8 +41,6 @@ #include #include -#include - #include #include #include @@ -262,7 +261,7 @@ auto decimal_column_type(std::vector const& decimal128_columns, } // namespace -__global__ void decompress_check_kernel(device_span stats, +__global__ void decompress_check_kernel(device_span stats, bool* any_block_failure) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -273,7 +272,7 @@ __global__ void decompress_check_kernel(device_span } } -void decompress_check(device_span stats, +void decompress_check(device_span stats, bool* any_block_failure, rmm::cuda_stream_view stream) { @@ -284,74 +283,6 @@ void decompress_check(device_span stats, decompress_check_kernel<<>>(stats, any_block_failure); } -__global__ void convert_nvcomp_status(device_span nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span stats) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - stats[tid].status = nvcomp_stats[tid] == nvcompStatus_t::nvcompSuccess ? 
0 : 1; - stats[tid].bytes_written = actual_uncompressed_sizes[tid]; - } -} - -void snappy_decompress(device_span comp_in, - device_span comp_stat, - size_t max_uncomp_page_size, - rmm::cuda_stream_view stream) -{ - size_t num_blocks = comp_in.size(); - size_t temp_size; - - auto status = - nvcompBatchedSnappyDecompressGetTempSize(num_blocks, max_uncomp_page_size, &temp_size); - CUDF_EXPECTS(nvcompStatus_t::nvcompSuccess == status, - "Unable to get scratch size for snappy decompression"); - - rmm::device_buffer scratch(temp_size, stream); - rmm::device_uvector compressed_data_ptrs(num_blocks, stream); - rmm::device_uvector compressed_data_sizes(num_blocks, stream); - rmm::device_uvector uncompressed_data_ptrs(num_blocks, stream); - rmm::device_uvector uncompressed_data_sizes(num_blocks, stream); - - rmm::device_uvector actual_uncompressed_data_sizes(num_blocks, stream); - rmm::device_uvector statuses(num_blocks, stream); - - device_span actual_uncompressed_sizes_span(actual_uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.size()); - device_span statuses_span(statuses.data(), statuses.size()); - - // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(compressed_data_ptrs.begin(), - compressed_data_sizes.begin(), - uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.data()); - thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice, in.dstSize); - }); - - status = nvcompBatchedSnappyDecompressAsync(compressed_data_ptrs.data(), - compressed_data_sizes.data(), - uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_blocks, - scratch.data(), - scratch.size(), - uncompressed_data_ptrs.data(), - statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcompStatus_t::nvcompSuccess == status, "unable to perform snappy decompression"); - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(num_blocks, static_cast(block.x))); - convert_nvcomp_status<<>>( - statuses_span, actual_uncompressed_sizes_span, comp_stat); -} - rmm::device_buffer reader::impl::decompress_stripe_data( cudf::detail::hostdevice_2dvector& chunks, const std::vector& stripe_data, @@ -396,9 +327,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found"); rmm::device_buffer decomp_data(total_decomp_size, stream); - rmm::device_uvector inflate_in( + rmm::device_uvector> inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_out(num_compressed_blocks, stream); + rmm::device_uvector inflate_stats(num_compressed_blocks, stream); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; @@ -408,9 +341,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].decctl = inflate_in.data() + start_pos; - compinfo[i].decstatus = inflate_out.data() + start_pos; - compinfo[i].copyctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; + compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; + compinfo[i].decstatus = {inflate_stats.data() + start_pos, 
compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; @@ -428,29 +363,36 @@ rmm::device_buffer reader::impl::decompress_stripe_data( // Dispatch batches of blocks to decompress if (num_compressed_blocks > 0) { - device_span inflate_out_view(inflate_out.data(), num_compressed_blocks); + device_span> inflate_in_view{inflate_in.data(), + num_compressed_blocks}; + device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; switch (decompressor->GetKind()) { case orc::ZLIB: - CUDF_CUDA_TRY( - gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); + gpuinflate( + inflate_in_view, inflate_out_view, inflate_stats, gzip_header_included::NO, stream); break; case orc::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - device_span inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream); + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + inflate_in_view, + inflate_out_view, + inflate_stats, + max_uncomp_block_size, + stream); } else { - CUDF_CUDA_TRY( - gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_stats, stream); } break; default: CUDF_FAIL("Unexpected decompression dispatch"); break; } - decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); + decompress_check(inflate_stats, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { - CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks( - inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream)); + device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, + num_uncompressed_blocks}; + device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, + num_uncompressed_blocks}; + gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); } gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index f1d524058d2..3fe623be5b1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1141,8 +1141,9 @@ __global__ void __launch_bounds__(1024) * * @param[in] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[out] comp_in Per-block compression input parameters - * @param[out] comp_out Per-block compression status + * @param[out] inputs Per-block compression input buffers + * @param[out] outputs Per-block compression output buffers + * @param[out] statuses Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1151,8 +1152,9 @@ __global__ void __launch_bounds__(1024) __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? 
- device_span comp_in, - device_span comp_out, + device_span> inputs, + device_span> outputs, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1175,16 +1177,11 @@ __global__ void __launch_bounds__(256) dst = compressed_bfr + ss.bfr_offset; num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { - gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - blk_in->srcDevice = src + b * comp_blk_size; - blk_in->srcSize = blk_size; - blk_in->dstDevice = dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE; - blk_in->dstSize = max_comp_blk_size; - blk_out->bytes_written = blk_size; - blk_out->status = 1; - blk_out->reserved = 0; + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + outputs[ss.first_block + b] = { + dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE, max_comp_blk_size}; + statuses[ss.first_block + b] = {blk_size, 1, 0}; } } @@ -1194,8 +1191,9 @@ __global__ void __launch_bounds__(256) * * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[in] comp_in Per-block compression input parameters - * @param[in] comp_out Per-block compression status + * @param[out] inputs Per-block compression input buffers + * @param[out] outputs Per-block compression output buffers + * @param[out] statuses Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1203,8 +1201,9 @@ __global__ void __launch_bounds__(256) // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - device_span comp_in, - device_span comp_out, + device_span const> inputs, + device_span const> outputs, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1228,21 +1227,21 @@ __global__ void __launch_bounds__(1024) b = 0; do { if (t == 0) { - gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; - uint32_t src_len = + auto const src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - uint32_t dst_len = (blk_out->status == 0) ? blk_out->bytes_written : src_len; - uint32_t blk_size24; + auto dst_len = (statuses[ss.first_block + b].status == 0) + ? 
statuses[ss.first_block + b].bytes_written + : src_len; + uint32_t blk_size24{}; if (dst_len >= src_len) { // Copy from uncompressed source - src = static_cast(blk_in->srcDevice); - blk_out->bytes_written = src_len; - dst_len = src_len; - blk_size24 = dst_len * 2 + 1; + src = inputs[ss.first_block + b].data(); + statuses[ss.first_block + b].bytes_written = src_len; + dst_len = src_len; + blk_size24 = dst_len * 2 + 1; } else { // Compressed block - src = static_cast(blk_in->dstDevice); + src = outputs[ss.first_block + b].data(); blk_size24 = dst_len * 2 + 0; } dst[0] = static_cast(blk_size24 >> 0); @@ -1311,14 +1310,21 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stat, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuInitCompressionBlocks<<>>( - strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + gpuInitCompressionBlocks<<>>(strm_desc, + enc_streams, + comp_in, + comp_out, + comp_stat, + compressed_data, + comp_blk_size, + max_comp_blk_size); if (compression == SNAPPY) { if (detail::nvcomp_integration::is_stable_enabled()) { try { @@ -1336,15 +1342,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, rmm::device_uvector compressed_bytes_written(num_compressed_blocks, stream); auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.begin(), - compressed_data_ptrs.begin()); + uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); - }); + comp_out.begin(), + comp_out.end(), + compressed_data_ptrs.begin(), + [] __device__(auto const& out) { return out.data(); }); nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), uncompressed_data_sizes.data(), max_comp_blk_size, @@ -1361,9 +1370,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, thrust::transform(rmm::exec_policy(stream), compressed_bytes_written.begin(), compressed_bytes_written.end(), - comp_out.begin(), + comp_stat.begin(), [] __device__(size_t size) { - gpu_inflate_status_s status{}; + decompress_status status{}; status.bytes_written = size; return status; }); @@ -1371,18 +1380,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // If we reach this then there was an error in compressing so set an error status for each // block thrust::for_each(rmm::exec_policy(stream), - comp_out.begin(), - comp_out.end(), - [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + comp_stat.begin(), + comp_stat.end(), + [] __device__(decompress_status & stat) { stat.status = 1; }); }; } else { - gpu_snap(comp_in.data(), comp_out.data(), num_compressed_blocks, stream); + gpu_snap(comp_in, comp_out, comp_stat, stream); } } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); } } // 
namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 276a1f49abf..e44ca10922f 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -26,9 +26,16 @@ namespace cudf { namespace io { namespace orc { namespace gpu { + +struct comp_in_out { + uint8_t const* in_ptr; + size_t in_size; + uint8_t* out_ptr; + size_t out_size; +}; struct compressed_stream_s { CompressedStreamInfo info; - gpu_inflate_input_s ctl; + comp_in_out ctl; }; // blockDim {128,1,1} @@ -57,7 +64,8 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; - gpu_inflate_input_s* init_ctl = nullptr; + device_span* init_in_ctl = nullptr; + device_span* init_out_ctl = nullptr; block_len >>= 1; cur += BLOCK_HEADER_SIZE; if (block_len > block_size || cur + block_len > end) { @@ -82,27 +90,34 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uncompressed[max_uncompressed_size + lane_id] = cur[lane_id]; } } else { - init_ctl = s->info.copyctl; - init_ctl = (init_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) - ? &init_ctl[num_uncompressed_blocks] - : nullptr; + init_in_ctl = + (s->info.copy_in_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) + ? &s->info.copy_in_ctl[num_uncompressed_blocks] + : nullptr; + init_out_ctl = + (s->info.copy_out_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) + ? &s->info.copy_out_ctl[num_uncompressed_blocks] + : nullptr; num_uncompressed_blocks++; } } else { - init_ctl = s->info.decctl; - init_ctl = (init_ctl && num_compressed_blocks < s->info.num_compressed_blocks) - ? &init_ctl[num_compressed_blocks] - : nullptr; + init_in_ctl = (s->info.dec_in_ctl && num_compressed_blocks < s->info.num_compressed_blocks) + ? &s->info.dec_in_ctl[num_compressed_blocks] + : nullptr; + init_out_ctl = + (s->info.dec_out_ctl && num_compressed_blocks < s->info.num_compressed_blocks) + ? 
&s->info.dec_out_ctl[num_compressed_blocks] + : nullptr; num_compressed_blocks++; } - if (!lane_id && init_ctl) { - s->ctl.srcDevice = const_cast(cur); - s->ctl.srcSize = block_len; - s->ctl.dstDevice = uncompressed + max_uncompressed_size; - s->ctl.dstSize = uncompressed_size; + if (!lane_id && init_in_ctl) { + s->ctl = {cur, block_len, uncompressed + max_uncompressed_size, uncompressed_size}; } __syncwarp(); - if (init_ctl && lane_id == 0) *init_ctl = s->ctl; + if (init_in_ctl && lane_id == 0) { + *init_in_ctl = {s->ctl.in_ptr, s->ctl.in_size}; + *init_out_ctl = {s->ctl.out_ptr, s->ctl.out_size}; + } cur += block_len; max_uncompressed_size += uncompressed_size; max_uncompressed_block_size = max(max_uncompressed_block_size, uncompressed_size); @@ -137,14 +152,14 @@ extern "C" __global__ void __launch_bounds__(128, 8) s->info.num_compressed_blocks + s->info.num_uncompressed_blocks > 0 && s->info.max_uncompressed_size > 0) { // Walk through the compressed blocks - const uint8_t* cur = s->info.compressed_data; - const uint8_t* end = cur + s->info.compressed_data_size; - const gpu_inflate_input_s* dec_in = s->info.decctl; - const gpu_inflate_status_s* dec_out = s->info.decstatus; - uint8_t* uncompressed_actual = s->info.uncompressed_data; - uint8_t* uncompressed_estimated = uncompressed_actual; - uint32_t num_compressed_blocks = 0; - uint32_t max_compressed_blocks = s->info.num_compressed_blocks; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + auto dec_out = s->info.dec_out_ctl; + auto dec_status = s->info.decstatus; + uint8_t* uncompressed_actual = s->info.uncompressed_data; + uint8_t* uncompressed_estimated = uncompressed_actual; + uint32_t num_compressed_blocks = 0; + uint32_t max_compressed_blocks = s->info.num_compressed_blocks; while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); @@ -158,14 +173,14 @@ extern "C" __global__ void __launch_bounds__(128, 8) uncompressed_size_actual = block_len; } else { if (num_compressed_blocks > max_compressed_blocks) { break; } - if (shuffle((lane_id == 0) ? dec_out[num_compressed_blocks].status : 0) != 0) { + if (shuffle((lane_id == 0) ? dec_status[num_compressed_blocks].status : 0) != 0) { // Decompression failed, not much point in doing anything else break; } - uncompressed_size_est = - shuffle((lane_id == 0) ? *(const uint32_t*)&dec_in[num_compressed_blocks].dstSize : 0); - uncompressed_size_actual = shuffle( - (lane_id == 0) ? *(const uint32_t*)&dec_out[num_compressed_blocks].bytes_written : 0); + uint32_t const dst_size = dec_out[num_compressed_blocks].size(); + uncompressed_size_est = shuffle((lane_id == 0) ? dst_size : 0); + uint32_t const bytes_written = dec_status[num_compressed_blocks].bytes_written; + uncompressed_size_actual = shuffle((lane_id == 0) ? bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the // uncompressed size to always be equal to the compression block size except for the last @@ -360,11 +375,11 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, if (strm_len > 0) { int32_t compressed_offset = (t < num_rowgroups) ? 
s->compressed_offset[t][ci_id] : 0; if (compressed_offset > 0) { - const uint8_t* start = s->strm_info[ci_id].compressed_data; - const uint8_t* cur = start; - const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; - gpu_inflate_status_s* decstatus = s->strm_info[ci_id].decstatus; - uint32_t uncomp_offset = 0; + const uint8_t* start = s->strm_info[ci_id].compressed_data; + const uint8_t* cur = start; + const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; + auto decstatus = s->strm_info[ci_id].decstatus.data(); + uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len, is_uncompressed; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 779d0390751..ecd2d6f6ec0 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1314,7 +1314,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -2050,8 +2050,9 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, stream); + hostdevice_vector> comp_in(num_compressed_blocks, stream); + hostdevice_vector> comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_stats(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), @@ -2063,9 +2064,10 @@ void writer::impl::write(table_view const& table) enc_data.streams, comp_in, comp_out, + comp_stats, stream); strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); + comp_stats.device_to_host(stream, true); } ProtobufWriter pbw_(&buffer_); @@ -2097,7 +2099,7 @@ void writer::impl::write(table_view const& table) segmentation, enc_data.streams, strm_descs, - comp_out, + comp_stats, intermediate_stats.rowgroup_blobs, &stripe, &streams, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 5f981793762..d823c73007f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -386,7 +386,7 @@ class writer::impl { file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 61bd29399cd..f05f0af2a79 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -81,8 +81,6 @@ struct page_enc_state_s { EncPage page; EncColumnChunk ck; parquet_column_device_view col; - gpu_inflate_input_s comp_in; - gpu_inflate_status_s comp_stat; uint16_t vals[rle_buffer_size]; }; @@ -750,8 +748,9 @@ static __device__ std::pair convert_nanoseconds(timesta template __global__ void __launch_bounds__(128, 8) gpuEncodePages(device_span pages, - device_span comp_in, - device_span comp_stat) + device_span> comp_in, + device_span> comp_out, + device_span comp_stats) { __shared__ __align__(8) page_enc_state_s state_g; using block_scan = cub::BlockScan; @@ -761,6 +760,7 @@ __global__ void __launch_bounds__(128, 8) uint32_t t = threadIdx.x; if (t == 0) { + state_g = page_enc_state_s{}; s->page = pages[blockIdx.x]; 
s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1085,21 +1085,14 @@ __global__ void __launch_bounds__(128, 8) auto actual_data_size = static_cast(s->cur - base); uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; - s->comp_in.srcDevice = base; - s->comp_in.srcSize = actual_data_size; - s->comp_in.dstDevice = s->page.compressed_data + s->page.max_hdr_size; - s->comp_in.dstSize = compressed_bfr_size; - s->comp_stat.bytes_written = 0; - s->comp_stat.status = ~0; - s->comp_stat.reserved = 0; - } - __syncthreads(); - if (t == 0) { + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base, actual_data_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size}; + } pages[blockIdx.x] = s->page; - if (not comp_in.empty()) comp_in[blockIdx.x] = s->comp_in; - if (not comp_stat.empty()) { - comp_stat[blockIdx.x] = s->comp_stat; - pages[blockIdx.x].comp_stat = &comp_stat[blockIdx.x]; + if (not comp_stats.empty()) { + comp_stats[blockIdx.x] = {0, ~0u}; + pages[blockIdx.x].comp_stat = &comp_stats[blockIdx.x]; } } } @@ -1317,7 +1310,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, // blockDim(128, 1, 1) __global__ void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_stat, device_span page_stats, const statistics_chunk* chunk_stats) { @@ -1946,14 +1939,15 @@ void InitEncoderPages(device_2dspan chunks, } void EncodePages(device_span pages, - device_span comp_in, - device_span comp_stat, + device_span> comp_in, + device_span> comp_out, + device_span comp_stats, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>(pages, comp_in, comp_stat); + gpuEncodePages<128><<>>(pages, comp_in, comp_out, comp_stats); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -1962,7 +1956,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view } void EncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) @@ -1970,7 +1964,7 @@ void EncodePageHeaders(device_span pages, // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the // threads to coop load structs gpuEncodePageHeaders<<>>( - pages, comp_stat, page_stats, chunk_stats); + pages, comp_stats, page_stats, chunk_stats); } void GatherPages(device_span chunks, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 53b82c73a35..057b9a87214 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -378,7 +378,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - gpu_inflate_status_s* comp_stat; //!< Ptr to compression status + decompress_status* comp_stat; //!< Ptr to compression status }; /** @@ -584,13 +584,15 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, * @brief Launches kernel for packing column data into parquet pages * * @param[in,out] pages Device array of EncPages (unordered) - * @param[out] comp_in Optionally initializes compressor input params - * @param[out] comp_out Optionally initializes compressor output params + * @param[out] comp_in Compressor input buffers + * @param[out] comp_out Compressor output buffers + * @param[out] comp_stats Compressor statuses * @param[in] stream CUDA stream to use, default 0 */ void EncodePages(device_span pages, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stats, rmm::cuda_stream_view stream); /** @@ -605,13 +607,13 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @brief Launches kernel to encode page headers * * @param[in,out] pages Device array of EncPages - * @param[in] comp_out Compressor status or nullptr if no compression + * @param[in] comp_stats Compressor status * @param[in] page_stats Optional page-level statistics to be included in page header * @param[in] chunk_stats Optional chunk-level statistics to be encoded * @param[in] stream CUDA stream to use, default 0 */ void EncodePageHeaders(device_span pages, - device_span comp_out, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index cfca0bad518..a40993ee2dd 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -24,6 +24,7 @@ #include "compact_protocol_reader.hpp" #include +#include #include #include @@ -38,10 +39,9 @@ #include #include -#include - #include #include +#include #include #include @@ -1050,96 +1050,13 @@ void reader::impl::decode_page_headers(hostdevice_vector& pages.device_to_host(stream, true); } -__global__ void decompress_check_kernel(device_span stats, - bool* any_block_failure) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - if (stats[tid].status != 0) { - *any_block_failure = true; // Doesn't need to be atomic - } - } -} - -void decompress_check(device_span stats, - bool* any_block_failure, - rmm::cuda_stream_view stream) - { - if (stats.empty()) { return; } // early exit for empty stats - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(stats.size(), static_cast(block.x))); - decompress_check_kernel<<>>(stats, any_block_failure); -} - -__global__ void convert_nvcomp_status(device_span nvcomp_stats, - device_span stats) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - stats[tid].status = nvcomp_stats[tid] == nvcompStatus_t::nvcompSuccess ? 
0 : 1; - } -} - -void snappy_decompress(device_span comp_in, - device_span comp_stat, - size_t max_uncomp_page_size, - rmm::cuda_stream_view stream) +void decompress_check(device_span stats, rmm::cuda_stream_view stream) { - size_t num_comp_pages = comp_in.size(); - size_t temp_size; - - nvcompStatus_t nvcomp_status = - nvcompBatchedSnappyDecompressGetTempSize(num_comp_pages, max_uncomp_page_size, &temp_size); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for snappy decompression"); - - // Not needed now but nvcomp API makes no promises about future - rmm::device_buffer scratch(temp_size, stream); - // Analogous to comp_in.srcDevice - rmm::device_uvector compressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.srcSize - rmm::device_uvector compressed_data_sizes(num_comp_pages, stream); - // Analogous to comp_in.dstDevice - rmm::device_uvector uncompressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.dstSize - rmm::device_uvector uncompressed_data_sizes(num_comp_pages, stream); - - // Analogous to comp_stat.bytes_written - rmm::device_uvector actual_uncompressed_data_sizes(num_comp_pages, stream); - // Convertible to comp_stat.status - rmm::device_uvector statuses(num_comp_pages, stream); - device_span statuses_span(statuses.data(), statuses.size()); - - // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(compressed_data_ptrs.begin(), - compressed_data_sizes.begin(), - uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.data()); - thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice, in.dstSize); - }); - - nvcomp_status = nvcompBatchedSnappyDecompressAsync(compressed_data_ptrs.data(), - compressed_data_sizes.data(), - uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_comp_pages, - scratch.data(), - scratch.size(), - uncompressed_data_ptrs.data(), - statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "unable to perform snappy decompression"); - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(num_comp_pages, static_cast(block.x))); - convert_nvcomp_status<<>>(statuses_span, comp_stat); + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + stats.begin(), + stats.end(), + [] __device__(auto const& stat) { return stat.status == 0; }), + "Error during decompression"); } /** @@ -1175,9 +1092,9 @@ rmm::device_buffer reader::impl::decompress_page_data( int32_t max_decompressed_size; }; - std::array codecs{codec_stats{parquet::GZIP, 0, 0}, - codec_stats{parquet::SNAPPY, 0, 0}, - codec_stats{parquet::BROTLI, 0, 0}}; + std::array codecs{codec_stats{parquet::GZIP, 0, 0}, + codec_stats{parquet::SNAPPY, 0, 0}, + codec_stats{parquet::BROTLI, 0, 0}}; auto is_codec_supported = [&codecs](int8_t codec) { if (codec == parquet::UNCOMPRESSED) return true; @@ -1207,91 +1124,73 @@ rmm::device_buffer reader::impl::decompress_page_data( // Dispatch batches of pages to decompress for each codec rmm::device_buffer decomp_pages(total_decomp_size, stream); - hostdevice_vector inflate_in(0, num_comp_pages, stream); - hostdevice_vector inflate_out(0, num_comp_pages, stream); - hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device(stream); + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + 
comp_out.reserve(num_comp_pages); - device_span inflate_in_view(inflate_in.device_ptr(), inflate_in.size()); - device_span inflate_out_view(inflate_out.device_ptr(), inflate_out.size()); + rmm::device_uvector comp_stats(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_stats.begin(), + comp_stats.end(), + decompress_status{0, static_cast(-1000), 0}); size_t decomp_offset = 0; - int32_t argc = 0; + int32_t start_pos = 0; for (const auto& codec : codecs) { - if (codec.num_pages > 0) { - int32_t start_pos = argc; - - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); - inflate_in[argc].srcDevice = pages[page].page_data; - inflate_in[argc].srcSize = pages[page].compressed_page_size; - inflate_in[argc].dstDevice = dst_base + decomp_offset; - inflate_in[argc].dstSize = pages[page].uncompressed_page_size; - - inflate_out[argc].bytes_written = 0; - inflate_out[argc].status = static_cast(-1000); - inflate_out[argc].reserved = 0; - - pages[page].page_data = static_cast(inflate_in[argc].dstDevice); - decomp_offset += inflate_in[argc].dstSize; - argc++; - }); + if (codec.num_pages == 0) { continue; } - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos), - inflate_in.host_ptr(start_pos), - sizeof(decltype(inflate_in)::value_type) * (argc - start_pos), - cudaMemcpyHostToDevice, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos), - inflate_out.host_ptr(start_pos), - sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), - cudaMemcpyHostToDevice, - stream.value())); - - switch (codec.compression_type) { - case parquet::GZIP: - CUDF_CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - argc - start_pos, - 1, - stream)) - break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - snappy_decompress(inflate_in_view.subspan(start_pos, argc - start_pos), - inflate_out_view.subspan(start_pos, argc - start_pos), - codec.max_decompressed_size, - stream); - } else { - CUDF_CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - argc - start_pos, - stream)); - } - break; - case parquet::BROTLI: - CUDF_CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - debrotli_scratch.data(), - debrotli_scratch.size(), - argc - start_pos, - stream)); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos), - inflate_out.device_ptr(start_pos), - sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), - cudaMemcpyDeviceToHost, - stream.value())); + for_each_codec_page(codec.compression_type, [&](size_t page) { + auto dst_base = static_cast(decomp_pages.data()); + comp_in.emplace_back(pages[page].page_data, + static_cast(pages[page].compressed_page_size)); + comp_out.emplace_back(dst_base + decomp_offset, + static_cast(pages[page].uncompressed_page_size)); + + pages[page].page_data = static_cast(comp_out.back().data()); + decomp_offset += comp_out.back().size(); + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + device_span d_comp_stats_view(comp_stats.data() + 
start_pos, + codec.num_pages); + + switch (codec.compression_type) { + case parquet::GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, stream); + break; + case parquet::SNAPPY: + if (nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_stats_view, + codec.max_decompressed_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, stream); + } + break; + case parquet::BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_stats_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; } + start_pos += codec.num_pages; } - decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); - any_block_failure.device_to_host(stream, true); // synchronizes stream - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); + decompress_check(comp_stats, stream); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 75a50714407..dbbd39fb508 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -984,8 +984,9 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } -void snappy_compress(device_span comp_in, - device_span comp_stat, +void snappy_compress(device_span const> comp_in, + device_span const> comp_out, + device_span comp_stats, size_t max_page_uncomp_data_size, rmm::cuda_stream_view stream) { @@ -1012,16 +1013,20 @@ void snappy_compress(device_span comp_in, // the space allocated unless one uses the API nvcompBatchedSnappyCompressGetOutputSize() // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.begin(), - compressed_data_ptrs.begin()); + auto comp_it = + thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); + thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); - }); + comp_out.begin(), + comp_out.end(), + compressed_data_ptrs.begin(), + [] __device__(auto const& out) { return out.data(); }); nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), uncompressed_data_sizes.data(), max_page_uncomp_data_size, @@ -1041,9 +1046,9 @@ void snappy_compress(device_span comp_in, thrust::transform(rmm::exec_policy(stream), compressed_bytes_written.begin(), compressed_bytes_written.end(), - comp_stat.begin(), + comp_stats.begin(), [] __device__(size_t size) { - gpu_inflate_status_s status{}; + decompress_status status{}; status.bytes_written = size; return status; }); @@ -1051,9 +1056,9 @@ void snappy_compress(device_span comp_in, } catch (...) 
{ // If we reach this then there was an error in compressing so set an error status for each page thrust::for_each(rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), - [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + comp_stats.begin(), + comp_stats.end(), + [] __device__(decompress_status & stat) { stat.status = 1; }); }; } @@ -1077,19 +1082,17 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks uint32_t max_comp_pages = (compression_ != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; - rmm::device_uvector compression_input(max_comp_pages, stream); - rmm::device_uvector compression_status(max_comp_pages, stream); - - device_span comp_in{compression_input.data(), compression_input.size()}; - device_span comp_stat{compression_status.data(), compression_status.size()}; + rmm::device_uvector> comp_in(max_comp_pages, stream); + rmm::device_uvector> comp_out(max_comp_pages, stream); + rmm::device_uvector comp_stats(max_comp_pages, stream); - gpu::EncodePages(batch_pages, comp_in, comp_stat, stream); + gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); switch (compression_) { case parquet::Compression::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream); + snappy_compress(comp_in, comp_out, comp_stats, max_page_uncomp_data_size, stream); } else { - CUDF_CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream)); + gpu_snap(comp_in, comp_out, comp_stats, stream); } break; default: break; @@ -1098,7 +1101,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks // chunk-level auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_stat, batch_pages_stats, chunk_stats, stream); + EncodePageHeaders(batch_pages, comp_stats, batch_pages_stats, chunk_stats, stream); GatherPages(d_chunks_in_batch.flat_view(), pages, stream); auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index a754f7cf7d3..30c7b6ec326 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -51,10 +51,10 @@ class hostdevice_vector { } explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) - : num_elements(initial_size), max_elements(max_size) + : max_elements(max_size), num_elements(initial_size) { if (max_elements != 0) { - CUDF_CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements)); + CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&h_data), sizeof(T) * max_elements)); d_data.resize(sizeof(T) * max_elements, stream); } } @@ -62,7 +62,7 @@ class hostdevice_vector { ~hostdevice_vector() { if (max_elements != 0) { - auto const free_result = cudaFreeHost(h_data); + [[maybe_unused]] auto const free_result = cudaFreeHost(h_data); assert(free_result == cudaSuccess); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index dd00b201df9..a325cadf6a5 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include @@ -24,6 +25,8 @@ #include +using cudf::device_span; + /** * @brief Base test fixture for decompression * @@ -32,19 +35,6 @@ */ template struct DecompressTest : public cudf::test::BaseFixture { - 
void SetUp() override - { - ASSERT_CUDA_SUCCEEDED(cudaMallocHost((void**)&inf_args, sizeof(cudf::io::gpu_inflate_input_s))); - ASSERT_CUDA_SUCCEEDED( - cudaMallocHost((void**)&inf_stat, sizeof(cudf::io::gpu_inflate_status_s))); - } - - void TearDown() override - { - ASSERT_CUDA_SUCCEEDED(cudaFreeHost(inf_stat)); - ASSERT_CUDA_SUCCEEDED(cudaFreeHost(inf_args)); - } - std::vector vector_from_string(const char* str) const { return std::vector(reinterpret_cast(str), @@ -55,49 +45,43 @@ struct DecompressTest : public cudf::test::BaseFixture { const uint8_t* compressed, size_t compressed_size) { - rmm::device_buffer src{compressed, compressed_size, rmm::cuda_stream_default}; - rmm::device_buffer dst{decompressed->size(), rmm::cuda_stream_default}; - - inf_args->srcDevice = static_cast(src.data()); - inf_args->dstDevice = static_cast(dst.data()); - inf_args->srcSize = src.size(); - inf_args->dstSize = dst.size(); - rmm::device_uvector d_inf_args(1, rmm::cuda_stream_default); - rmm::device_uvector d_inf_stat(1, rmm::cuda_stream_default); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_inf_args.data(), - inf_args, - sizeof(cudf::io::gpu_inflate_input_s), - cudaMemcpyHostToDevice, - 0)); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_inf_stat.data(), - inf_stat, - sizeof(cudf::io::gpu_inflate_status_s), - cudaMemcpyHostToDevice, - 0)); - ASSERT_CUDA_SUCCEEDED( - static_cast(this)->dispatch(d_inf_args.data(), d_inf_stat.data())); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(inf_stat, - d_inf_stat.data(), - sizeof(cudf::io::gpu_inflate_status_s), - cudaMemcpyDeviceToHost, - 0)); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - decompressed->data(), inf_args->dstDevice, inf_args->dstSize, cudaMemcpyDeviceToHost, 0)); - ASSERT_CUDA_SUCCEEDED(cudaStreamSynchronize(0)); + auto stream = rmm::cuda_stream_default; + rmm::device_buffer src{compressed, compressed_size, stream}; + rmm::device_uvector dst{decompressed->size(), stream}; + + hostdevice_vector> inf_in(1, stream); + inf_in[0] = {static_cast(src.data()), src.size()}; + inf_in.host_to_device(stream); + + hostdevice_vector> inf_out(1, stream); + inf_out[0] = dst; + inf_out.host_to_device(stream); + + hostdevice_vector inf_stat(1, stream); + inf_stat[0] = {}; + inf_stat.host_to_device(stream); + + static_cast(this)->dispatch(inf_in, inf_out, inf_stat); + cudaMemcpyAsync( + decompressed->data(), dst.data(), dst.size(), cudaMemcpyDeviceToHost, stream.value()); + inf_stat.device_to_host(stream, true); + ASSERT_EQ(inf_stat[0].status, 0); } - - cudf::io::gpu_inflate_input_s* inf_args = nullptr; - cudf::io::gpu_inflate_status_s* inf_stat = nullptr; }; /** * @brief Derived fixture for GZIP decompression */ struct GzipDecompressTest : public DecompressTest { - cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { - return cudf::io::gpuinflate(d_inf_args, d_inf_stat, 1, 1, rmm::cuda_stream_default); + cudf::io::gpuinflate(d_inf_in, + d_inf_out, + d_inf_stat, + cudf::io::gzip_header_included::YES, + rmm::cuda_stream_default); } }; @@ -105,10 +89,11 @@ struct GzipDecompressTest : public DecompressTest { * @brief Derived fixture for Snappy decompression */ struct SnappyDecompressTest : public DecompressTest { - cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { - return 
cudf::io::gpu_unsnap(d_inf_args, d_inf_stat, 1, rmm::cuda_stream_default);
+    cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, rmm::cuda_stream_default);
   }
 };

@@ -116,14 +101,19 @@ struct SnappyDecompressTest : public DecompressTest<SnappyDecompressTest> {
  * @brief Derived fixture for Brotli decompression
  */
 struct BrotliDecompressTest : public DecompressTest<BrotliDecompressTest> {
-  cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args,
-                       cudf::io::gpu_inflate_status_s* d_inf_stat)
+  void dispatch(device_span<device_span<uint8_t const>> d_inf_in,
+                device_span<device_span<uint8_t>> d_inf_out,
+                device_span<cudf::io::decompress_status> d_inf_stat)
   {
     rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1),
                                  rmm::cuda_stream_default};
-    return cudf::io::gpu_debrotli(
-      d_inf_args, d_inf_stat, d_scratch.data(), d_scratch.size(), 1, rmm::cuda_stream_default);
+    cudf::io::gpu_debrotli(d_inf_in,
+                           d_inf_out,
+                           d_inf_stat,
+                           d_scratch.data(),
+                           d_scratch.size(),
+                           rmm::cuda_stream_default);
   }
 };

From 84f88ceb18225850835a9912a18e4c82245d5620 Mon Sep 17 00:00:00 2001
From: MithunR
Date: Thu, 28 Apr 2022 23:45:40 -0700
Subject: [PATCH 10/28] Support purging non-empty null elements from LIST/STRING columns (#10701)

Fixes #10291.

With certain operations in `libcudf`, it is possible to produce `LIST` columns with `NULL` rows that are not also empty. For instance, consider a `STRUCT` column constructed with an explicit validity buffer and a `LIST` child column:
```c++
auto const lists   = lists_column_wrapper{ {0,1}, {2,3}, {4,5} };
auto const structs = structs_column_wrapper{ {lists}, null_at(1) };
```
Since `structs[1] == NULL`, its `LIST` member is also deemed null. However, for efficiency, the null-ness is recorded in the `LIST`'s validity buffer, without purging the unnecessary values from its child. The `LIST` column appears as follows:
```
Validity: 101
Offsets:  [0, 2, 4, 6]
Child:    [0, 1, 2, 3, 4, 5]
```
Even though Row#1 is null, its size is `4 - 2 = 2`, not `0`. (Row#1 is thus a non-empty null row.)

This commit adds a `cudf::purge_nonempty_nulls()` function that purges such rows and reduces such columns to a more space-efficient representation, i.e.:
```
Validity: 101
Offsets:  [0, 2, 2, 4]
Child:    [0, 1, 4, 5]
```
This commit also modifies `cudf::gather()` not to produce `STRING`/`LIST` columns with "dirty" rows. Further, it adds two new functions to determine whether a specified column needs such purging:
1. `cudf::may_have_nonempty_nulls()`: A fast check for the *possibility* of non-empty nulls. It only checks whether the column or its descendants have null rows at all; if there are no nulls anywhere in the hierarchy, no purging is needed.
2. `cudf::has_nonempty_nulls()`: A deeper, more expensive check that categorically confirms whether non-empty null rows exist in any column in the hierarchy (see the usage sketch below).
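For illustration only (this sketch is not part of the patch's diff, and the `purge_example()` wrapper is hypothetical), the three APIs are intended to compose so that the cheap approximate check gates the exact check, which in turn gates the comparatively expensive purge copy:
```c++
#include <cudf/copying.hpp>
#include <cudf/null_mask.hpp>
#include <cudf_test/column_wrapper.hpp>

void purge_example()
{
  // Build [[0,1], [2,3], [4,5]], then mark row 1 null *after* construction,
  // leaving its values in place (a non-empty null row).
  auto const lists =
    cudf::test::lists_column_wrapper<int32_t>{{0, 1}, {2, 3}, {4, 5}}.release();
  cudf::set_null_mask(lists->mutable_view().null_mask(), 1, 2, false);

  if (cudf::may_have_nonempty_nulls(*lists) &&  // fast; false negatives impossible
      cudf::has_nonempty_nulls(*lists)) {       // exact, but more expensive
    // Compacted copy: Offsets [0, 2, 2, 4], Child [0, 1, 4, 5].
    auto const purged = cudf::purge_nonempty_nulls(cudf::lists_column_view{lists->view()});
  }
}
```
The wrappers and `set_null_mask()` are the same utilities used in the examples above; the mask is set after construction because the wrappers themselves never produce non-empty nulls.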
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - https://github.com/nvdbaranec - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10701 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/copying.hpp | 153 ++++++ cpp/include/cudf/detail/copy.cuh | 47 ++ cpp/include/cudf/detail/copy.hpp | 19 +- cpp/include/cudf/lists/detail/gather.cuh | 45 +- cpp/include/cudf/strings/detail/gather.cuh | 20 +- .../cudf/structs/structs_column_view.hpp | 7 +- cpp/src/copying/purge_nonempty_nulls.cu | 134 ++++++ cpp/src/structs/structs_column_view.cpp | 2 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/factories_test.cpp | 2 +- .../copying/purge_nonempty_nulls_tests.cpp | 437 ++++++++++++++++++ 13 files changed, 847 insertions(+), 22 deletions(-) create mode 100644 cpp/include/cudf/detail/copy.cuh create mode 100644 cpp/src/copying/purge_nonempty_nulls.cu create mode 100644 cpp/tests/copying/purge_nonempty_nulls_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0806bb964cf..68008e13897 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -79,6 +79,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp + - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp - test -f $PREFIX/include/cudf/detail/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 15caaec9bec..cbe2811afe4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,6 +238,7 @@ add_library( src/copying/gather.cu src/copying/get_element.cu src/copying/pack.cpp + src/copying/purge_nonempty_nulls.cu src/copying/reverse.cu src/copying/sample.cu src/copying/scatter.cu diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 2e559afef4f..8f1ad7da9b6 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -17,7 +17,10 @@ #pragma once #include +#include #include +#include +#include #include #include @@ -939,5 +942,155 @@ std::unique_ptr
sample(
  int64_t const seed = 0,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+/**
+ * @brief Checks if a column or its descendants have non-empty null rows
+ *
+ * @note This function is exact. If it returns `true`, there exists one or more
+ * non-empty null elements.
+ *
+ * A LIST or STRING column might have non-empty rows that are marked as null.
+ * A STRUCT or LIST column might have child columns that have non-empty null rows.
+ * Other types of columns are deemed incapable of having non-empty null rows.
+ * E.g. fixed-width columns have no concept of an "empty" row.
+ *
+ * @param input The column which is (and whose descendants are) to be checked for
+ * non-empty null rows.
+ * @return true If either the column or its descendants have non-empty null rows.
+ * @return false If neither the column nor its descendants have non-empty null rows.
+ */
+bool has_nonempty_nulls(column_view const& input);
+
+/**
+ * @brief Approximates whether a column or its descendants *may* have non-empty null elements
+ *
+ * @note This function is approximate.
+ * - `true`: Non-empty null elements could exist
+ * - `false`: Non-empty null elements definitely do not exist
+ *
+ * False positives are possible, but false negatives are not.
+ *
+ * Compared to the exact `has_nonempty_nulls()` function, this function is typically
+ * more efficient.
+ *
+ * Complexity:
+ * - Best case: `O(count_descendants(input))`
+ * - Worst case: `O(count_descendants(input)) * m`, where `m` is the number of rows in the largest
+ * descendant
+ *
+ * @param input The column which is (and whose descendants are) to be checked for
+ * non-empty null rows
+ * @return true If either the column or its descendants have null rows
+ * @return false If neither the column nor its descendants have null rows
+ */
+bool may_have_nonempty_nulls(column_view const& input);
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * LIST columns may have non-empty null rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release();
+ * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false);
+ *
+ * lists[1] is now null, but the lists child column still stores `{2,3}`.
+ * The lists column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [0, 1, 2, 3, 4, 5]
+ *
+ * After purging the contents of the list's null rows, the column's contents
+ * will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [0, 1, 4, 5]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr<column> Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  lists_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * STRING columns may have non-empty null rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release();
+ * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false);
+ *
+ * strings[1] is now null, but the strings column still stores `"CD"`.
+ * The strings column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [A, B, C, D, E, F]
+ *
+ * After purging the contents of the string column's null rows, the column's
+ * contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [A, B, E, F]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr<column> Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  strings_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * STRUCT columns may have null rows with non-empty child rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} };
+ * auto const structs = structs_column_wrapper{ {lists}, null_at(1) };
+ *
+ * structs[1].child is now null, but the lists column still stores `{2,3}`.
+ * The lists column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [0, 1, 2, 3, 4, 5]
+ *
+ * After purging the contents of the struct's null rows, the child list
+ * column's contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [0, 1, 4, 5]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr<column> Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  structs_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */
 } // namespace cudf
diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh
new file mode 100644
index 00000000000..773bce7131f
--- /dev/null
+++ b/cpp/include/cudf/detail/copy.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+namespace cudf::detail {
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*)
+ *
+ * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or structs_column_view)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename ColumnViewT>
+std::unique_ptr<column> purge_nonempty_nulls(ColumnViewT const& input,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  // Implement via identity gather.
+  auto const input_column = input.parent();
+  auto const gather_begin = thrust::counting_iterator<size_type>(0);
+  auto const gather_end   = gather_begin + input_column.size();
+
+  auto gathered_table = cudf::detail::gather(table_view{{input_column}},
+                                             gather_begin,
+                                             gather_end,
+                                             out_of_bounds_policy::DONT_CHECK,
+                                             stream,
+                                             mr);
+  return std::move(gathered_table->release()[0]);
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index 50157d16876..abd14fbda89 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -299,5 +299,22 @@ std::unique_ptr<scalar> get_element(
  size_type index,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::has_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+bool has_nonempty_nulls(column_view const& input,
+                        rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
+/**
+ * @copydoc cudf::may_have_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+bool may_have_nonempty_nulls(column_view const& input,
+                             rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
 } // namespace detail
 } // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index c637ad041ba..7df36be2385 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <cudf/utilities/bit.hpp>
 #include
 #include
@@ -82,6 +83,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
   auto dst_offsets_c = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr);
   mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view();
+  auto const source_column_nullmask = source_column.null_mask();
 
   // generate the compacted outgoing offsets.
   auto count_iter = thrust::make_counting_iterator(0);
@@ -90,12 +92,23 @@
     count_iter,
     count_iter + offset_count,
     dst_offsets_v.begin(),
-    [gather_map, output_count, src_offsets, src_size] __device__(int32_t index) -> int32_t {
+    [source_column_nullmask,
+     source_column_offset = source_column.offset(),
+     gather_map,
+     output_count,
+     src_offsets,
+     src_size] __device__(int32_t index) -> int32_t {
      int32_t offset_index = index < output_count ?
gather_map[index] : 0; // if this is an invalid index, this will be a NULL list if (NullifyOutOfBounds && ((offset_index < 0) || (offset_index >= src_size))) { return 0; } + // If the source row is null, the output row size must be 0. + if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + offset_index)) { + return 0; + } + // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; }, @@ -110,15 +123,27 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); - thrust::transform(rmm::exec_policy(stream), - gather_map, - gather_map + output_count, - base_offsets.data(), - [src_offsets, src_size, shift] __device__(int32_t index) { - // if this is an invalid index, this will be a NULL list - if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } - return src_offsets[index] - shift; - }); + thrust::transform( + rmm::exec_policy(stream), + gather_map, + gather_map + output_count, + base_offsets.data(), + [source_column_nullmask, + source_column_offset = source_column.offset(), + src_offsets, + src_size, + shift] __device__(int32_t index) { + // if this is an invalid index, this will be a NULL list + if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } + + // If the source row is null, the output row size must be 0. + if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + index)) { + return 0; + } + + return src_offsets[index] - shift; + }); // Retrieve size of the resulting gather map for level N+1 (the last offset) size_type child_gather_map_size = diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 1b10c70d6d6..d46ab3a91a1 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -303,14 +303,17 @@ std::unique_ptr gather( data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); auto const d_out_offsets = out_offsets_column->mutable_view().template data(); auto const d_in_offsets = (strings_count > 0) ? 
strings.offsets_begin() : nullptr; - thrust::transform(rmm::exec_policy(stream), - begin, - end, - d_out_offsets, - [d_in_offsets, strings_count] __device__(size_type in_idx) { - if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; - return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; - }); + auto const d_strings = column_device_view::create(strings.parent(), stream); + thrust::transform( + rmm::exec_policy(stream), + begin, + end, + d_out_offsets, + [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) { + if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; + if (not d_strings.is_valid(in_idx)) return 0; + return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; + }); // check total size is not too large size_t const total_bytes = thrust::transform_reduce( @@ -329,7 +332,6 @@ std::unique_ptr gather( // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_strings = column_device_view::create(strings.parent(), stream); auto out_chars_column = gather_chars(d_strings->begin(), begin, end, diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 329c24cfe0a..ca866d8555e 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ class structs_column_view : public column_view { explicit structs_column_view(column_view const& rhs); + /** + * @brief Returns the parent column. + */ + [[nodiscard]] column_view parent() const; + using column_view::child_begin; using column_view::child_end; using column_view::has_nulls; diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu new file mode 100644 index 00000000000..778d6c4df55 --- /dev/null +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cudf { +namespace detail { + +using cudf::type_id; + +namespace { + +/// Check if nonempty-null checks can be skipped for a given type. +bool type_may_have_nonempty_nulls(cudf::type_id const& type) +{ + return type == type_id::STRING || type == type_id::LIST || type == type_id::STRUCT; +} + +/// Check if the (STRING/LIST) column has any null rows with non-zero length. +bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + if (not input.has_nulls()) { return false; } // No nulls => no dirty rows. + + // Cross-reference nullmask and offsets. + auto const type = input.type().id(); + auto const offsets = (type == type_id::STRING) ? 
(strings_column_view{input}).offsets() + : (lists_column_view{input}).offsets(); + auto const d_input = cudf::column_device_view::create(input); + auto const is_dirty_row = [d_input = *d_input, offsets = offsets.begin()] __device__( + size_type const& row_idx) { + return d_input.is_null_nocheck(row_idx) && (offsets[row_idx] != offsets[row_idx + 1]); + }; + + auto const row_begin = thrust::counting_iterator(0); + auto const row_end = row_begin + input.size(); + return thrust::count_if(rmm::exec_policy(stream), row_begin, row_end, is_dirty_row) > 0; +} + +} // namespace + +/** + * @copydoc cudf::detail::has_nonempty_nulls + */ +bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + auto const type = input.type().id(); + + if (not type_may_have_nonempty_nulls(type)) { return false; } + + // For types with variable-length rows, check if any rows are "dirty". + // A dirty row is a null row with non-zero length. + if ((type == type_id::STRING || type == type_id::LIST) && has_nonempty_null_rows(input, stream)) { + return true; + } + + // For complex types, check if child columns need purging. + if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), [stream](auto const& child) { + return cudf::detail::has_nonempty_nulls(child, stream); + })) { + return true; + } + + return false; +} +} // namespace detail + +/** + * @copydoc cudf::may_have_nonempty_nulls + */ +bool may_have_nonempty_nulls(column_view const& input) +{ + auto const type = input.type().id(); + + if (not detail::type_may_have_nonempty_nulls(type)) { return false; } + + if ((type == type_id::STRING || type == type_id::LIST) && input.has_nulls()) { return true; } + + if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), may_have_nonempty_nulls)) { + return true; + } + + return false; +} + +/** + * @copydoc cudf::has_nonempty_nulls + */ +bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); } + +/** + * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(lists_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(structs_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(strings_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index 681f13386ff..7d8c8837d2d 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -25,6 +25,8 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } +column_view structs_column_view::parent() const { return *this; } + column_view structs_column_view::get_sliced_child(int 
index) const { std::vector children; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..95c54d7596e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -250,6 +250,7 @@ ConfigureTest( copying/gather_tests.cpp copying/get_value_tests.cpp copying/pack_tests.cpp + copying/purge_nonempty_nulls_tests.cpp copying/sample_tests.cpp copying/scatter_tests.cpp copying/scatter_list_tests.cpp diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 4e0e70bf15c..44a79e63cd8 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -645,7 +645,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) 0, cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *expected); } TYPED_TEST(ListsStructsLeafTest, FromNested) diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp new file mode 100644 index 00000000000..77fd3f66ee5 --- /dev/null +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using iterators::no_nulls; +using iterators::null_at; +using iterators::nulls_at; +using T = int32_t; // The actual type of the leaf node isn't really important. +using values_col_t = fixed_width_column_wrapper; +using offsets_col_t = fixed_width_column_wrapper; +using gather_map_t = fixed_width_column_wrapper; + +template +using LCW = cudf::test::lists_column_wrapper; + +struct PurgeNonEmptyNullsTest : public cudf::test::BaseFixture { + /// Helper to run gather() on a single column, and extract the single column from the result. + std::unique_ptr gather(column_view const& input, gather_map_t const& gather_map) + { + auto gathered = + cudf::gather(cudf::table_view{{input}}, gather_map, out_of_bounds_policy::NULLIFY); + return std::move(gathered->release()[0]); + } + + /// Verify that the result of `sanitize()` is equivalent to the unsanitized input, + /// except that the null rows are also empty. + template + void test_purge(ColumnViewT const& unpurged) + { + auto const purged = cudf::purge_nonempty_nulls(unpurged); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(unpurged.parent(), *purged); + EXPECT_FALSE(cudf::has_nonempty_nulls(*purged)); + } +}; + +// List. +TEST_F(PurgeNonEmptyNullsTest, SingleLevelList) +{ + auto const input = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, // <--- Will be set to NULL. Unsanitized row. + {8, 9, 10}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. 
+ cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows preceded by unsanitized rows. + auto const results = gather(input->view(), {3, 100, 0}); + auto const expected = LCW{{ + {8, 9, 10}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows followed by unsanitized rows. + auto const results = gather(input->view(), {1, 100, 0}); + auto const expected = LCW{{ + {5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects unsanitized row specifically. + auto const results = gather(input->view(), {2}); + auto const results_lists_view = lists_column_view(*results); + auto const expected = LCW{{ + LCW{} // NULL. + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), values_col_t{}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>. +TEST_F(PurgeNonEmptyNullsTest, TwoLevelList) +{ + auto const input = + LCW{ + {{{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}, + {{21}, {22, 23}, {24, 25, 26}}, + {{31, 32}, {33, 34, 35, 36}, {}, {37, 38}}, //<--- Will be set to NULL. Unsanitized row. + {{41}, {42, 43}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Verify that gather() output is sanitized. + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{{ + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, // i.e. input[0] + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}} // i.e. 
input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_lists_view.child(), + LCW{ + {1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}, {11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}); + + auto const child_lists_view = lists_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_lists_view.offsets(), + offsets_col_t{0, 3, 7, 8, 10, 11, 13, 16, 19, 20}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + child_lists_view.child(), + values_col_t{1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 11, 12, 13, 14, 15, 16, 17, 18, 19}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>>. +TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList) +{ + auto const input = LCW{{{{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}}, + {{{21, 21}}, {{22, 23}, {}}, {{24, 25}, {26}}}, + {{{31, 32}, {}}, + {{33, 34, 35}, {36}}, + {}, + {{37, 38}}}, //<--- Will be set to NULL. Unsanitized row. + {{{41, 41, 41}}, {{42, 43}}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{ + { + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, // i.e. input[0] + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}} // i.e. input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), + LCW{{{1, 2}, {3}}, + {{4, 5}, {6, 7}}, + {{8, 8}, {}}, + {{9, 1}}, + {{2, 3}}, + {{11, 12}}, + {{13}, {14, 15}}, + {{16, 17, 18}}, + {{19, 19}, {}}}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, ListOfStrings) +{ + using T = string_view; + + auto const input = LCW{{{{"1", "22", "", "4444"}, null_at(2)}, + {"55555"}, + {"666666", "7777777"}, // <--- Will be set to NULL. Unsanitized row. + {"88888888", "999999999", "1010101010"}, + {"11", "22", "33", "44"}, + {"55", "66", "77", "88"}}, + no_nulls()} + .release(); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{"55555"}, + {}, // NULL. 
+ {{"1", "22", "", "4444"}, null_at(2)}, + {"88888888", "999999999", "1010101010"}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + {"55555", "1", "22", "", "4444", "88888888", "999999999", "1010101010"}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Gathering from a sliced column. + auto const sliced = cudf::slice({input->view()}, {1, 5})[0]; // Lop off 1 row at each end. + EXPECT_TRUE(cudf::may_have_nonempty_nulls(sliced)); + EXPECT_TRUE(cudf::has_nonempty_nulls(sliced)); + + auto const results = gather(sliced, {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + auto const expected = LCW{{ + {}, + {"88888888", "999999999", "1010101010"}, + {"55555"}, + {"11", "22", "33", "44"}, + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 0, 3, 4, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + "88888888", "999999999", "1010101010", "55555", "11", "22", "33", "44"}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) +{ + auto strings = + strings_column_wrapper{ + {"1", "22", "3", "44", "5", "66", "7", "8888", "9", "1010"}, //<--- "8888" will be + // unsanitized. + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*strings)); + + // Set strings nullmask, post construction. + set_null_mask(strings->mutable_view().null_mask(), 7, 8, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*strings)); + + test_purge(strings_column_view{*strings}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + strings_column_view(*strings).offsets(), offsets_col_t{0, 1, 3, 4, 6, 7, 9, 10, 14, 15, 19} + // 10-14 indicates that "8888" is unsanitized. + ); + + // Construct a list column from the strings column. + auto const lists = make_lists_column(4, + offsets_col_t{0, 4, 5, 7, 10}.release(), + std::move(strings), + 0, + detail::make_null_mask(no_nulls(), no_nulls() + 4)); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + // Set lists nullmask, post construction. + cudf::detail::set_null_mask(lists->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + test_purge(lists_column_view{*lists}); + + // At this point, + // 1. {"66", "7"} will be unsanitized. + // 2. {"8888", "9", "1010"} will be actually be {NULL, "9", "1010"}. + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + lists_column_view(*lists).offsets(), + offsets_col_t{0, 4, 5, 7, 10}); // 5-7 indicates that list row#2 is unsanitized. + + auto const result = gather(lists->view(), {1, 2, 0, 3}); + auto const expected = LCW{{{"5"}, + {}, // NULL. + {"1", "22", "3", "44"}, + {{"", "9", "1010"}, null_at(0)}}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + + // Ensure row#2 has been sanitized. 
+ auto const results_lists_view = lists_column_view(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 1, 1, 5, 8} + // 1-1 indicates that row#2 is sanitized. + ); + + // Ensure that "8888" has been sanitized, and stored as "". + auto const child_strings_view = strings_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_strings_view.offsets(), + offsets_col_t{0, 1, 2, 4, 5, 7, 7, 8, 12}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); +} + +// Struct>. +TEST_F(PurgeNonEmptyNullsTest, StructOfList) +{ + auto const structs_input = + [] { + auto child = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, //<--- Unsanitized row. + {8, 9, 10}}, + no_nulls()}; + EXPECT_FALSE(cudf::has_nonempty_nulls(child)); + return structs_column_wrapper{{child}, null_at(2)}; + }() + .release(); + + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*structs_input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*structs_input)); + + test_purge(structs_column_view{*structs_input}); + + // At this point, even though the structs column has a null at index 2, + // the child column has a non-empty list row at index 2: {6, 7}. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lists_column_view(structs_input->child(0)).child(), + values_col_t{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, null_at(2)}); + + { + // Test rearrange. + auto const gather_map = gather_map_t{1, 2, 0, 3}; + auto const result = gather(structs_input->view(), gather_map); + auto const expected_result = [] { + auto child = LCW{{{5}, + LCW{}, //<--- Now, sanitized. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + return structs_column_wrapper{{child}, null_at(1)}; + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_result); + auto const results_child = lists_column_view(result->child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); + } +} + +} // namespace cudf::test From 3c208a618f7f3443d021c01ad27f560a7d71e7d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 29 Apr 2022 09:36:29 -0400 Subject: [PATCH 11/28] Enable pydocstyle rules involving quotes (#10748) This PR enables D30* errors for pydocstyle. It also sets up the `ignore-decorators` configuration so that future PRs involving D10* errors will treat docutils decorators appropriately. Contributes to #10711. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10748 --- .pre-commit-config.yaml | 15 ++++++ python/.flake8 | 24 +++++----- python/cudf/cudf/comm/gpuarrow.py | 4 +- python/cudf/cudf/core/column/string.py | 66 +++++++++++++------------- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/series.py | 4 +- 6 files changed, 66 insertions(+), 51 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f690f5f827..cd7b8aea6d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ repos: - repo: https://github.com/PyCQA/isort rev: 5.6.4 @@ -56,6 +58,19 @@ repos: hooks: - id: pydocstyle args: ["--config=python/.flake8"] + exclude: | + (?x)^( + ci| + cpp| + conda| + docs| + java| + notebooks| + python/dask_cudf| + python/cudf_kafka| + python/custreamz| + python/cudf/cudf/tests + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: diff --git a/python/.flake8 b/python/.flake8 index c645c46a216..667875030cc 100644 --- a/python/.flake8 +++ b/python/.flake8 @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. [flake8] exclude = __init__.py @@ -9,14 +9,14 @@ ignore = E203 [pydocstyle] -match = ^(.*abc\.py|.*api/types\.py|.*single_column_frame\.py|.*indexed_frame\.py)$ -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather than include using match-dir. -match-dir = ^(?!ci|cpp|python/dask_cudf|python/cudf_kafka|python/custreamz).*$ -# In addition to numpy style, we additionally ignore: -add-ignore = - # magic methods - D105, - # no docstring in __init__ - D107, - # newlines before docstrings - D204 +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. +match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$ +# Allow missing docstrings for docutils +ignore-decorators = .*(docutils|doc_apply|copy_docstring).* +select = + D30 diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 09b4cc5ffba..0c4d9d7f77e 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -119,12 +119,12 @@ def null(self): @property def data_raw(self): - "Accessor for the data buffer as a device array" + """Accessor for the data buffer as a device array""" return self._series._column.data_array_view @property def null_raw(self): - "Accessor for the null buffer as a device array" + """Accessor for the null buffer as a device array""" return self._series._column.mask_array_view def make_series(self): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1d836d9b759..0db7e7d9a27 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -201,7 +201,7 @@ def __getitem__(self, key): return self.get(key) def len(self) -> SeriesOrIndex: - """ + r""" Computes the length of each element in the Series/Index. Returns @@ -213,7 +213,7 @@ def len(self) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(["dog", "", "\\n", None]) + >>> s = cudf.Series(["dog", "", "\n", None]) >>> s.str.len() 0 3 1 0 @@ -960,7 +960,7 @@ def replace( ) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: - """ + r""" Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. 
@@ -980,7 +980,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex:
         --------
         >>> import cudf
         >>> s = cudf.Series(["A543","Z756"])
-        >>> s.str.replace_with_backrefs('(\\\\d)(\\\\d)', 'V\\\\2\\\\1')
+        >>> s.str.replace_with_backrefs('(\\d)(\\d)', 'V\\2\\1')
         0    AV453
         1    ZV576
         dtype: object
@@ -1195,7 +1195,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex:
         )
 
     def isfloat(self) -> SeriesOrIndex:
-        """
+        r"""
         Check whether all characters in each string form floating value.
 
         If a string has zero characters, False is returned for
@@ -1249,7 +1249,7 @@ def isfloat(self) -> SeriesOrIndex:
         4     True
         5    False
         dtype: bool
-        >>> s = cudf.Series(["this is plain text", "\\t\\n", "9.9", "9.9.9"])
+        >>> s = cudf.Series(["this is plain text", "\t\n", "9.9", "9.9.9"])
         >>> s.str.isfloat()
         0    False
         1    False
@@ -2239,7 +2239,7 @@ def get(self, i: int = 0) -> SeriesOrIndex:
         return self._return_or_inplace(libstrings.get(self._column, i))
 
     def get_json_object(self, json_path):
-        """
+        r"""
         Applies a JSONPath string to an input strings column
         where each row in the column is a valid json string
@@ -2258,7 +2258,7 @@ def get_json_object(self, json_path):
        >>> import cudf
        >>> s = cudf.Series(
            [
-                \\"\\"\\"
+                \"\"\"
                {
                    "store":{
                        "book":[
@@ -2277,13 +2277,13 @@ def get_json_object(self, json_path):
                        ]
                    }
                }
-                \\"\\"\\"
+                \"\"\"
            ])
        >>> s
-        0    {"store": {\\n        "book": [\\n        { "cat...
+        0    {"store": {\n        "book": [\n        { "cat...
        dtype: object
        >>> s.str.get_json_object("$.store.book")
-        0    [\\n        { "category": "reference",\\n        ...
+        0    [\n        { "category": "reference",\n        ...
        dtype: object
        """
@@ -3138,7 +3138,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex:
         )
 
     def strip(self, to_strip: str = None) -> SeriesOrIndex:
-        """
+        r"""
         Remove leading and trailing characters.
 
         Strip whitespaces (including newlines) or a set of
@@ -3169,11 +3169,11 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\\n', '3. Cat?\\t', None])
+        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', None])
         >>> s
         0    1. Ant.
-        1    2. Bee!\\n
-        2    3. Cat?\\t
+        1    2. Bee!\n
+        2    3. Cat?\t
         3       <NA>
         dtype: object
         >>> s.str.strip()
         0    1. Ant.
         1    2. Bee!
         2    3. Cat?
         3       <NA>
         dtype: object
-        >>> s.str.strip('123.!? \\n\\t')
+        >>> s.str.strip('123.!? \n\t')
         0    Ant
         1    Bee
         2    Cat
@@ -3197,7 +3197,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex:
         )
 
     def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
-        """
+        r"""
         Remove leading and trailing characters.
 
         Strip whitespaces (including newlines)
@@ -3228,11 +3228,11 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\\n', '3. Cat?\\t', None])
+        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', None])
         >>> s.str.lstrip('123.')
         0     Ant.
-        1     Bee!\\n
-        2     Cat?\\t
+        1     Bee!\n
+        2     Cat?\t
         3      <NA>
         dtype: object
         """
@@ -3244,7 +3244,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
         )
 
     def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
-        """
+        r"""
         Remove leading and trailing characters.
 
         Strip whitespaces (including newlines)
@@ -3277,14 +3277,14 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\\n', '3. Cat?\\t', None])
+        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', None])
         >>> s
         0    1. Ant.
-        1    2. Bee!\\n
-        2    3. Cat?\\t
+        1    2. Bee!\n
+        2    3. Cat?\t
         3       <NA>
         dtype: object
-        >>> s.str.rstrip('.!? \\n\\t')
+        >>> s.str.rstrip('.!? \n\t')
         0    1. Ant
         1    2. Bee
         2    3. Cat
@@ -3299,7 +3299,7 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
         )
 
     def wrap(self, width: int, **kwargs) -> SeriesOrIndex:
-        """
+        r"""
         Wrap long strings in the Series/Index to be formatted in
         paragraphs with length less than a given width.
@@ -3340,8 +3340,8 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex:
         >>> data = ['line to be wrapped', 'another line to be wrapped']
         >>> s = cudf.Series(data)
         >>> s.str.wrap(12)
-        0    line to be\\nwrapped
-        1    another line\\nto be\\nwrapped
+        0    line to be\nwrapped
+        1    another line\nto be\nwrapped
         dtype: object
         """
         if not is_integer(width):
@@ -3575,7 +3575,7 @@ def isempty(self) -> SeriesOrIndex:
         return self._return_or_inplace((self._column == "").fillna(False))
 
     def isspace(self) -> SeriesOrIndex:
-        """
+        r"""
         Check whether all characters in each string are whitespace.
 
         This is equivalent to running the Python string method
@@ -3623,7 +3623,7 @@ def isspace(self) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series([' ', '\\t\\r\\n ', ''])
+        >>> s = cudf.Series([' ', '\t\r\n ', ''])
         >>> s.str.isspace()
         0     True
         1     True
@@ -4271,7 +4271,7 @@ def normalize_spaces(self) -> SeriesOrIndex:
         )
 
     def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
-        """
+        r"""
         Normalizes strings characters for tokenizing.
 
         This uses the normalizer that is built into the
@@ -4280,7 +4280,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         - adding padding around punctuation (unicode category starts with
           "P") as well as certain ASCII symbols like "^" and "$"
         - adding padding around the CJK Unicode block characters
-        - changing whitespace (e.g. ``\\t``, ``\\n``, ``\\r``) to space
+        - changing whitespace (e.g. ``\t``, ``\n``, ``\r``) to space
         - removing control characters (unicode categories "Cc" and "Cf")
 
         If `do_lower_case = true`, lower-casing also removes the accents.
@@ -4303,7 +4303,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> ser = cudf.Series(["héllo, \\tworld","ĂĆCĖÑTED","$99"])
+        >>> ser = cudf.Series(["héllo, \tworld","ĂĆCĖÑTED","$99"])
         >>> ser.str.normalize_characters()
         0    hello ,  world
         1          accented
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 104ed3eeb67..d0e9e6d94c1 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3356,7 +3356,7 @@ def to_dlpack(self):
 
     @_cudf_nvtx_annotate
     def to_string(self):
-        """
+        r"""
         Convert to string
 
         cuDF uses Pandas internals for efficient string formatting.
@@ -3373,7 +3373,7 @@ def to_string(self):
         >>> df['key'] = [0, 1, 2]
         >>> df['val'] = [float(i + 10) for i in range(3)]
         >>> df.to_string()
-        '   key   val\\n0    0  10.0\\n1    1  11.0\\n2    2  12.0'
+        '   key   val\n0    0  10.0\n1    1  11.0\n2    2  12.0'
         """
         return repr(self)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 4ff671509a0..d813db58d1e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4614,13 +4614,13 @@ def _align_indices(series_list, how="outer", allow_non_unique=False):
 
 @_cudf_nvtx_annotate
 def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
-    """Returns a boolean array where two arrays are equal within a tolerance.
+    r"""Returns a boolean array where two arrays are equal within a tolerance.
 
     Two values in ``a`` and ``b`` are considered
     equal when the following equation is satisfied.
 
     .. math::
-        |a - b| \\le \\mathrm{atol} + \\mathrm{rtol} |b|
+        |a - b| \le \mathrm{atol} + \mathrm{rtol} |b|
 
     Parameters
     ----------

From 15e49824a8cb2a5a7ec6a6e5f273589a66f1c120 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 29 Apr 2022 10:22:10 -0500
Subject: [PATCH 12/28] Enable pydocstyle for all packages. (#10759)

Follow-up to #10748 to enable the base pydocstyle rules on all Python
packages (`dask_cudf`, `cudf_kafka`, `custreamz`) and test files.
Contributes to #10711, #10758.

Authors:
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/10759
---
 .pre-commit-config.yaml             | 6 +-----
 python/.flake8                      | 2 +-
 python/custreamz/custreamz/kafka.py | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cd7b8aea6d7..46d5223f7d3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -65,11 +65,7 @@ repos:
             conda|
             docs|
             java|
-            notebooks|
-            python/dask_cudf|
-            python/cudf_kafka|
-            python/custreamz|
-            python/cudf/cudf/tests
+            notebooks
           )
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v11.1.0
diff --git a/python/.flake8 b/python/.flake8
index 667875030cc..b763c209fc1 100644
--- a/python/.flake8
+++ b/python/.flake8
@@ -15,7 +15,7 @@ ignore =
 # unlike the match option above this match-dir will have no effect when
 # pydocstyle is invoked from pre-commit. Therefore this exclusion list must
 # also be maintained in the pre-commit config file.
-match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$
+match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$
 # Allow missing docstrings for docutils
 ignore-decorators = .*(docutils|doc_apply|copy_docstring).*
 select =
diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py
index f5d5031602f..0198757c68d 100644
--- a/python/custreamz/custreamz/kafka.py
+++ b/python/custreamz/custreamz/kafka.py
@@ -95,7 +95,7 @@ def read_gdf(
         message_format="json",
     ):
 
-        """
+        r"""
         Read messages from the underlying KafkaDatasource connection
         and create a cudf Dataframe

From 3c4e72e68d9406d65939b7d2fdf28b0b921840dd Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 29 Apr 2022 21:24:12 +0530
Subject: [PATCH 13/28] Add row hasher with nested column support (#10641)

Contributes to #10186

Authors:
   - Devavret Makkar (https://github.com/devavret)
   - Vyas Ramasubramani (https://github.com/vyasr)
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - Yunsong Wang (https://github.com/PointKernel)
   - Bradley Dice (https://github.com/bdice)
   - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/10641
---
 cpp/benchmarks/stream_compaction/distinct.cpp  |  41 +++
 cpp/include/cudf/detail/hashing.hpp            |   5 +-
 cpp/include/cudf/detail/iterator.cuh           |   8 +-
 .../cudf/detail/utilities/algorithm.cuh        |  28 ++
 cpp/include/cudf/detail/utilities/column.hpp   |  10 +-
 .../cudf/table/experimental/row_operators.cuh  | 273 +++++++++++++++---
 cpp/src/hash/hashing.cu                        |  29 +-
 cpp/src/hash/murmur_hash.cu                    |  28 +-
 cpp/src/stream_compaction/distinct.cu          |  18 +-
 .../stream_compaction_common.cuh               |  22 ++
 cpp/src/table/row_operators.cu                 |  60 ++--
 cpp/tests/hashing/hash_test.cpp                | 224 +++++++++++++-
 cpp/tests/reductions/list_rank_test.cpp        |   4 +-
 .../stream_compaction/distinct_tests.cpp       | 242 ++++++++++++++++
 python/cudf/cudf/tests/test_dataframe.py       |   2 +-
 15 files changed, 880 insertions(+), 114 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/algorithm.cuh

diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp
index 749badc715d..149c6ad7219 100644
--- a/cpp/benchmarks/stream_compaction/distinct.cpp
+++ b/cpp/benchmarks/stream_compaction/distinct.cpp
@@ -19,6 +19,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
   .set_name("distinct")
   .set_type_axes_names({"Type"})
   .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
+
+template <typename Type>
+void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
+{
+  cudf::rmm_pool_raii pool_raii;
+
+  auto const size             = state.get_int64("ColumnSize");
+  auto const dtype            = cudf::type_to_id<Type>();
+  double const null_frequency = state.get_float64("null_frequency");
+
+  data_profile table_data_profile;
+  if (dtype == cudf::type_id::LIST) {
+    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4);
+    table_data_profile.set_distribution_params(
+      cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4);
+    table_data_profile.set_list_depth(1);
+  } else {
+    // We're comparing distinct() on a non-nested column to that on a list column with the same
+    // number of distinct rows. The max list size is 4 and the number of distinct values in the
+    // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781
+    // We want this column to also have 781 distinct values.
+    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781);
+  }
+  table_data_profile.set_null_frequency(null_frequency);
+
+  auto const table = create_random_table(
+    {dtype}, table_size_bytes{static_cast<size_t>(size)}, table_data_profile, 0);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+    auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view);
+  });
+}
+
+NVBENCH_BENCH_TYPES(nvbench_distinct_list,
+                    NVBENCH_TYPE_AXES(nvbench::type_list))
+  .set_name("distinct_list")
+  .set_type_axes_names({"Type"})
+  .add_float64_axis("null_frequency", {0.0, 0.1})
+  .add_int64_axis("ColumnSize", {100'000'000});
diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp
index e8e100aaec5..9958fa8f3a4 100644
--- a/cpp/include/cudf/detail/hashing.hpp
+++ b/cpp/include/cudf/detail/hashing.hpp
@@ -33,19 +33,20 @@ namespace detail {
 std::unique_ptr<column> hash(
   table_view const& input,
   hash_id hash_function               = hash_id::HASH_MURMUR3,
-  uint32_t seed                       = 0,
+  uint32_t seed                       = cudf::DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<column> murmur_hash3_32(
   table_view const& input,
+  uint32_t seed                       = cudf::DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 template
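
A note on the r""" conversions in PATCH 11 above: they are driven by pydocstyle's
D301 check ("Use r\"\"\" if any backslashes in a docstring"). A minimal sketch of
the behavior difference, using two hypothetical functions (illustrative only, not
part of any patch):

    # In a plain docstring "\n" is parsed into a real newline, so the text that
    # help() renders no longer matches the escaped source; in a raw docstring
    # the backslash and the "n" survive as two separate characters.
    def plain():
        """Example: \n collapses to an actual newline here."""

    def raw():
        r"""Example: \n stays as a backslash followed by 'n'."""

    assert "\\n" not in plain.__doc__  # backslash consumed by the parser
    assert "\\n" in raw.__doc__        # backslash preserved verbatim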
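
The match-dir values in the .flake8 hunks above are ordinary Python regular
expressions, so the negative lookahead can be sanity-checked directly. A small
illustrative check of the final pattern from PATCH 12 (assuming pydocstyle
matches each directory name against it):

    import re

    # Reject directory names that start with one of the listed prefixes;
    # accept everything else.
    match_dir = re.compile(r"^(?!(ci|cpp|conda|docs|java|notebooks)).*$")

    assert match_dir.match("cudf")            # scanned by pydocstyle
    assert match_dir.match("dask_cudf")       # scanned once re-enabled
    assert not match_dir.match("cpp")         # excluded
    assert not match_dir.match("notebooks")   # excluded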
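
The corrected isclose relation from PATCH 11 can also be exercised numerically;
a quick illustrative check with made-up values (not from the patch):

    import cudf

    a = cudf.Series([1.0, 1.001])
    b = cudf.Series([1.0, 1.0])
    # |1.001 - 1.0| = 1e-3 exceeds atol + rtol * |b| = 1e-8 + 1e-5 * 1.0,
    # so the second pair is not close under the default tolerances.
    print(cudf.isclose(a, b, rtol=1e-05, atol=1e-08).to_pandas().tolist())
    # [True, False]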
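
The distinct-row count in the benchmark comment of PATCH 13 is a geometric
series over list lengths 0 through 4 with 5 distinct child values; the
arithmetic checks out:

    # 5**0 + 5**1 + 5**2 + 5**3 + 5**4 = 1 + 5 + 25 + 125 + 625
    assert sum(5**k for k in range(5)) == 781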